# Chapter 10 - Deep Learning

The following exercises, code, and explanations are adapted from An Introduction to Statistical Learning with Applications in Python (ISLP) (James, Witten, Hastie, Tibshirani, and Taylor 2023).

## Instructor Code

Make sure the following packages are installed prior to running code in this notebook:

Required package: `ISLP` (install it with `!pip install ISLP`).

In [None]:
# Load required packages (you may need to run `pip install torchinfo` first)
from ISLP import load_data
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from IPython.display import Image
from torchinfo import summary
from torch.optim import RMSprop
from torch.utils.data import TensorDataset
from pytorch_lightning.loggers import CSVLogger

# Fix the random seed (0) up front so that repeated runs of the notebook are
# comparable. workers=True is passed so that data-loader workers are seeded
# too (per the PyTorch Lightning docs -- confirm for the installed version).
from pytorch_lightning import seed_everything
seed_everything(0, workers=True)
# Request deterministic PyTorch algorithms; warn_only=True asks for a warning
# instead of an error when an operation has no deterministic implementation.
torch.use_deterministic_algorithms(True, warn_only=True)

In [None]:
from ISLP.torch.imdb import (load_lookup,
                             load_tensor,
                             load_sparse,
                             load_sequential)
from ISLP.torch import (SimpleDataModule,
                        SimpleModule,
                        ErrorTracker,
                        rec_num_workers)

from torch.optim import RMSprop
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning import Trainer

## Exercise for Neural Network.
This exercise is adapted from Chapter 10, Exercise 7 in ISLP. 

This problem involves the `Default` data set which is part of the `ISLP` package.

Fit a neural network to the `Default` data. Use a single hidden layer with 10 units, and dropout regularization. Have a look at Labs 10.9.1 - 10.9.2 for guidance. Compare the classification performance of your model with that of linear logistic regression.

In [None]:
# Load in the data
df = load_data("Default")

# Convert the Yes/No columns to 0/1 integers. The explicit integer cast matters
# when the columns are categorical: mapping a categorical yields another
# categorical, on which numeric reductions such as .mean() are not defined.
df['default'] = df['default'].map({'No': 0, 'Yes': 1}).astype(int)
df['student'] = df['student'].map({'No': 0, 'Yes': 1}).astype(int)

# Split into features and target
X = df[['student', 'balance', 'income']].values
y = df['default'].values

# Split the data into training and testing sets BEFORE scaling, so that the
# scaler never sees the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features: fit the mean/std on the training rows only and reuse
# those statistics for the test rows (fitting on all rows leaks test-set
# information into training)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Keep X as the standardized full feature matrix, as before, in case a later
# cell uses it; it is now scaled with the training-set statistics
X = scaler.transform(X)

# Convert to PyTorch tensors (float32 targets, as nn.BCELoss expects floats)
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Create Tensor Datasets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

# Data loaders: shuffle the training batches, keep the test order fixed
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
# Show the DataFrame (as the last expression of a notebook cell it is rendered as the cell output)
df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter

# --- Basic stats ---
print("Number of samples:", len(df))
print("Number of features:", df.shape[1] - 1)  # excluding target
print("Number of classes:", df['default'].nunique())
print("\nClass distribution:")
print(df['default'].value_counts())

# --- Percentage of default ---
# Cast to float before averaging: a no-op for a numeric 0/1 column, and it
# keeps .mean() working if the column still has a categorical dtype.
default_rate = df['default'].astype(float).mean() * 100
print(f"\nDefault rate: {default_rate:.2f}%")

# --- Class distribution plot ---
plt.figure(figsize=(5,4))
sns.countplot(x='default', data=df)
plt.title('Default Distribution')
plt.show()

# --- Feature distributions by class ---
# Object-typed or low-cardinality columns get a count plot; everything else
# gets a histogram with a KDE overlay, split by default status.
features = ['balance', 'income', 'student']
for col in features:
    plt.figure(figsize=(6,4))
    if df[col].dtype == 'O' or df[col].nunique() < 10:
        sns.countplot(x=col, hue='default', data=df)
    else:
        sns.histplot(data=df, x=col, hue='default', kde=True, bins=30)
    plt.title(f"{col} by Default Status")
    plt.show()

# --- Correlation heatmap ---
# Same float cast as above so that every column takes part in the correlation
# even if the 0/1 columns are categorical (assumes all columns of df are
# numeric-convertible at this point -- true for the four Default columns
# after the Yes/No mapping).
corr_matrix = df.astype(float).corr()
plt.figure(figsize=(6,4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Neural Network with PyTorch
class Net(nn.Module):
    """Binary classifier with one hidden ReLU layer followed by dropout.

    The forward pass returns sigmoid probabilities of shape ``(batch, 1)``,
    so it pairs with ``nn.BCELoss``.

    Args:
        in_features: Number of input features (default 3).
        hidden_units: Width of the hidden layer (default 10).
        dropout_p: Dropout probability applied to the hidden activations
            (default 0.5).
    """

    def __init__(self, in_features=3, hidden_units=10, dropout_p=0.5):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_units)  # input -> hidden
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden_units, 1)  # hidden -> single output unit

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of shape (batch, in_features)."""
        hidden = F.relu(self.fc1(x))
        # Dropout is only active in training mode (model.train()); in
        # evaluation mode (model.eval()) it passes values through unchanged.
        hidden = self.dropout(hidden)
        return torch.sigmoid(self.fc2(hidden))

In [None]:
# Instantiate the network
model = Net()

# Binary cross-entropy loss, minimised with Adam at a learning rate of 1e-3
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train the model
# Put the network in training mode explicitly so that dropout is active even
# when this cell is re-run after the evaluation cell has called model.eval().
model.train()
for epoch in range(100):
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = model(inputs)
        # Drop only the trailing unit dimension: a bare squeeze() would also
        # remove the batch dimension of a size-1 batch and no longer match
        # the shape of `labels`.
        loss = criterion(output.squeeze(-1), labels)
        loss.backward()
        optimizer.step()

In [None]:
# Evaluate the model
model.eval()  # evaluation mode
with torch.no_grad():
    # Forward pass on the whole test set without tracking gradients
    test_probs = model(X_test).squeeze()

# Threshold the outputs at 0.5 to get hard 0/1 predictions
y_pred = (test_probs >= 0.5).float()
print("Neural Network Performance:")
print(classification_report(y_test.numpy(), y_pred.numpy()))

In [None]:
# Logistic Regression with scikit-learn
# fit() returns the estimator itself, so construction and fitting are chained;
# the tensors are converted to NumPy arrays for scikit-learn.
log_reg = LogisticRegression().fit(X_train.numpy(), y_train.numpy())
lr_predictions = log_reg.predict(X_test.numpy())

In [None]:
# Evaluate Logistic Regression
# Build the per-class precision/recall/F1 report first, then print it under a heading
lr_report = classification_report(y_test.numpy(), lr_predictions)
print("Logistic Regression Performance:")
print(lr_report)

## Recurrent Neural Networks (RNN)

In [None]:
# load in the movie data
imdb_seq_train, imdb_seq_test = load_sequential(root='data/IMDB')

# First row of the first training tensor, as a NumPy array
padded_sample = np.asarray(imdb_seq_train.tensors[0][0])

# Keep only the strictly positive entries (presumably zeros are padding --
# confirm against the ISLP loader) and show the first 12 of them
positive_mask = padded_sample > 0
sample_review = padded_sample[positive_mask][:12]
sample_review

In [None]:
# load data in as a tensor
# Ask ISLP for a recommended worker count for this machine instead of
# hard-coding one; the data module below still caps it at 6.
max_num_workers = rec_num_workers()

(imdb_train, imdb_test) = load_tensor(root='data/IMDB')
# Wrap the train/test sets in a data module: validation=2000 and batches of
# 512 are passed through to SimpleDataModule.
imdb_dm = SimpleDataModule(imdb_train,
                           imdb_test,
                           validation=2000,
                           num_workers=min(6, max_num_workers),
                           batch_size=512)

In [None]:
class IMDBModel(nn.Module):
    """Feed-forward network: two 16-unit ReLU layers and a single output unit.

    Args:
        input_size: Number of input features per example.
    """

    def __init__(self, input_size):
        super().__init__()
        self.dense1 = nn.Linear(input_size, 16)
        self.activation = nn.ReLU()  # shared by both hidden layers
        self.dense2 = nn.Linear(16, 16)
        self.output = nn.Linear(16, 1)

    def forward(self, x):
        """Return one raw output value per example as a flat 1-D tensor."""
        hidden = self.activation(self.dense1(x))
        hidden = self.activation(self.dense2(hidden))
        scores = self.output(hidden)
        # Collapse the (batch, 1) output to shape (batch,)
        return scores.flatten()


In [None]:
# create the model; its input width is the second dimension of the test feature tensor
imdb_features = imdb_test.tensors[0]
imdb_model = IMDBModel(imdb_features.shape[1])

# describe the model architecture
summary(imdb_model,
        input_size=imdb_features.shape,
        col_names=['input_size', 'output_size', 'num_params'])

In [None]:
# define the optimizer: RMSprop over all model parameters, learning rate 1e-3
imdb_optimizer = RMSprop(params=imdb_model.parameters(), lr=1e-3)

# create module for training: pair the model with its optimizer
imdb_module = SimpleModule.binary_classification(
    imdb_model,
    optimizer=imdb_optimizer,
)

In [None]:
# create logger
imdb_logger = CSVLogger('logs', name='IMDB')

# define the training routine: 30 epochs, deterministic, no progress bar,
# with an ErrorTracker callback attached
trainer_options = dict(
    deterministic=True,
    max_epochs=30,
    logger=imdb_logger,
    enable_progress_bar=False,
    callbacks=[ErrorTracker()],
)
imdb_trainer = Trainer(**trainer_options)

# train the model
imdb_trainer.fit(imdb_module, datamodule=imdb_dm)

# show testing results
test_results = imdb_trainer.test(imdb_module, datamodule=imdb_dm)
test_results