In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

### Exercise: the titanic

In [2]:
# Load the Titanic dataset
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
titanic_df = pd.read_csv(url)

# Drop rows with missing values for simplicity
titanic_df = titanic_df.dropna()

In [3]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [4]:
titanic_df.corr()

  titanic_df.corr()


Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
Survived,1.0,-0.336528,-0.059665,-0.037082,0.080097,0.256179
Pclass,-0.336528,1.0,-0.391492,0.085026,0.020252,-0.548919
Age,-0.059665,-0.391492,1.0,-0.297669,-0.193741,0.112329
Siblings/Spouses Aboard,-0.037082,0.085026,-0.297669,1.0,0.414244,0.158839
Parents/Children Aboard,0.080097,0.020252,-0.193741,0.414244,1.0,0.21547
Fare,0.256179,-0.548919,0.112329,0.158839,0.21547,1.0


## Your exercise:

Create a neural network as above to model survival on the titanic dataset. The steps below break it down into pieces, mostly very parallel with the penguin exercise. Full credit if the thing works and you can train the model. Extra credit to the person with the highest-performing model (see steps below). 

Please leave all of the cell outputs in your submission for me to see, so that I can read the training and evaluation outcomes. Thanks!

## Question 0

Select which features you will use and do any sklearn / pandas preprocessing you want (standard scaling? encoding? etc.). Do a pandas train / test split with the settings 

In [5]:
encoder = LabelEncoder()
titanic_df['male'] = (titanic_df['Sex'] == 'male')
titanic_df['female'] = (titanic_df['Sex'] == 'female')
titanic_df['class1'] = (titanic_df['Pclass'] == 1)
titanic_df['class2'] = (titanic_df['Pclass'] == 2)
titanic_df['class3'] = (titanic_df['Pclass'] == 3)
titanic_df['female'] = encoder.fit_transform(titanic_df['female'])
titanic_df['male'] = encoder.fit_transform(titanic_df['male'])
titanic_df['class1'] = encoder.fit_transform(titanic_df['class1'])
titanic_df['class2'] = encoder.fit_transform(titanic_df['class2'])
titanic_df['class3'] = encoder.fit_transform(titanic_df['class3'])
titanic_df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,male,female,class1,class2,class3
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500,1,0,0,0,1
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833,0,1,1,0,0
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250,0,1,0,0,1
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000,0,1,1,0,0
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000,1,0,0,1,0
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000,0,1,1,0,0
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500,0,1,0,0,1
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000,1,0,1,0,0


In [6]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(titanic_df[['class1','class2','class3','male','female','Age']])
titanic_df[['class1','class2','class3','male','female','Age']] = scaled_features

In [7]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,male,female,class1,class2,class3
0,0,3,Mr. Owen Harris Braund,male,-0.529366,1,0,7.25,0.740266,-0.740266,-0.567369,-0.511601,0.906287
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,0.604265,1,0,71.2833,-1.350867,1.350867,1.762521,-0.511601,-1.103404
2,1,3,Miss. Laina Heikkinen,female,-0.245958,0,0,7.925,-1.350867,1.350867,-0.567369,-0.511601,0.906287
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,0.391709,1,0,53.1,-1.350867,1.350867,1.762521,-0.511601,-1.103404
4,0,3,Mr. William Henry Allen,male,0.391709,0,0,8.05,0.740266,-0.740266,-0.567369,-0.511601,0.906287


## Question 1

Define a TitanicDataset class which is a subclass of PyTorch Dataset. Define the requisite three functions based on your data from question 0.

In [8]:
# Define a custom PyTorch dataset
class TitanicDataset(Dataset):
    def __init__(self, data):
        self.X = data[['class1','class2','class3','male','female','Age']].values
        self.y = data['Survived'].values
        self.n_samples = len(data)
        
    def __getitem__(self, index):
        return torch.tensor(self.X[index],dtype = torch.float32), torch.tensor(self.y[index],dtype = torch.int64)

    def __len__(self):
        return self.n_samples

## Question 2

create a train_dataset and test_dataset using the train / test data from your split and your TitanicDataset class definition. Then create train_loader and test_loader dataloaders (batch size is up to you!). It never hurts to print out a few __getitem__'s to make sure things are working!

In [17]:
# Create PyTorch datasets and dataloaders
train_data, test_data = train_test_split(titanic_df, test_size=0.2, random_state=42)

train_dataset = TitanicDataset(train_data)
test_dataset = TitanicDataset(test_data)

train_loader = DataLoader(dataset = train_dataset, batch_size = 32, shuffle=True)
test_loader = DataLoader(dataset = test_dataset, batch_size = 32, shuffle=True)

In [18]:
train_dataset.__getitem__(10)

(tensor([-0.5674, -0.5116,  0.9063, -1.3509,  1.3509,  1.1002]), tensor(0))

In [19]:
test_dataset.__getitem__(100)

(tensor([ 1.7625, -0.5116, -1.1034,  0.7403, -0.7403,  1.5962]), tensor(0))

## Question 3

Define a neural network model, with __init__ and forward methods. Up to you how many hidden layers you use and which activation functions you choose. You will want either two outputs if you want to model the two classes as we did penguin species in class, or one output if you use the BCE loss and a sigmoid activation function to make the single output into a probability.

In [20]:
# Define a simple neural network with one hidden layer
class TitanicNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(TitanicNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

## Question 4

Create instances of your model, your loss criterion, and your optimizer. There are some choices to make here (hidden layer size(s), loss criterion to match your model output, adam or SGD optimizer, which learning rate?).

In [21]:
# Initialize the model, loss function, and optimizer
input_size = 6
hidden_size = 64
num_classes = len(encoder.classes_)
learning_rate = 0.001

model = TitanicNN(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## Question 5

Run a training loop for however long you want to train your model. Make sure to print your loss at least a few times to verify that it is dropping as you train.

In [27]:
# Training loop
num_epochs = 50 #or whatever

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        #targets = targets.view(-1,1)
        #targets = targets.float()
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    epoch_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/50, Loss: 0.4748
Epoch 2/50, Loss: 0.4738
Epoch 3/50, Loss: 0.4765
Epoch 4/50, Loss: 0.4757
Epoch 5/50, Loss: 0.4737
Epoch 6/50, Loss: 0.4850
Epoch 7/50, Loss: 0.4782
Epoch 8/50, Loss: 0.4858
Epoch 9/50, Loss: 0.4736
Epoch 10/50, Loss: 0.4783
Epoch 11/50, Loss: 0.4773
Epoch 12/50, Loss: 0.4803
Epoch 13/50, Loss: 0.4906
Epoch 14/50, Loss: 0.4731
Epoch 15/50, Loss: 0.4749
Epoch 16/50, Loss: 0.4865
Epoch 17/50, Loss: 0.4784
Epoch 18/50, Loss: 0.4741
Epoch 19/50, Loss: 0.4819
Epoch 20/50, Loss: 0.4745
Epoch 21/50, Loss: 0.4860
Epoch 22/50, Loss: 0.4729
Epoch 23/50, Loss: 0.4822
Epoch 24/50, Loss: 0.4728
Epoch 25/50, Loss: 0.4725
Epoch 26/50, Loss: 0.4804
Epoch 27/50, Loss: 0.4809
Epoch 28/50, Loss: 0.4761
Epoch 29/50, Loss: 0.4723
Epoch 30/50, Loss: 0.4827
Epoch 31/50, Loss: 0.4823
Epoch 32/50, Loss: 0.4791
Epoch 33/50, Loss: 0.4753
Epoch 34/50, Loss: 0.4723
Epoch 35/50, Loss: 0.4820
Epoch 36/50, Loss: 0.4742
Epoch 37/50, Loss: 0.4745
Epoch 38/50, Loss: 0.4794
Epoch 39/50, Loss: 0.

## Question 6

Evaluate your model and print the accuracy percentage on the test set.

In [28]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {accuracy:.2%}")
# You can do all of this differently, just print the test accuracy please

Accuracy on test set: 75.84%


## BONUS POINTS

...to the person with the highest accuracy %! optimize away!