In [None]:
%pip install torch torchvision matplotlib torchinfo torchviz

import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

torch.manual_seed(45)
np.random.seed(45)

In [108]:
from torch.utils.data import Dataset
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

In [117]:
### TASK 1 ###

class TitanicDataset(Dataset): 
    def __init__(self, DataFrame, Columns, categories, transformer, fit_transform=True, test=False):
        df = self.one_hot_encode(DataFrame, Columns, categories)
        df = self.scale(transformer, df, fit_transform)

        if (test == True): 
            X = df.values
            self.X = torch.tensor(X, dtype=torch.float32)
            self.y = None
            self.feats = df.columns.tolist()
        else: 
            y = df["Survived"].values
            X = df.drop(columns=["Survived"]).values

            self.X = torch.tensor(X, dtype=torch.float32)
            self.y = torch.tensor(y, dtype=torch.float32)
            self.feats = df.columns.tolist()

    def __len__(self): 
        return len(self.X) 

    def __getitem__(self, idx): 
        if (self.y is not None):
            return self.X[idx], self.y[idx]

        return self.X[idx]           

    def one_hot_encode(self, DataFrame, Columns, categories): 
        df = DataFrame.copy() #create a copy to preserve original df

        column_names = []
        # loop through provided columns and convert indices to column names for use
        for i in Columns: 
            if (isinstance(i, int)): 
                column_names.append(df.columns[i])
            else: 
                column_names.append(i)

        # loop through all columns 
        for col in column_names: 
            # get all categories within the column 
            values = categories[col]

            # for all but the first category (ie. drop the first category)
            for val in values[1:]:
                new_col_name = f"{col}_{val}" #create new name

                # create a new column with the new title. Then we must assign each sample 
                # a value in this column. If the original categorical column's value for the sample is 
                # equal to the category we are building a column for this comparison will return
                # true, converted to 1, otherwise it will return false, converted to 0. 
                df[new_col_name] = (df[col] == val).astype(np.float32)
        
            df = df.drop(columns = [col]) #drop the original categorical column
    
        all_column_names = df.columns.tolist()

        #convert df to matrix and return 
        #numpy_mat = df.to_numpy(dtype=np.float32)
        #return numpy_mat, all_column_names

        return df
    
    def scale(self, transformer, DataFrame, fit_transform=True): 
        df = DataFrame.copy()

        if(fit_transform):
            data = transformer.fit_transform(df)
        else:
            data = transformer.transform(df)

        # Add normalized data to dataframe
        for index, column in enumerate(transformer.get_feature_names_out()):
            df[column] = data[:,index]
        
        # Return
        return df
        


In [118]:
# Reading in data
train_df_ori = pd.read_csv('train.csv')
test_df_ori = pd.read_csv('test.csv')

# Saving passenger id's for kaggle submission
passenger_ids = test_df_ori["PassengerId"]

train_df = train_df_ori.copy()
test_df = test_df_ori.copy()


In [119]:
irrelevant_feats = ['PassengerId', 'Name', 'Cabin', 'Ticket'] 
train_df.drop(irrelevant_feats, axis=1, inplace=True)
test_df.drop(irrelevant_feats, axis=1, inplace=True)

In [120]:
# Imputing - This step will fill in blank or missing values in the dataset

numerical_feats = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
categorical_feats = ["Sex", "Embarked"]

# We will replace missing numerical values with the median for that feature
num_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# We will replace missing categorical values with the mode for that feature 
cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# For every column other than "Survived", search for missing values
# and replace them using the corresponding imputer 
for column in train_df.columns[1:]: 
    if (column in numerical_feats): 
        fill = num_imputer.fit_transform(train_df[column].values.reshape(-1,1))
        train_df[column] = fill.ravel()
        fill = num_imputer.transform(test_df[column].values.reshape(-1,1))
        test_df[column] = fill.ravel()
    else: 
        fill = cat_imputer.fit_transform(train_df[column].values.reshape(-1,1))
        train_df[column] = fill.ravel()
        fill = cat_imputer.transform(test_df[column].values.reshape(-1,1))
        test_df[column] = fill.ravel()

In [121]:
# Using ColumnTransformer to apply specific normalization to each column in our data
norm_scaler = ColumnTransformer(
    transformers=[
        ('minmax', MinMaxScaler(), ['Pclass', 'Parch']),  
        ('standard', StandardScaler(), ['Age', 'SibSp', 'Fare'])
    ], verbose_feature_names_out=False
)

In [122]:
df_train, df_val = train_test_split(train_df, test_size=0.15, stratify = train_df['Survived'], random_state=45)

In [41]:
df_train.head

<bound method NDFrame.head of      Survived  Pclass     Sex    Age  SibSp  Parch     Fare Embarked
644         1     3.0  female   0.75    2.0    1.0  19.2583        C
661         0     3.0    male  40.00    0.0    0.0   7.2250        C
890         0     3.0    male  32.00    0.0    0.0   7.7500        Q
109         1     3.0  female  28.00    1.0    0.0  24.1500        Q
838         1     3.0    male  32.00    0.0    0.0  56.4958        S
..        ...     ...     ...    ...    ...    ...      ...      ...
790         0     3.0    male  28.00    0.0    0.0   7.7500        Q
368         1     3.0  female  28.00    0.0    0.0   7.7500        Q
568         0     3.0    male  28.00    0.0    0.0   7.2292        C
3           1     1.0  female  35.00    1.0    0.0  53.1000        S
182         0     3.0    male   9.00    4.0    2.0  31.3875        S

[757 rows x 8 columns]>

In [123]:
categorical_feats = ["Sex", "Embarked"]

# When using the custom dataset class we need to create a dictionary ordering 
# the categories for onehotencoding, otherwise the test, val, and training sets
# may end up with different columns
categories = {}
for col in ['Sex', 'Embarked']:
    categories[col] = train_df[col].unique().tolist()

train_dataset = TitanicDataset(df_train, categorical_feats, categories, norm_scaler, True, False)
val_dataset = TitanicDataset(df_val, categorical_feats, categories, norm_scaler, False, False)
test_dataset = TitanicDataset(test_df, categorical_feats, categories, norm_scaler, False, True)

In [None]:
print(train_dataset.feats)
print(train_dataset.X[0])

print("\n", val_dataset.feats)
print(val_dataset.X[0])

print("\n", test_dataset.feats)
print(test_dataset.X[0])

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q']
tensor([ 1.0000, -2.2340,  1.3450,  0.2000, -0.2647,  1.0000,  1.0000,  0.0000])

, ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q']
tensor([ 1.0000,  2.3021, -0.4846,  0.0000, -0.4948,  0.0000,  0.0000,  0.0000])

 ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 'Embarked_C', 'Embarked_Q']
tensor([ 1.0000,  0.3943, -0.4846,  0.0000, -0.4837,  0.0000,  0.0000,  1.0000])


In [124]:
import torch.nn as nn
import torch.optim

In [125]:
# TASK 2

class MLP_Network(nn.Module): 
    def __init__(self, input_size, hidden_size, num_classes, network_depth, learning_rate, regularization): 
        super(MLP_Network, self).__init__()

        self.model = nn.Sequential()

        in_size = input_size
        out_size = hidden_size
        for layer in range(network_depth): 
            self.model.add_module(f"layer_{layer}", nn.Linear(in_size, out_size))
            self.model.add_module(f"activation_{layer}", nn.ReLU())

            in_size = out_size
            out_size = max(1, out_size//2)

        self.model.add_module(f"output_layer", nn.Linear(in_size, num_classes))
        #self.model.add_module(f"output_activation", nn.ReLU())

        self.loss = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(params=self.model.parameters(), lr=learning_rate, weight_decay = regularization)

    def forward(self, x): 
        return self.model(x)    


In [126]:
x = MLP_Network(8, 16, 2, 3, 0.01, 0.0)

x.to(device)

print(x)

MLP_Network(
  (model): Sequential(
    (layer_0): Linear(in_features=8, out_features=16, bias=True)
    (activation_0): ReLU()
    (layer_1): Linear(in_features=16, out_features=8, bias=True)
    (activation_1): ReLU()
    (layer_2): Linear(in_features=8, out_features=4, bias=True)
    (activation_2): ReLU()
    (output_layer): Linear(in_features=4, out_features=2, bias=True)
  )
  (loss): CrossEntropyLoss()
)


In [127]:
train_loader = DataLoader(
    # these variable primarily affect efficiency at fetching data
    train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0, # when running on windows num_worker must equal zero, there is a well known pytorch bug forcing this 
    pin_memory=False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

In [132]:
def train(model, train_loader, val_loader, num_epochs): 
    for epoch in range(num_epochs): 
        model.train()

        # We will track loss and accuracy for each epoch 
        running_loss = 0.0
        running_corrects = 0

        # each epoch is broken into batches of 64 images
        for inputs, labels in train_loader: 
            inputs, labels = inputs.to(device), labels.to(device)
            labels=labels.long()

            model.optimizer.zero_grad()
            outputs = model(inputs)
            loss = model.loss(outputs, labels)
            loss.backward()
            model.optimizer.step()

            _, preds = torch.max(outputs,1)
            running_loss += loss.item() * inputs.size(0)
            running_corrects += (preds == labels).sum().item()

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = running_corrects / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_loss} | Accuracy: {epoch_acc}")

        # Validation loop 
        model.eval()
        val_loss = 0.0
        val_corrects = 0
        with torch.no_grad(): 
            for inputs, labels in val_loader: 
                inputs, labels = inputs.to(device), labels.to(device)
                labels = labels.long()
                
                outputs = model(inputs)
                loss = model.loss(outputs, labels)
                _, preds = torch.max(outputs, 1)
                val_loss += loss.item() * inputs.size(0)
                val_corrects += (preds == labels).sum().item()
        
        val_loss /= len(val_loader.dataset)
        val_acc = val_corrects / len(val_loader.dataset)
        print(f"Validation Loss: {val_loss} | Accuracy: {val_acc}")

In [135]:
train(x, train_loader, val_loader, 250)

Epoch 1/250 | Train Loss: 0.6638335035215753 | Accuracy: 0.6155878467635403
Validation Loss: 0.6613600334124778 | Accuracy: 0.6194029850746269
Epoch 2/250 | Train Loss: 0.663709176263847 | Accuracy: 0.6155878467635403
Validation Loss: 0.6612513216573801 | Accuracy: 0.6194029850746269
Epoch 3/250 | Train Loss: 0.6636092702180265 | Accuracy: 0.6155878467635403
Validation Loss: 0.6611372445946309 | Accuracy: 0.6194029850746269
Epoch 4/250 | Train Loss: 0.6635142672644582 | Accuracy: 0.6155878467635403
Validation Loss: 0.6610281814390154 | Accuracy: 0.6194029850746269
Epoch 5/250 | Train Loss: 0.6634170256328709 | Accuracy: 0.6155878467635403
Validation Loss: 0.6609170579198581 | Accuracy: 0.6194029850746269
Epoch 6/250 | Train Loss: 0.6633031581635519 | Accuracy: 0.6155878467635403
Validation Loss: 0.6608141954265424 | Accuracy: 0.6194029850746269
Epoch 7/250 | Train Loss: 0.6632159508203735 | Accuracy: 0.6155878467635403
Validation Loss: 0.6607068376754647 | Accuracy: 0.6194029850746269
