# Titanic suvival prediction using pytorch

In [None]:
# download data from kaggle using kaggle cli

# set the path to kaggle.json file

# download the dataset
!kaggle competitions download -c titanic -p dataset
!unzip dataset/titanic.zip -d dataset/titanic
!rm dataset/titanic.zip

In [1]:
# read the dataset
import pandas as pd
df = pd.read_csv('dataset/titanic/train.csv')
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Data Preprocessing

# tokenization for name column
def tokenize_name(name):
    # split the name into tokens
    tokens = name.split()
    # remove the punctuations and commas
    tokens = [token.replace(',', '').replace('.', '') for token in tokens]
    return tokens


# tokenize the ticket column
def tokenize_ticket(ticket):
    # convert the ticket number to a list chars
    chars = list(ticket)
    return chars


df['Name'] = df['Name'].apply(tokenize_name)
df['Ticket'] = df['Ticket'].apply(tokenize_ticket)



In [3]:
# check for missing values
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# fill in the missing values as 'unknown'
df['Cabin'] = df['Cabin'].fillna('unknown')
df['Embarked'] = df['Embarked'].fillna('unknown')
df['Sex'] = df['Sex'].fillna('unknown')

# fill in the missing age with mean age
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [5]:
# encode the categorical columns
from sklearn.preprocessing import LabelEncoder

sex_encoder = LabelEncoder()
embarke_encoder = LabelEncoder()
cabin_encoder = LabelEncoder()



df['Sex'] = sex_encoder.fit_transform(df['Sex'])
df['Embarked'] = embarke_encoder.fit_transform(df['Embarked'].astype(str))
df['Cabin'] = cabin_encoder.fit_transform(df['Cabin'].astype(str))

# pclass need to start from 0 
df['Pclass'] = df['Pclass'] - 1

In [6]:
# print the cardinality of each categorical column

print('sex', len(sex_encoder.classes_))
print('embarked', len(embarke_encoder.classes_))
print('cabin', len(cabin_encoder.classes_))

sex 2
embarked 4
cabin 148


In [7]:
# dictionary to convert the name tokens to integers
name_token_to_int = {}
int_to_name_token = {}

# convert the name tokens to integers
for name_tokens in df['Name']:
    for token in name_tokens:
        if token not in name_token_to_int:
            name_token_to_int[token] = len(name_token_to_int)
            int_to_name_token[len(int_to_name_token)] = token

# dictionary to convert the ticket chars to integers
ticket_char_to_int = {}
int_to_ticket_char = {}

# convert the ticket chars to integers
for ticket_chars in df['Ticket']:
    for char in ticket_chars:
        if char not in ticket_char_to_int:
            ticket_char_to_int[char] = len(ticket_char_to_int)
            int_to_ticket_char[len(int_to_ticket_char)] = char

# add padding values to the dictionaries
name_token_to_int['<pad>'] = len(name_token_to_int)
int_to_name_token[len(int_to_name_token)] = '<pad>'
ticket_char_to_int['<pad>'] = len(ticket_char_to_int)
int_to_ticket_char[len(int_to_ticket_char)] = '<pad>'

# size of the vocabulary for name and ticket columns

print('name vocabulary size', len(name_token_to_int))
print('ticket vocabulary size', len(ticket_char_to_int))

name vocabulary size 1639
ticket vocabulary size 36


In [8]:
# convert the name tokens to integers
df['Name'] = df['Name'].apply(lambda tokens: [name_token_to_int[token] for token in tokens])

# convert the ticket chars to integers
df['Ticket'] = df['Ticket'].apply(lambda chars: [ticket_char_to_int[char] for char in chars])

In [9]:
# split the dataset into train and test sets

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print('train size', len(train_df))
print('test size', len(test_df))

train size 712
test size 179


In [10]:
# define the pytorch dataclass that wraps the dataset

import torch
from torch.utils.data import Dataset

class TitanicDataset(Dataset):

    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df
        self.is_train = True

    def set_mode(self, is_train=True):
        self.is_train = is_train

    def __len__(self):
        if self.is_train:
            return len(self.train_df)
        else:
            return len(self.test_df)

    def __getitem__(self, idx):
        if self.is_train:
            row = self.train_df.iloc[idx]
        else:
            row = self.test_df.iloc[idx]

        # convert the row to a dict of tensors
        # label 
        label = torch.tensor(row['Survived'], dtype=torch.long)

        # features 
        # drop the unnecessary columns
        cols_to_drop = ['PassengerId', 'Survived']
        features = row.drop(cols_to_drop)

        # convert the features to tensors
        features = {
            'name': torch.tensor(features['Name'], dtype=torch.long),
            'pclass': torch.tensor(features['Pclass'], dtype=torch.long),
            'ticket': torch.tensor(features['Ticket'], dtype=torch.long),
            'sex': torch.tensor(features['Sex'], dtype=torch.long),
            'cabin': torch.tensor(features['Cabin'], dtype=torch.long),
            'embarked': torch.tensor(features['Embarked'], dtype=torch.long),
            'numericals': torch.tensor(features[['Age', 'SibSp', 'Parch', 'Fare']].astype(float).values, dtype=torch.float),
            'label': label
        }

        return features


In [11]:
# create the train and test datasets
dataset = TitanicDataset(train_df, test_df)

print('train size', len(dataset))
print('test size', len(dataset))

# sample a row from the dataset
print(dataset[0])

train size 712
test size 712
{'name': tensor([737,   1, 738]), 'pclass': tensor(0), 'ticket': tensor([ 5,  5, 15, 16, 18, 15]), 'sex': tensor(1), 'cabin': tensor(56), 'embarked': tensor(2), 'numericals': tensor([45.5000,  0.0000,  0.0000, 28.5000]), 'label': tensor(0)}


In [12]:
# define the pytorch dataloader

from torch.utils.data import DataLoader

batch_size = 32

# define a custom collate function to pad the sequences of name and ticket columns
def collate_fn(batch):
    # batch is a list of dicts
    # each dict is a row from the dataset
    # we need to convert the list of dicts to a dict of tensors
    # we need to pad the sequences of name and ticket columns

    # convert the list of dicts to a dict of tensors
    batch = {key: [d[key] for d in batch] for key in batch[0]}

    # pad the sequences of name and ticket columns
    batch['name'] = torch.nn.utils.rnn.pad_sequence(batch['name'], batch_first=True, padding_value=name_token_to_int['<pad>'])
    batch['ticket'] = torch.nn.utils.rnn.pad_sequence(batch['ticket'], batch_first=True, padding_value=ticket_char_to_int['<pad>'])

    # for other columns, convert the list of tensors to a tensor
    for k in batch:
        if k not in ['name', 'ticket']:
            batch[k] = torch.stack(batch[k], dim=0)

    return batch

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# sample a batch from the dataloader

for batch in train_loader:
    print(batch)
    break


{'name': tensor([[1476,   12, 1477, 1638, 1638, 1638, 1638, 1638],
        [ 995,    5,  326,   22, 1381, 1382, 1638, 1638],
        [1207,   12,   46,   34, 1638, 1638, 1638, 1638],
        [1110,    1,    6, 1111,  206, 1638, 1638, 1638],
        [ 580,  377,  119,  581, 1638, 1638, 1638, 1638],
        [1380,    1,  553,  978, 1638, 1638, 1638, 1638],
        [1305,    1,  283,  842, 1638, 1638, 1638, 1638],
        [1248,    1, 1376, 1364, 1638, 1638, 1638, 1638],
        [ 142,    1,  143, 1638, 1638, 1638, 1638, 1638],
        [1603,    1, 1604, 1638, 1638, 1638, 1638, 1638],
        [1495,    1, 1496, 1638, 1638, 1638, 1638, 1638],
        [ 611,    5,   21,   79,  383,  612, 1638, 1638],
        [1406,    1,  295,   24, 1638, 1638, 1638, 1638],
        [ 664,    5,  681,   27,  463, 1030, 1031, 1032],
        [ 617,    5,  164,  136,  618, 1638, 1638, 1638],
        [1285,    1, 1286, 1287, 1638, 1638, 1638, 1638],
        [ 475,    5,   24,   72,  476,  477, 1638, 1638],
     

In [13]:
print('n pclass', len(train_df['Pclass'].unique()))
print('n embarked', len(train_df['Embarked'].unique()))
print('n sex', len(train_df['Sex'].unique()))

n pclass 3
n embarked 4
n sex 2


In [14]:
# define the model architecture

import torch.nn as nn
import torch.nn.functional as F

class SurivalModel(nn.Module):
    """
    For name, ticket, we use an embedding layer, followed by a RNN layer
    For categorical columns, we use embedding layers and concat the embeddings with the numericals

    Then we pass the concatenated embeddings and numericals through a MLP
    To produce logits for the binary classification task
    """

    def __init__(self, n_name_tokens, n_ticket_tokens):

        super(SurivalModel, self).__init__()
        name_embedding_dim = 32
        ticket_embedding_dim = 32
        categorical_embedding_dim = 8

        n_numerical_features = 4

        self.name_embedding = nn.Embedding(n_name_tokens, name_embedding_dim)
        self.ticket_embedding = nn.Embedding(n_ticket_tokens, ticket_embedding_dim)

        self.name_rnn = nn.GRU(name_embedding_dim, name_embedding_dim, batch_first=True)
        self.ticket_rnn = nn.GRU(ticket_embedding_dim, ticket_embedding_dim, batch_first=True)

        self.pclass_embedding = nn.Embedding(4, categorical_embedding_dim)
        self.embarke_embedding = nn.Embedding(4, categorical_embedding_dim)
        self.sex_embedding = nn.Embedding(2, categorical_embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(
                name_embedding_dim + ticket_embedding_dim + 3 * categorical_embedding_dim + n_numerical_features,
                32
            ),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
        )

    def forward(self, features):
        # feature is a dict of tensors
        # each tensor is of shape (batch_size, seq_len) or (batch_size,)
        # seq_len is the length of the sequence for name and ticket columns

        # name
        name = features['name'] # (batch_size, seq_len)
        name = self.name_embedding(name) # (batch_size, seq_len, name_embedding_dim)
        name, _ = self.name_rnn(name) # (batch_size, seq_len, name_embedding_dim)
        name = name[:, -1, :] # (batch_size, name_embedding_dim)

        # ticket
        ticket = features['ticket'] # (batch_size, seq_len)
        ticket = self.ticket_embedding(ticket) # (batch_size, seq_len, ticket_embedding_dim)
        ticket, _ = self.ticket_rnn(ticket) # (batch_size, seq_len, ticket_embedding_dim)
        ticket = ticket[:, -1, :] # (batch_size, ticket_embedding_dim)

        # categorical columns

        pclass = features['pclass'] # (batch_size,)
        pclass = self.pclass_embedding(pclass) # (batch_size, categorical_embedding_dim)

        embarked = features['embarked'] # (batch_size,)
        embarked = self.embarke_embedding(embarked) # (batch_size, categorical_embedding_dim)

        sex = features['sex'] # (batch_size,)
        sex = self.sex_embedding(sex) # (batch_size, categorical_embedding_dim)

        # numerical columns
        numericals = features['numericals'] # (batch_size, n_numerical_features)

        # concat the embeddings and numericals
        # import pdb; pdb.set_trace()


        hidden = torch.concat(
            (name, 
            ticket,
            pclass,
            embarked,
            sex,
            numericals),
            dim=1
        ) # (batch_size, name_embedding_dim + ticket_embedding_dim + 3 + 3 + 2 + n_numerical_features) 

        # pass through the MLP
        logits = self.mlp(hidden) # (batch_size, 1)

        return logits









In [15]:
# try pass a batch through the model

model = SurivalModel(len(name_token_to_int), len(ticket_char_to_int))

for batch in train_loader:

    # pass the batch through the model
    logits = model(batch)

    print(logits.shape)

    break

torch.Size([32, 1])


In [16]:
# training logic 

import torch.optim as optim






def train_model(model, data_loader, num_epochs, optimizer):

    for epoch in range(num_epochs):

        model.train()

        for batch_idx, batch in enumerate(data_loader):

            # pass the batch through the model
            logits = model(batch)

            # compute the loss
            loss = F.binary_cross_entropy_with_logits(logits, batch['label'].unsqueeze(1).float())

            # compute the gradients
            optimizer.zero_grad()
            loss.backward()

            # update the parameters
            optimizer.step()

            if batch_idx % log_interval == 0:
                print('Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(batch), len(data_loader.dataset),
                    100. * batch_idx / len(data_loader), loss.item()))



def eval_model(model, data_loader):
    
        model.eval()
    
        correct = 0
    
        with torch.no_grad():
            for batch_idx, batch in enumerate(data_loader):
    
                # pass the batch through the model
                logits = model(batch)
    
                # compute the loss
                loss = F.binary_cross_entropy_with_logits(logits, batch['label'].unsqueeze(1).float())
    
                # compute the accuracy
                preds = torch.sigmoid(logits)
                preds = (preds > 0.5).long()
                correct += preds.eq(batch['label'].view_as(preds)).sum().item()
    
        print('Test accuracy: {}/{} ({:.0f}%)\n'.format(
            correct, len(data_loader.dataset),
            100. * correct / len(data_loader.dataset)))

In [19]:
# train the model
lr = 1e-3
num_epochs = 20
log_interval = 10

model = SurivalModel(len(name_token_to_int), len(ticket_char_to_int))

optimizer = optim.Adam(model.parameters(), lr=lr)

train_model(model, train_loader, num_epochs, optimizer)



In [20]:
# evaluate the model

test_dataset = TitanicDataset(train_df, test_df)
test_dataset.set_mode('test')

test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

eval_model(model, test_loader)


Test accuracy: 692/712 (97%)

