<a href="https://colab.research.google.com/github/wondercha/DL_experiments/blob/main/01a_titanic_dnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import pandas as pd

# Source CSVs for the labelled training passengers and the unlabelled test passengers.
TRAIN_URL = 'https://raw.githubusercontent.com/rpi-techfundamentals/fall2018-materials/master/input/train.csv'
TEST_URL = 'https://raw.githubusercontent.com/rpi-techfundamentals/fall2018-materials/master/input/test.csv'

train = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)

In [None]:
# Preview the first rows of the training frame to confirm the columns loaded as expected.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Fill missing Age and Fare values with the median of the TRAINING data.
# The same training statistic is applied to the test frame so both frames are
# imputed identically and nothing is estimated from the test set.
for column in ("Age", "Fare"):
    train_median = train[column].median()
    train[column] = train[column].fillna(train_median)
    test[column] = test[column].fillna(train_median)

In [None]:
# Recode and engineer features, applying exactly the same steps to train and test.

# Honorifics that are collapsed into a single 'Rare' class.
RARE_TITLES = ['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle']


def _extract_title(name):
    """Return the text after the first comma of ``name``, up to the next period, stripped."""
    return name.split(',')[1].split('.')[0].strip()


for frame in (train, test):
    # Missing ports of embarkation are filled with "S".
    frame["Embarked"] = frame["Embarked"].fillna("S")

    # Length of the full name string.
    frame['NameLength'] = frame['Name'].map(len)

    # Mapping over the column keeps the result aligned with the frame's own index,
    # whatever that index is; infrequent titles are then merged into 'Rare'.
    frame['Title'] = frame['Name'].map(_extract_title).replace(RARE_TITLES, 'Rare')

    # Family count: siblings/spouses + parents/children + the passenger.
    frame['FamilyS'] = frame['SibSp'] + frame['Parch'] + 1


In [None]:
#Create a categorical variable from the family count 
def family(x):
    """Map a numeric family count to a size label.

    Returns 'Single' for counts below 2, 'Couple' for exactly 2,
    'InterM' for counts up to 4, and 'Large' otherwise.
    """
    if x < 2:
        return 'Single'
    if x == 2:
        return 'Couple'
    return 'InterM' if x <= 4 else 'Large'
    
# Bucket the family count into size labels. The count is recomputed from
# SibSp/Parch here so that re-running this cell does not feed already-labelled
# strings back into family().
for frame in (train, test):
    frame['FamilyS'] = (frame['SibSp'] + frame['Parch'] + 1).apply(family)

#Drop some columns that won't be modeled.
dropped_columns = ['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket']
train_min = train.drop(columns=dropped_columns)
test_min = test.drop(columns=dropped_columns)
train_min

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,NameLength,Title,FamilyS
0,0,3,male,22.0,7.2500,S,23,Mr,Couple
1,1,1,female,38.0,71.2833,C,51,Mrs,Couple
2,1,3,female,26.0,7.9250,S,22,Miss,Single
3,1,1,female,35.0,53.1000,S,44,Mrs,Couple
4,0,3,male,35.0,8.0500,S,24,Mr,Single
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,S,21,Rare,Single
887,1,1,female,19.0,30.0000,S,28,Miss,Single
888,0,3,female,28.0,23.4500,S,40,Miss,InterM
889,1,1,male,26.0,30.0000,C,21,Mr,Single


In [None]:
# Convert the frames to numpy arrays: column 0 of train_min is the target,
# columns 1-8 are the features; test_min holds the same features in columns 0-7.
train_features = train_min.iloc[:, 1:9]
train_target = train_min.iloc[:, 0]
test_features = test_min.iloc[:, :8]

X_train = train_features.to_numpy()
Y_train = train_target.to_numpy()
X_test = test_features.to_numpy()

print(X_test[1], X_train[1])

[3 'female' 47.0 7.0 'S' 32 'Mrs' 'Couple'] [1 'female' 38.0 71.2833 'C' 51 'Mrs' 'Couple']


In [None]:
# Previously we used get dummies (part of pandas)
#Here we will first transform string labels to numeric categories. 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column positions of the string-valued features: Sex, Embarked, Title, FamilyS.
label_encoded_columns = [1, 4, 6, 7]

# One encoder per column, fitted on the training data only and then reused for
# the test data, so a given category always maps to the same integer code in
# both arrays. (Refitting on test would renumber the codes whenever the two
# sets do not contain exactly the same categories.)
label_encoders = {}
for column in label_encoded_columns:
    encoder = LabelEncoder()
    X_train[:, column] = encoder.fit_transform(X_train[:, column])
    X_test[:, column] = encoder.transform(X_test[:, column])
    label_encoders[column] = encoder

X_test.shape

(418, 8)

In [None]:
# Count missing values per column in both feature arrays (train first, then test).
print(pd.DataFrame(X_train).isna().sum())
print(pd.DataFrame(X_test).isna().sum())

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64


In [None]:
from sklearn.compose import ColumnTransformer
# Column positions of the label-encoded categoricals: Sex, Embarked, Title, FamilyS.
categorical_features = [1, 4, 6, 7]

# One-hot encode the categorical columns and pass the numeric ones through.
# handle_unknown="ignore" encodes a category never seen in training as all zeros
# instead of raising at transform time.
ct = ColumnTransformer(
    [("OneHot", OneHotEncoder(handle_unknown="ignore"), categorical_features)],
    remainder="passthrough",
)

# Fit on the training data only, then apply the SAME fitted transformer to the
# test data so both matrices have identical columns in identical order.
X_train = ct.fit_transform(X_train.tolist())
X_test = ct.transform(X_test.tolist())
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')

In [None]:
#Double check no missing values or values that aren't numeric.
# NaN totals for the encoded train and test matrices, then one encoded row for inspection.
nan_totals = (np.isnan(X_train).sum(), np.isnan(X_test).sum())
print(*nan_totals)
first_row = X_train[0]
print(first_row)

0 0
[ 0.    1.    0.    0.    1.    0.    0.    1.    0.    0.    1.    0.
  0.    0.    3.   22.    7.25 23.  ]


In [None]:
#Split the data
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation. The fixed random_state makes
# the split, and every accuracy reported from it, reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)
x_train.shape



(712, 18)

In [None]:
# Shape of the validation label array produced by the split.
y_val.shape

(179,)

In [None]:
#Define the model 
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected classifier with one hidden layer.

    Args:
        n_features: number of columns in the input matrix (18 for the encoded
            feature matrix built in this notebook).
        n_hidden: width of the hidden layer; arbitrary, but fc1's output size
            and fc2's input size must agree.
        n_classes: number of output classes (2: died / survived).
        dropout: probability of zeroing a hidden unit while training.
    """

    def __init__(self, n_features=18, n_hidden=270, n_classes=2, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(n_features, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_classes)
        self.dropout = dropout

    def forward(self, x):
        """Return raw class scores (logits) of shape (batch, n_classes)."""
        x = self.fc1(x)
        # training=self.training switches dropout off after net.eval(); the
        # functional form otherwise defaults to always dropping units.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = F.relu(x)
        # No sigmoid/softmax here: nn.CrossEntropyLoss applies log-softmax itself
        # and expects unnormalized logits. Squashing the scores into (0, 1) first
        # caps the attainable loss and can saturate the gradients.
        return self.fc2(x)


net = Net()

In [None]:
#Define training
batch_size = 50
num_epochs = 100
learning_rate = 0.01
# Ceiling division: the final, possibly smaller, batch is included. Floor
# division would skip the leftover rows every epoch and would yield zero
# batches (no training at all) when there are fewer rows than batch_size.
batch_no = (len(x_train) + batch_size - 1) // batch_size

In [None]:
# Loss function: cross-entropy between the network outputs and the integer class labels.
criterion = nn.CrossEntropyLoss()
# Optimizer: Adam, a variant of gradient descent, over all of the network's parameters.
optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate)

In [None]:
from sklearn.utils import shuffle
from torch.autograd import Variable
# Make sure the network is in training mode before optimizing.
net.train()

running_loss = 0.0
for epoch in range(num_epochs):
    # Reshuffle rows and labels together so every epoch sees different batches.
    x_train, y_train = shuffle(x_train, y_train)
    # Mini batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        # Plain tensors are sufficient; the Variable wrapper is deprecated and
        # no longer needed for autograd.
        inputs = torch.as_tensor(x_train[start:end], dtype=torch.float32)
        labels = torch.as_tensor(y_train[start:end], dtype=torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # accumulate the batch losses for this epoch
        running_loss += loss.item()

    # Report the summed batch loss of the epoch, then reset the accumulator.
    print('Epoch {}'.format(epoch+1), "loss: ",running_loss)
    running_loss = 0.0

        
        

Epoch 1 loss:  9.73462575674057
Epoch 2 loss:  9.704062223434448
Epoch 3 loss:  9.704062223434448
Epoch 4 loss:  9.704062223434448
Epoch 5 loss:  9.704062223434448
Epoch 6 loss:  9.704062223434448
Epoch 7 loss:  9.704062223434448
Epoch 8 loss:  9.704062223434448
Epoch 9 loss:  9.704062223434448
Epoch 10 loss:  9.704062223434448
Epoch 11 loss:  9.704062223434448
Epoch 12 loss:  9.704062223434448
Epoch 13 loss:  9.704062223434448
Epoch 14 loss:  9.704062223434448
Epoch 15 loss:  9.704062223434448
Epoch 16 loss:  9.704062223434448
Epoch 17 loss:  9.704062223434448
Epoch 18 loss:  9.704062223434448
Epoch 19 loss:  9.704062223434448
Epoch 20 loss:  9.704062223434448
Epoch 21 loss:  9.704062223434448
Epoch 22 loss:  9.704062223434448
Epoch 23 loss:  9.704062223434448
Epoch 24 loss:  9.704062223434448
Epoch 25 loss:  9.704062223434448
Epoch 26 loss:  9.704062223434448
Epoch 27 loss:  9.704062223434448
Epoch 28 loss:  9.704062223434448
Epoch 29 loss:  9.704062223434448
Epoch 30 loss:  9.704062

In [None]:
#This is a little bit tricky to get the resulting prediction.  
def calculate_accuracy(x, y=None, model=None):
    """Predict a class for every row of ``x``; report accuracy or return the predictions.

    Args:
        x: 2-D array-like of numeric features, one row per record.
        y: optional true class labels aligned with ``x``. When given and
            non-empty, the accuracy is printed and nothing is returned.
        model: network to evaluate. Defaults to the notebook-level ``net``.

    Returns:
        A numpy array of predicted class indices when no labels are supplied,
        otherwise ``None``.
    """
    if model is None:
        model = net
    features = torch.as_tensor(np.asarray(x), dtype=torch.float32)

    # Score in inference mode, then restore whatever mode the model was in so a
    # later training cell is not silently left in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            result = model(features)  # one score per class for each row
    finally:
        model.train(was_training)

    # The predicted class is the index of the largest score in each row.
    _, labels = torch.max(result, 1)
    predicted = labels.numpy()

    if y is not None and len(y) != 0:
        num_right = np.sum(predicted == y)
        print('Accuracy {:.2f}'.format(num_right / len(y)), "for a total of ", len(y), "records")
    else:
        print("returning predictions")
        return predicted
 

In [None]:
# Check out the accuracy. 
# Accuracy on the rows used for training and on the held-out validation rows.
calculate_accuracy(x_train, y_train)
calculate_accuracy(x_val, y_val)

# No labels exist for the test set, so this call returns the predicted classes.
predictions = calculate_accuracy(X_test)
print(len(predictions), "test predictions")
# Show only the first few predictions instead of dumping the whole array.
predictions[:20]

Accuracy 0.61 for a total of  712 records
Accuracy 0.69 for a total of  179 records
returning predictions


array([0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,

In [None]:
#Writing to File
# Pair each test PassengerId with its predicted Survived value.
submission = test[['PassengerId']].assign(Survived=predictions)

# Write the two-column table to disk without the row index.
submission.to_csv('submission.csv', index=False)


In [None]:
# Offer the submission file as a browser download when running on Google Colab.
# Outside Colab the google.colab package is unavailable; the file simply stays
# on disk and the notebook still runs top to bottom.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Evaluate the model
# Put the network in inference mode before scoring the validation rows.
net.eval()
with torch.no_grad():
    # A plain float tensor is enough; no gradients are needed for evaluation.
    result = net(torch.as_tensor(x_val, dtype=torch.float32))
# The predicted class is the index of the largest score in each row.
values, labels = torch.max(result, 1)
num_right = np.sum(labels.numpy() == y_val)
print('Accuracy {:.2f}'.format(num_right / len(y_val)))

Accuracy 0.69
