# Trying to fit a basic deep learning model with PyTorch

## Preparing data first in the most direct way

In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [29]:
from matplotlib import pyplot as plt

# Load the two raw Kaggle splits; each frame gets a human-readable label right after loading.
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_train.name = 'Training Set'

df_test = pd.read_csv('/kaggle/input/titanic/test.csv')
df_test.name = 'Test Set'

# Stack train on top of test so both go through the same preprocessing.
# The original row labels are kept, so index values repeat across the two parts.
df_all = pd.concat([df_train, df_test])
df_all.name = 'All Set'

df_all.sample(5)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
865,866,1.0,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0,,S
814,815,0.0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S
627,628,1.0,1,"Longley, Miss. Gretchen Fiske",female,21.0,0,0,13502,77.9583,D9,S
66,67,1.0,2,"Nye, Mrs. (Elizabeth Ramell)",female,29.0,0,0,C.A. 29395,10.5,F33,S
31,923,,2,"Jefferys, Mr. Clifford Thomas",male,24.0,2,0,C.A. 31029,31.5,,S


In [30]:
def prepare_features_basic(data):
    """Build model-ready features using simple mode imputation.

    Steps: fill missing Age, Embarked and Fare with the column mode; one-hot
    encode Sex, Pclass and Embarked; drop Cabin, Name, Ticket and PassengerId;
    scale Age, SibSp and Parch by their maximum; log-transform Fare and scale
    it by its maximum.

    Args:
        data: DataFrame holding the columns PassengerId, Pclass, Name, Sex,
            Age, SibSp, Parch, Ticket, Fare, Cabin and Embarked. Any other
            column (e.g. Survived) is passed through unchanged.

    Returns:
        A new DataFrame with the prepared features. The input frame is not
        modified. All statistics (modes, maxima) are computed on `data` itself.
    """
    # Work on a copy so the caller's frame stays intact and the cell can be re-run.
    data = data.copy()
    # missing values: mode() can return several tied values, so always take the first.
    # The result of fillna must be assigned back, otherwise nothing is filled.
    for column in ('Age', 'Embarked', 'Fare'):
        data[column] = data[column].fillna(data[column].mode().iloc[0])
    # Cabin needs no imputation because it is dropped below
    # categorical: an explicit integer dtype keeps the indicators numeric on every pandas version
    data = pd.get_dummies(data, columns=['Sex', 'Pclass', 'Embarked'], dtype='uint8')
    # useless columns
    data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
    # numerical: scale by the column maximum
    for column in ('Age', 'SibSp', 'Parch'):
        data[column] = data[column] / data[column].max()
    # Fare is log-transformed before scaling
    data['Fare'] = np.log1p(data['Fare'])
    data['Fare'] = data['Fare'] / data['Fare'].max()
    return data

In [31]:
def prepare_features_initial(data):
    """Build model-ready features with sex-specific age imputation.

    Steps: fill missing Age with the mean age of passengers of the same sex;
    set Embarked to 'S' for PassengerId 62 and 830; set Fare for PassengerId
    1044 to the most common fare among third-class passengers who embarked at
    'S' with no siblings/spouse and no parents/children aboard; one-hot encode
    Sex, Pclass and Embarked; drop Cabin, Name, Ticket and PassengerId; scale
    Age, SibSp and Parch by their maximum; log-transform Fare and scale it by
    its maximum.

    Args:
        data: DataFrame holding the columns PassengerId, Pclass, Name, Sex,
            Age, SibSp, Parch, Ticket, Fare, Cabin and Embarked. Any other
            column (e.g. Survived) is passed through unchanged.

    Returns:
        A new DataFrame with the prepared features. The input frame is not
        modified. All statistics are computed on `data` itself.
    """
    # Work on a copy so the caller's frame stays intact and the cell can be re-run.
    data = data.copy()

    # missing values: Age
    age_missing = data['Age'].isna()
    for sex in ('female', 'male'):
        # Exact comparison on purpose: the substring "male" also occurs in "female",
        # so a substring match would mix both sexes into the male mean.
        is_sex = data['Sex'] == sex
        data.loc[age_missing & is_sex, 'Age'] = data.loc[~age_missing & is_sex, 'Age'].mean()
    data['Age'] = data['Age'].astype(float)

    # missing values: Embarked
    # NOTE(review): presumably 62 and 830 are the rows without Embarked in the Kaggle data - confirm.
    data.loc[data['PassengerId'].isin([62, 830]), 'Embarked'] = 'S'

    # missing values: Fare
    # NOTE(review): presumably 1044 is the single row without Fare in the Kaggle data - confirm.
    similar = (
        (data['Pclass'] == 3)
        & (data['Embarked'] == 'S')
        & (data['SibSp'] == 0)
        & (data['Parch'] == 0)
    )
    fare_modes = data.loc[similar, 'Fare'].mode()
    if not fare_modes.empty:
        # mode() can return several tied values; take the first (smallest).
        data.loc[data['PassengerId'] == 1044, 'Fare'] = fare_modes.iloc[0]
    # Cabin needs no imputation because it is dropped below

    # categorical: an explicit integer dtype keeps the indicators numeric on every pandas version
    data = pd.get_dummies(data, columns=['Sex', 'Pclass', 'Embarked'], dtype='uint8')
    # useless columns
    data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1)
    # numerical: scale by the column maximum
    for column in ('Age', 'SibSp', 'Parch'):
        data[column] = data[column] / data[column].max()
    # Fare is log-transformed before scaling
    data['Fare'] = np.log1p(data['Fare'])
    data['Fare'] = data['Fare'] / data['Fare'].max()
    return data

In [32]:
def prepare_features_medium(data):
    """Build model-ready features with sex-specific age imputation and a family-size feature.

    Steps: fill missing Age with the mean age of passengers of the same sex;
    set Embarked to 'S' for PassengerId 62 and 830; set Fare for PassengerId
    1044 to the most common fare among third-class passengers who embarked at
    'S' with no siblings/spouse and no parents/children aboard; add
    Family = 1 + SibSp + Parch; one-hot encode Sex, Pclass and Embarked; drop
    Cabin, Name, Ticket, PassengerId, SibSp and Parch; scale Age and Family by
    their maximum; log-transform Fare and scale it by its maximum.

    Args:
        data: DataFrame holding the columns PassengerId, Pclass, Name, Sex,
            Age, SibSp, Parch, Ticket, Fare, Cabin and Embarked. Any other
            column (e.g. Survived) is passed through unchanged.

    Returns:
        A new DataFrame with the prepared features. The input frame is not
        modified. All statistics are computed on `data` itself.
    """
    # Work on a copy so the caller's frame stays intact and the cell can be re-run.
    data = data.copy()

    # missing values: Age
    age_missing = data['Age'].isna()
    for sex in ('female', 'male'):
        # Exact comparison on purpose: the substring "male" also occurs in "female",
        # so a substring match would mix both sexes into the male mean.
        is_sex = data['Sex'] == sex
        data.loc[age_missing & is_sex, 'Age'] = data.loc[~age_missing & is_sex, 'Age'].mean()
    data['Age'] = data['Age'].astype(float)

    # missing values: Embarked
    # NOTE(review): presumably 62 and 830 are the rows without Embarked in the Kaggle data - confirm.
    data.loc[data['PassengerId'].isin([62, 830]), 'Embarked'] = 'S'

    # missing values: Fare
    # NOTE(review): presumably 1044 is the single row without Fare in the Kaggle data - confirm.
    similar = (
        (data['Pclass'] == 3)
        & (data['Embarked'] == 'S')
        & (data['SibSp'] == 0)
        & (data['Parch'] == 0)
    )
    fare_modes = data.loc[similar, 'Fare'].mode()
    if not fare_modes.empty:
        # mode() can return several tied values; take the first (smallest).
        data.loc[data['PassengerId'] == 1044, 'Fare'] = fare_modes.iloc[0]
    # Cabin needs no imputation because it is dropped below

    # we combine SibSp and Parch to measure the family size including the passenger
    data['Family'] = 1 + data['SibSp'] + data['Parch']
    # categorical: an explicit integer dtype keeps the indicators numeric on every pandas version
    data = pd.get_dummies(data, columns=['Sex', 'Pclass', 'Embarked'], dtype='uint8')
    # useless columns (SibSp and Parch are now represented by Family)
    data = data.drop(['Cabin', 'Name', 'Ticket', 'PassengerId', 'SibSp', 'Parch'], axis=1)
    # numerical: scale by the column maximum
    for column in ('Age', 'Family'):
        data[column] = data[column] / data[column].max()
    # Fare is log-transformed before scaling
    data['Fare'] = np.log1p(data['Fare'])
    data['Fare'] = data['Fare'] / data['Fare'].max()
    return data

In [33]:
# Prepare train and test together so both parts share the same imputation values and scaling.
# NOTE: df_all is rebound to the processed frame; running this cell a second time without
# reloading the data fails because the raw columns (e.g. 'Sex') no longer exist.
df_all = prepare_features_medium(df_all)

In [34]:
# Labelled (training) rows come first in df_all, the unlabelled test rows follow.
# Derive the split point from the labels instead of hard-coding the row count.
n_train = int(df_all['Survived'].notna().sum())
df_train = df_all.iloc[:n_train]
df_test = df_all.iloc[n_train:]
# Guard the assumption that every labelled row sits before every unlabelled one.
assert df_train['Survived'].notna().all() and df_test['Survived'].isna().all()

df_train_y = df_train['Survived']
df_train_x = df_train.drop(['Survived'], axis=1)

df_train.tail(5)

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
886,0.0,0.3375,0.422864,0.090909,0,1,0,1,0,0,0,1
887,1.0,0.2375,0.550238,0.090909,1,0,1,0,0,0,0,1
888,0.0,0.358589,0.512205,0.363636,1,0,0,0,1,0,0,1
889,1.0,0.325,0.550238,0.090909,0,1,1,0,0,1,0,0
890,0.0,0.4,0.347554,0.090909,0,1,0,0,1,0,1,0


In [35]:
df_test.head()

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,,0.43125,0.348997,0.090909,0,1,0,0,1,0,1,0
1,,0.5875,0.333195,0.181818,1,0,0,0,1,0,0,1
2,,0.775,0.379604,0.090909,0,1,0,1,0,0,1,0
3,,0.3375,0.363449,0.090909,0,1,0,0,1,0,0,1
4,,0.275,0.414494,0.272727,1,0,0,0,1,0,0,1


## Now fitting a deep learning model 

In [36]:
import torch
import torch.nn.functional as F

# Features and labels as float32 tensors for the hand-written training loop.
# No bias column is added: every categorical is fully one-hot encoded (no level dropped),
# so its indicator columns already sum to 1 and can absorb an intercept.
train = torch.tensor(df_train_x.to_numpy(), dtype=torch.float32)
labels = torch.tensor(df_train_y.to_numpy(), dtype=torch.float32)
train.shape



torch.Size([891, 11])

In [37]:
def compute_preds(coeffs, x):
    """Return sigmoid(x @ coeffs): the linear score of each row squashed into (0, 1)."""
    scores = torch.matmul(x, coeffs)
    return scores.sigmoid()

In [38]:
def compute_loss(coeffs, x, y, kind="mae"):
    """Loss between the model predictions for `x` and the targets `y`.

    Args:
        coeffs: weight tensor passed on to compute_preds.
        x: feature tensor passed on to compute_preds.
        y: target tensor, compared element-wise with the predictions.
        kind: which loss to compute. The default "mae" keeps the previous behaviour.
            "mae"   - mean absolute error.
            "mse"   - mean squared error.
            "where" - sum over samples of (1 - pred) where y == 1 and pred elsewhere.

    Returns:
        A scalar tensor that supports .backward().

    Raises:
        ValueError: if `kind` is not one of the names above.
    """
    preds = compute_preds(coeffs, x)
    if kind == "mae":
        return torch.abs(preds - y).mean()
    if kind == "mse":
        return torch.square(preds - y).mean()
    if kind == "where":
        return torch.where(y == 1.0, 1.0 - preds, preds).sum()
    raise ValueError(f"unknown loss kind: {kind!r}")

In [39]:
def upgrade_coeffs(coeffs, rate):
    """Take one gradient-descent step on `coeffs` in place, then clear its gradient.

    The in-place update is not wrapped in torch.no_grad() here; the caller is
    expected to provide that context.
    """
    step = rate * coeffs.grad
    coeffs -= step
    # Reset the gradient so the next backward() call starts from zero.
    coeffs.grad.zero_()

In [40]:
def run_epoch(weights, x, y, rate):
    """Run one full-batch training step and print the loss.

    Computes the loss for `weights` on (x, y), back-propagates it, updates
    `weights` in place with step size `rate`, and prints the loss measured
    before the update, formatted to three decimals and followed by "; ".
    """
    loss = compute_loss(weights, x, y)
    # Populate weights.grad for the update below.
    loss.backward()
    # The in-place weight update must not be tracked by autograd.
    with torch.no_grad():
        upgrade_coeffs(weights, rate)
    print(f"{loss:.3f}", end="; ")

In [41]:
def init_coeffs(n_coeff=12, n_hidden=20):
    """Create a trainable weight vector of length `n_coeff`, uniform in [-0.5, 0.5).

    `n_hidden` is accepted but not used by this function.
    """
    weights = torch.rand(n_coeff) - 0.5
    # requires_grad_() flips the flag in place and returns the same tensor.
    return weights.requires_grad_()

In [42]:
def train_model(epochs=30, lr=0.01, seed=None):
    """Train the single-layer model on the notebook-level `train` / `labels` tensors.

    Args:
        epochs: number of full-batch gradient steps.
        lr: step size passed to each epoch.
        seed: optional integer; when given, torch's random generator is seeded
            before the weights are initialised so a run can be reproduced.
            The default (None) leaves the generator untouched, as before.

    Returns:
        The trained weight tensor, one entry per column of `train`.
    """
    if seed is not None:
        torch.manual_seed(seed)
    # One weight per feature column, read from the data rather than hard-coded,
    # so switching to a feature set with a different column count keeps working.
    weights = init_coeffs(train.shape[1])
    for _ in range(epochs):
        run_epoch(weights, train, labels, lr)
    return weights

In [43]:
def accuracy(coeffs, x, y):
    """Fraction of rows whose thresholded prediction (> 0.5) equals the boolean label."""
    with torch.no_grad():
        predicted_positive = compute_preds(coeffs, x) > 0.5
    hits = predicted_positive.eq(y.bool())
    return hits.float().mean()

In [44]:
# 60 epochs with learning rate 1.5; one loss value is printed per epoch.
# The initial weights are random, so results differ from run to run.
parameters = train_model(60, 1.5)

0.238; 0.226; 0.215; 0.205; 0.197; 0.189; 0.183; 0.178; 0.174; 0.170; 0.167; 0.165; 0.163; 0.161; 0.159; 0.158; 0.157; 0.156; 0.155; 0.154; 0.153; 0.153; 0.152; 0.152; 0.151; 0.151; 0.150; 0.150; 0.150; 0.150; 0.149; 0.149; 0.149; 0.149; 0.148; 0.148; 0.148; 0.148; 0.148; 0.147; 0.147; 0.147; 0.147; 0.147; 0.147; 0.147; 0.147; 0.147; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 0.146; 

In [45]:
# Accuracy on the training data itself (there is no held-out validation split).
accuracy(parameters, train, labels)

tensor(0.7969)

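Training accuracy for different `(epochs, learning rate)` settings (the weights are randomly initialised, so results vary between runs):

* (30, 1.5) gives 0.7868 accuracy
* (50, 1.5) gives 0.7991 accuracy
* (75, 1.5) gives 0.7856 accuracy
* (60, 1.5) gives 0.8126 accuracy

Comparing loss functions:

* (60, 1.5) MAE gives 0.8114 accuracy
* (60, 1.5) MSE gives 0.7969 accuracy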

In [46]:
def show_coeffs():
    """Map each feature column name to its learned coefficient (as a 0-dim tensor)."""
    feature_names = df_train_x.columns
    with torch.no_grad():
        return dict(zip(feature_names, parameters))
show_coeffs()

{'Age': tensor(-0.1446, requires_grad=True),
 'Fare': tensor(0.1016, requires_grad=True),
 'Family': tensor(-0.3368, requires_grad=True),
 'Sex_female': tensor(1.4585, requires_grad=True),
 'Sex_male': tensor(-0.8770, requires_grad=True),
 'Pclass_1': tensor(0.4772, requires_grad=True),
 'Pclass_2': tensor(0.0303, requires_grad=True),
 'Pclass_3': tensor(-0.9942, requires_grad=True),
 'Embarked_C': tensor(0.2309, requires_grad=True),
 'Embarked_Q': tensor(0.0236, requires_grad=True),
 'Embarked_S': tensor(-0.2916, requires_grad=True)}

In [47]:
# Score the test rows: drop the (empty) label column, convert to a float tensor, apply the model.
test_features = df_test.drop(columns=['Survived'])
with torch.no_grad():
    df_test_x = torch.tensor(test_features.values, dtype=torch.float)
    result = compute_preds(parameters, df_test_x)
result

tensor([0.1295, 0.5151, 0.2835, 0.0993, 0.5208, 0.1015, 0.6078, 0.2276, 0.6607,
        0.0962, 0.0987, 0.3205, 0.8339, 0.2210, 0.8271, 0.8415, 0.2942, 0.1578,
        0.5245, 0.6497, 0.4347, 0.0982, 0.8346, 0.4500, 0.8826, 0.0937, 0.8941,
        0.1575, 0.3230, 0.1499, 0.2251, 0.2285, 0.5101, 0.5126, 0.4457, 0.1584,
        0.5315, 0.5352, 0.0997, 0.1014, 0.1508, 0.3270, 0.0969, 0.7605, 0.8273,
        0.0995, 0.4440, 0.1305, 0.8907, 0.5154, 0.3254, 0.3430, 0.7484, 0.8177,
        0.3420, 0.1198, 0.0979, 0.0994, 0.0969, 0.8968, 0.1008, 0.2345, 0.1006,
        0.6113, 0.4367, 0.7622, 0.6131, 0.3218, 0.4501, 0.8089, 0.6104, 0.1001,
        0.5313, 0.4511, 0.8966, 0.4574, 0.0987, 0.8185, 0.2350, 0.6104, 0.1548,
        0.3131, 0.3193, 0.0987, 0.2958, 0.1531, 0.6092, 0.5363, 0.6084, 0.2347,
        0.5289, 0.0987, 0.8231, 0.0987, 0.4524, 0.0995, 0.8201, 0.0988, 0.5353,
        0.0982, 0.8903, 0.2324, 0.1305, 0.0993, 0.6482, 0.1004, 0.1323, 0.1305,
        0.0988, 0.2388, 0.3373, 0.6084, 

In [48]:
# Turn probabilities into hard 0/1 predictions: 0 when the score is at most 0.5, 1 otherwise.
at_most_half = result <= 0.5
results = torch.where(at_most_half, 0.0, 1.0)
results


tensor([0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
        1., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
        1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
        1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
        1., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1.,
        0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0.,
        1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1.,
        0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 1., 0., 1., 0., 1., 0., 

In [49]:
# Wrap the 0/1 predictions in a single-column frame named like the target column.
# NOTE: `results` is rebound here from a tensor to a DataFrame.
results = pd.DataFrame(results)
results.columns = ['Survived']
results.head()

Unnamed: 0,Survived
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0


In [50]:
# fillna with a DataFrame matches on row label and column name, so only the NaN cells of
# 'Survived' are filled, row by row. This relies on df_test and results sharing the same
# 0..n-1 row labels.
df_submit = df_test.fillna(results)
df_submit.head()

Unnamed: 0,Survived,Age,Fare,Family,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0.0,0.43125,0.348997,0.090909,0,1,0,0,1,0,1,0
1,1.0,0.5875,0.333195,0.181818,1,0,0,0,1,0,0,1
2,0.0,0.775,0.379604,0.090909,0,1,0,1,0,0,1,0
3,0.0,0.3375,0.363449,0.090909,0,1,0,0,1,0,0,1
4,1.0,0.275,0.414494,0.272727,1,0,0,0,1,0,0,1


In [51]:
# Keep only the prediction column. Selecting it directly, instead of dropping a hard-coded
# list of feature columns, keeps this cell valid for any feature set and safe to re-run.
df_submit = df_submit[['Survived']].copy()
df_submit['Survived'] = df_submit['Survived'].astype('int')
# PassengerId was dropped during feature preparation, so it is rebuilt from the row label.
# NOTE(review): 892 is assumed to be the first PassengerId of the test set - confirm against test.csv.
df_submit['PassengerId'] = (892 + df_submit.index)
df_submit.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Survived     418 non-null    int64
 1   PassengerId  418 non-null    int64
dtypes: int64(2)
memory usage: 9.8 KB


In [52]:
# Write the submission as comma-separated text with a header row, columns ordered
# PassengerId, Survived, and no index column.
df_submit[['PassengerId', 'Survived']].to_csv('/kaggle/working/20230525_titanic_preds_deep1l_mse.csv', index=False)

In [53]:
!tail /kaggle/working/20230525_titanic_preds_deep1l_mse.csv
!tail /kaggle/input/titanic/gender_submission.csv

1300,1
1301,1
1302,1
1303,1
1304,1
1305,0
1306,1
1307,0
1308,0
1309,0
1300,1
1301,1
1302,1
1303,1
1304,1
1305,0
1306,1
1307,0
1308,0
1309,0


conclusion: **scored 0.77511**, worse than GB and SVC from last year but better than the linear model with PyTorch, and **0.69377** with the `torch.where()` loss function instead of MAE

improving on missing values **scored 0.77751** with MAE

improving on missing values scored **0.77751** with MAE and family data preparation

improving on missing values scored **0.77511** with MSE and family data preparation

conclusion: best loss function so far is MAE, impact from feature engineering is very low