In [61]:
import numpy as np
import pandas as pd
import re

In [85]:
# Read the raw train/test CSVs, using PassengerId as the row index.
train = pd.read_csv('data/raw/train.csv', index_col='PassengerId')
test = pd.read_csv('data/raw/test.csv', index_col='PassengerId')

In [86]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [87]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Imputation

In [88]:
# Count missing values per column in the training frame.
train.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

## Embarked

In [126]:
# Tally embarkation ports among first-class women whose ticket contains "113".
# NOTE(review): presumably this is the profile of the passengers with a
# missing Embarked value, used to choose a fill value — confirm.
is_first_class_woman = (train['Sex'] == 'female') & (train['Pclass'] == 1)
has_113_ticket = train['Ticket'].str.contains('113')
train[is_first_class_woman & has_113_ticket].groupby(['Embarked']).size()

Embarked
C     3
S    12
dtype: int64

In [127]:
# Replace missing embarkation ports in the training frame with 'S'.
train['Embarked'] = train['Embarked'].fillna('S')

PassengerId
62     NaN
830    NaN
Name: Embarked, dtype: object

## Cabin

In [130]:
# Passenger counts for each (Pclass, Sex, Embarked) combination; after
# reset_index() the count ends up in a column named 0.
train.groupby(["Pclass", "Sex", 'Embarked']).size().reset_index()

Unnamed: 0,Pclass,Sex,Embarked,0
0,1,female,C,43
1,1,female,Q,1
2,1,female,S,48
3,1,male,C,42
4,1,male,Q,1
5,1,male,S,79
6,2,female,C,7
7,2,female,Q,2
8,2,female,S,67
9,2,male,C,10


In [128]:
# Show the training rows that have no Cabin recorded.
train[train['Cabin'].isna()]

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...
885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


# Title

In [73]:
def get_title(name):
    """Return the honorific found in a passenger name.

    Parameters
    ----------
    name : str
        Passenger name, e.g. ``"Braund, Mr. Owen Harris"``.

    Returns
    -------
    str
        The first of ``'Miss'``, ``'Mr'``, ``'Mrs'`` or ``'Ms'`` that appears
        in ``name`` as a whole word, or ``'Other'`` when none is present or
        ``name`` is not a string (for example a missing value).
    """
    # Non-string input (NaN/None) cannot carry a title; handle it explicitly
    # instead of relying on a bare ``except`` that would also hide real errors.
    if not isinstance(name, str):
        return 'Other'
    # Word boundaries on both sides keep the pattern from matching inside a
    # longer token; the trailing one also makes "Mrs." resolve to 'Mrs'
    # rather than 'Mr'.
    match = re.search(r'\b(Miss|Mr|Mrs|Ms)\b', name)
    return match.group(1) if match else 'Other'

In [74]:
# Derive a Title column for both splits from the passenger names.
train['Title'] = train['Name'].map(get_title)
test['Title'] = test['Name'].map(get_title)

In [75]:
# One-hot encode Title, with 'Other' as the dropped baseline level.
# errors='ignore' keeps the cell working when a split contains no 'Other'.
train_title = (
    pd.get_dummies(train['Title'])
    .drop(columns=['Other'], errors='ignore')
    .astype(int)
    .add_prefix("Title_")
)
# Reindex the test dummies onto the training columns so both splits share the
# same schema: levels absent from test become all-zero columns, and levels
# unseen in training (including 'Other') are discarded.
test_title = (
    pd.get_dummies(test['Title'])
    .astype(int)
    .add_prefix("Title_")
    .reindex(columns=train_title.columns, fill_value=0)
)

# Sex

In [76]:
# Encode Sex as 1 for 'male' and 0 for every other value.
train['Sex'] = (train['Sex'] == 'male').astype('int64')
test['Sex'] = (test['Sex'] == 'male').astype('int64')

# Embarked

In [77]:
# One-hot encode Embarked, with 'S' as the dropped baseline level.
# get_dummies keeps one row per passenger even when Embarked is missing
# (such a row is all zeros, i.e. the 'S' baseline). The previous
# groupby/unstack dropped those passengers, and the later inner merge then
# removed them from the data set.
train_embarked = (
    pd.get_dummies(train['Embarked'])
    .drop(columns=['S'], errors='ignore')
    .astype(int)
    .add_prefix("Embarked_")
)
# Reindex onto the training columns so both splits share the same schema.
test_embarked = (
    pd.get_dummies(test['Embarked'])
    .astype(int)
    .add_prefix("Embarked_")
    .reindex(columns=train_embarked.columns, fill_value=0)
)

# Cabin

In [78]:
# Reduce Cabin to its lower-cased first character; a missing cabin is
# stringified first, so it becomes 'n'.
train['cabin'] = train['Cabin'].map(lambda value: str(value)[0].lower())
# One indicator column per cabin letter, without the 't' column.
cabin_counts = train.groupby(['PassengerId', 'cabin']).size()
train_cabin = cabin_counts.unstack(fill_value=0).drop(columns=['t']).add_prefix("Cabin_")

In [79]:
# Reduce Cabin to its lower-cased first character; a missing cabin is
# stringified first, so it becomes 'n'.
test['cabin'] = [str(x)[0].lower() for x in test.Cabin]
# Build the indicator columns, then reindex onto the training cabin columns.
# The training table drops 't' explicitly while this one did not, so any
# letter present in only one split would have produced mismatched feature
# columns. Reindexing drops unseen letters and zero-fills absent ones.
test_cabin = (
    test.groupby(['PassengerId', 'cabin']).size().unstack(fill_value = 0)
    .add_prefix("Cabin_")
    .reindex(columns = train_cabin.columns, fill_value = 0)
)

# Combine & Drop

In [80]:
# Replace the raw text/categorical columns with their one-hot tables.
# Each merge is an inner join on PassengerId, so a passenger missing from any
# of the tables is dropped from the result.
encoded_away = ['Embarked', 'Name', 'Ticket', 'Cabin', 'cabin', 'Title']
train = train.drop(columns=encoded_away)
for dummies in (train_embarked, train_cabin, train_title):
    train = train.merge(dummies, left_index=True, right_index=True)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Cabin_a,...,Cabin_c,Cabin_d,Cabin_e,Cabin_f,Cabin_g,Cabin_n,Title_Miss,Title_Mr,Title_Mrs,Title_Ms
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,3,1,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,1,1,0,38.0,1,0,71.2833,1,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1,3,0,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,1,1,0,35.0,1,0,53.1,0,0,0,...,1,0,0,0,0,0,0,0,1,0
5,0,3,1,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,1,0,1,0,0


In [81]:
# (rows, columns) of the combined training frame.
train.shape

(889, 21)

In [82]:
# Replace the raw text/categorical columns with their one-hot tables.
# Each merge is an inner join on PassengerId, so a passenger missing from any
# of the tables is dropped from the result.
encoded_away = ['Embarked', 'Name', 'Ticket', 'Cabin', 'cabin', 'Title']
test = test.drop(columns=encoded_away)
for dummies in (test_embarked, test_cabin, test_title):
    test = test.merge(dummies, left_index=True, right_index=True)
test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Cabin_a,Cabin_b,Cabin_c,Cabin_d,Cabin_e,Cabin_f,Cabin_g,Cabin_n,Title_Miss,Title_Mr,Title_Mrs,Title_Ms
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
892,3,1,34.5,0,0,7.8292,0,1,0,0,0,0,0,0,0,1,0,1,0,0
893,3,0,47.0,1,0,7.0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
894,2,1,62.0,0,0,9.6875,0,1,0,0,0,0,0,0,0,1,0,1,0,0
895,3,1,27.0,0,0,8.6625,0,0,0,0,0,0,0,0,0,1,0,1,0,0
896,3,0,22.0,1,1,12.2875,0,0,0,0,0,0,0,0,0,1,0,0,1,0


In [83]:
# (rows, columns) of the combined test frame.
test.shape

(418, 20)

# Imputation (KNN)

In [87]:
from sklearn.impute import KNNImputer

In [88]:
# Missing values per column in the training frame before KNN imputation.
train.isna().sum()

Survived        0
Pclass          0
Sex             0
Age           177
SibSp           0
Parch           0
Fare            0
Embarked_C      0
Embarked_Q      0
Cabin_a         0
Cabin_b         0
Cabin_c         0
Cabin_d         0
Cabin_e         0
Cabin_f         0
Cabin_g         0
Cabin_n         0
dtype: int64

In [89]:
# Missing values per column in the test frame before KNN imputation.
test.isna().sum()

Pclass         0
Sex            0
Age           86
SibSp          0
Parch          0
Fare           1
Embarked_C     0
Embarked_Q     0
Cabin_a        0
Cabin_b        0
Cabin_c        0
Cabin_d        0
Cabin_e        0
Cabin_f        0
Cabin_g        0
Cabin_n        0
dtype: int64

In [90]:
# Fit the KNN imputer on the training features only. The target is excluded
# by name rather than by position, so the fit does not depend on 'Survived'
# being the first column.
# NOTE(review): features are not scaled before fitting, so wide-range columns
# such as Fare may dominate the neighbour distance — confirm this is intended.
imputer = KNNImputer(n_neighbors = 5, add_indicator = False).fit(train.drop(columns = 'Survived'))
imputer

KNNImputer(add_indicator=False, copy=True, metric='nan_euclidean',
           missing_values=nan, n_neighbors=5, weights='uniform')

In [91]:
# Impute the training features in place. Feature columns are selected by name
# (everything except the target) rather than by position, so the result does
# not depend on 'Survived' being the first column.
feature_cols = [col for col in train.columns if col != 'Survived']
train[feature_cols] = imputer.transform(train[feature_cols])
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Cabin_a,Cabin_b,Cabin_c,Cabin_d,Cabin_e,Cabin_f,Cabin_g,Cabin_n
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3.0,1.0,22.0,1.0,0.0,7.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1.0,0.0,38.0,1.0,0.0,71.2833,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1,3.0,0.0,26.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1.0,0.0,35.0,1.0,0.0,53.1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0,3.0,1.0,35.0,0.0,0.0,8.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [92]:
# Missing values per column in the training frame after imputation.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Cabin_a       0
Cabin_b       0
Cabin_c       0
Cabin_d       0
Cabin_e       0
Cabin_f       0
Cabin_g       0
Cabin_n       0
dtype: int64

In [93]:
# Impute the test features with the imputer fitted on the training data.
# The imputer matches columns by position, so the test columns are first
# selected by name in the training feature order. A missing feature raises
# KeyError here instead of silently imputing against the wrong column.
feature_cols = [col for col in train.columns if col != 'Survived']
test = pd.DataFrame(
    imputer.transform(test[feature_cols]),
    index = test.index,
    columns = feature_cols,
)
test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Cabin_a,Cabin_b,Cabin_c,Cabin_d,Cabin_e,Cabin_f,Cabin_g,Cabin_n
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892,3.0,1.0,34.5,0.0,0.0,7.8292,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
893,3.0,0.0,47.0,1.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
894,2.0,1.0,62.0,0.0,0.0,9.6875,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
895,3.0,1.0,27.0,0.0,0.0,8.6625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
896,3.0,0.0,22.0,1.0,1.0,12.2875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [94]:
# Missing values per column in the test frame after imputation.
test.isna().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Embarked_C    0
Embarked_Q    0
Cabin_a       0
Cabin_b       0
Cabin_c       0
Cabin_d       0
Cabin_e       0
Cabin_f       0
Cabin_g       0
Cabin_n       0
dtype: int64

# Write

In [96]:
# Persist the cleaned splits; the PassengerId index is written as the first column.
for frame, destination in ((train, "data/clean/train.csv"), (test, "data/clean/test.csv")):
    frame.to_csv(destination)