In [1]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
import DataScience 

df = pd.read_csv('train.csv', index_col=['PassengerId'])
X_test = pd.read_csv('test.csv', index_col=['PassengerId'])

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.Cabin.unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

Column __Name__ and __Ticket__ has nothing to do with predicting __Survived__, it will be dropped

In [4]:
df.drop(columns=['Name'], inplace=True)
X_test.drop(columns=['Name'], inplace=True)

In [5]:
# See the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,687,891,77.10%
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Ticket,0,891,0.00%
Fare,0,891,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Embarked__ has 2 missing rows, the rows will be dropped.

In [6]:
# As Pclass includes fare, I will drop 'Fare'
df.drop(columns=['Ticket', 'Cabin'], inplace=True, axis=1)

In [7]:
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,327,418,78.23%
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Ticket,0,418,0.00%
Embarked,0,418,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Fare__ has 2 missing rows, the rows will be dropped.

In [8]:
# As column Cabin is missing a lot, and column Ticket has a lot of unique values, which is hard to encode, I will drop them
X_test.drop(columns=['Ticket', 'Cabin'], axis=1, inplace=True)

In [9]:
# Recheck the missing data in columns
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Fare,0,891,0.00%


In [10]:
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


In [11]:
# Get X_train and y_train
y_train = df.Survived.copy()
X_train = df.drop(columns='Survived').copy()

In [12]:
# Retrive num_cols and cat_cols
num_cols = list(X_train._get_numeric_data().columns)
cat_cols = list(set(X_train.columns) - set(num_cols))

In [13]:
# Encode cat_cols
X_train = pd.get_dummies(data=X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=cat_cols, drop_first=True)

In [14]:
print(num_cols, cat_cols)

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] ['Embarked', 'Sex']


### As no categorical columns have missing data, there is no need to impute them

In [15]:
DataScience.test_imputations(X_train, y_train, num_cols)

Imputing row 1/891 with 0 missing, elapsed time: 0.123
Imputing row 101/891 with 0 missing, elapsed time: 0.124
Imputing row 201/891 with 0 missing, elapsed time: 0.125
Imputing row 301/891 with 1 missing, elapsed time: 0.126
Imputing row 401/891 with 0 missing, elapsed time: 0.127
Imputing row 501/891 with 0 missing, elapsed time: 0.128
Imputing row 601/891 with 0 missing, elapsed time: 0.130
Imputing row 701/891 with 0 missing, elapsed time: 0.130
Imputing row 801/891 with 0 missing, elapsed time: 0.131
The best imputation:  ('Iterative', 0.1656569845878877)


In [16]:
# Impute numeric data
from sklearn.impute import IterativeImputer
num_imputer = IterativeImputer()
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = num_imputer.fit_transform(X_test[num_cols])

In [17]:
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3.0,22.0,1.0,0.0,7.25,0,1,1
2,1.0,38.0,1.0,0.0,71.2833,0,0,0
3,3.0,26.0,0.0,0.0,7.925,0,1,0
4,1.0,35.0,1.0,0.0,53.1,0,1,0
5,3.0,35.0,0.0,0.0,8.05,0,1,1


In [18]:
# Scaling data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train[num_cols])
X_scaled = pd.DataFrame(data=X_scaled, columns=num_cols, index=X_train.index)

In [19]:
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3.0,22.0,1.0,0.0,7.25,0,1,1
2,1.0,38.0,1.0,0.0,71.2833,0,0,0
3,3.0,26.0,0.0,0.0,7.925,0,1,0
4,1.0,35.0,1.0,0.0,53.1,0,1,0
5,3.0,35.0,0.0,0.0,8.05,0,1,1


In [20]:
# Update cat_cols
cat_cols = set(X_train.columns) - set(num_cols)

In [21]:
X_train[cat_cols]

Unnamed: 0_level_0,Embarked_S,Embarked_Q,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,0,1
2,0,0,0
3,1,0,0
4,1,0,0
5,1,0,1
...,...,...,...
887,1,0,1
888,1,0,0
889,1,0,0
890,0,0,1


In [22]:
X_scaled

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.827377,-0.536193,0.432793,-0.473674,-0.502445
2,-1.566107,0.640250,0.432793,-0.473674,0.786845
3,0.827377,-0.242082,-0.474545,-0.473674,-0.488854
4,-1.566107,0.419667,0.432793,-0.473674,0.420730
5,0.827377,0.419667,-0.474545,-0.473674,-0.486337
...,...,...,...,...,...
887,-0.369365,-0.168554,-0.474545,-0.473674,-0.386671
888,-1.566107,-0.756776,-0.474545,-0.473674,-0.044381
889,0.827377,-0.599967,0.432793,2.008933,-0.176263
890,-1.566107,-0.242082,-0.474545,-0.473674,-0.044381


In [24]:
X_train = pd.concat([X_scaled, X_train[cat_cols]], axis=1)

In [25]:
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Embarked_S,Embarked_Q,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.827377,-0.536193,0.432793,-0.473674,-0.502445,1,0,1
2,-1.566107,0.64025,0.432793,-0.473674,0.786845,0,0,0
3,0.827377,-0.242082,-0.474545,-0.473674,-0.488854,1,0,0
4,-1.566107,0.419667,0.432793,-0.473674,0.42073,1,0,0
5,0.827377,0.419667,-0.474545,-0.473674,-0.486337,1,0,1


### As XGBoost can handle missing and unscaled data, I only have to encode categorical data

In [26]:
predictions = DataScience.xgb_classifier(X_train, y_train, X_test)

Fitting 50 folds for each of 96 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.9s


KeyboardInterrupt: 

In [None]:
# Get the index of the predicted table
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Survived'], index=index)

# saving the DataFrame as a CSV file 
csv_data = predictions.to_csv('Predictions.csv', index = True) 

In [None]:
# Verify the prediction's format
print(predictions.head())