In [1]:
# Import libraries and the data set
import pandas as pd
import numpy as np
import sidetable
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
import DataScience 

# Read the training and the test CSV files, using PassengerId as the index of each frame.
df, X_test = (
    pd.read_csv(csv_path, index_col=['PassengerId'])
    for csv_path in ('train.csv', 'test.csv')
)

In [2]:
# Preview the first five rows of the training data
df.head(n=5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Columns __Name__ and __Ticket__ have nothing to do with predicting __Survived__, so they will be dropped.

In [3]:
# Remove the 'Name' and 'Ticket' columns from both the training and the test frame.
df = df.drop(['Name', 'Ticket'], axis='columns')
X_test = X_test.drop(['Name', 'Ticket'], axis='columns')

In [4]:
# Summarise the missing values of the training data per column via the sidetable
# accessor (.stb); the rendered table lists the missing count, total rows and percentage.
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,687,891,77.10%
Age,177,891,19.87%
Embarked,2,891,0.22%
Survived,0,891,0.00%
Pclass,0,891,0.00%
Sex,0,891,0.00%
SibSp,0,891,0.00%
Parch,0,891,0.00%
Fare,0,891,0.00%


As column __Cabin__ has 77% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>As column __Embarked__ has only 2 missing values, those 2 rows will be dropped.

In [5]:
# Drop the mostly-empty 'Cabin' column, then drop the rows whose 'Embarked' value is missing.
# 'Fare' is NOT dropped here. `columns=` already selects the axis, so no separate
# `axis` argument is passed.
df = df.drop(columns=['Cabin']).dropna(subset=['Embarked'])

In [6]:
# Summarise the missing values of the test data per column (count, total rows, percentage).
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Cabin,327,418,78.23%
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


As column __Cabin__ has 78% missing values, the column will be dropped.
<br>Column __Age__ can be imputed.
<br>Column __Fare__ has only 1 missing value; that row is kept rather than dropped, since XGBoost can handle missing values.

In [7]:
# Drop the mostly-empty 'Cabin' column from the test set.
# The single row with a missing 'Fare' is kept: no dropna is applied to the test set,
# so it still contains all 418 rows.
X_test = X_test.drop(columns=['Cabin'])

In [8]:
# Recheck the missing values of the training data after dropping the 'Cabin' column
# and the rows without an 'Embarked' value.
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,177,889,19.91%
Survived,0,889,0.00%
Pclass,0,889,0.00%
Sex,0,889,0.00%
SibSp,0,889,0.00%
Parch,0,889,0.00%
Fare,0,889,0.00%
Embarked,0,889,0.00%


In [9]:
# Recheck the missing values of the test data after dropping the 'Cabin' column.
X_test.stb.missing(style=True)

Unnamed: 0,missing,total,percent
Age,86,418,20.57%
Fare,1,418,0.24%
Pclass,0,418,0.00%
Sex,0,418,0.00%
SibSp,0,418,0.00%
Parch,0,418,0.00%
Embarked,0,418,0.00%


In [10]:
# Separate the features (X_train) from the target column (y_train).
# Work on a copy so that df itself stays untouched.
X_train = df.copy()
y_train = X_train.pop('Survived')

In [11]:
# Retrieve num_cols and cat_cols.
# select_dtypes is the public API for picking numeric (and boolean) columns, replacing the
# private _get_numeric_data. cat_cols is built with a list comprehension so that it follows
# the DataFrame's column order; a set difference has no stable order for strings, which
# made the order of the encoded columns vary from one kernel session to the next.
num_cols = list(X_train.select_dtypes(include=['number', 'bool']).columns)
cat_cols = [col for col in X_train.columns if col not in num_cols]

In [12]:
# Show which columns were classified as numeric and which as categorical.
print(f"{num_cols} {cat_cols}")

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'] ['Embarked', 'Sex']


In [13]:
# Encode cat_cols (one-hot, first level dropped).
# Both frames are first cast to a categorical dtype whose levels come from the training
# data. Encoding each frame from its own observed values would produce mismatched dummy
# columns (and a different dropped baseline level) whenever a level is absent from one frame.
for col in cat_cols:
    train_levels = pd.CategoricalDtype(categories=sorted(X_train[col].dropna().unique()))
    X_train[col] = X_train[col].astype(train_levels)
    # Levels never seen in training become NaN here and therefore encode as all-zero dummies.
    X_test[col] = X_test[col].astype(train_levels)
X_train = pd.get_dummies(data=X_train, columns=cat_cols, drop_first=True)
X_test = pd.get_dummies(data=X_test, columns=cat_cols, drop_first=True)

### As XGBoost can handle missing and unscaled data, I only have to encode categorical data

In [14]:
# Train on (X_train, y_train) through the project-local DataScience helper; the result is
# presumably one prediction per X_test row — verify against the helper's source.
# NOTE(review): the helper's internals are not visible here. Judging by the log below it runs
# a 5-fold search over 12960 parameter candidates (64800 fits), so this cell takes many minutes.
# NOTE(review): 'scoring' shows up among the reported best parameters, which suggests it was
# placed in the parameter grid rather than passed to the search itself — confirm in the helper.
predictions = DataScience.xgb_classifier(X_train, y_train, X_test)

Fitting 5 folds for each of 12960 candidates, totalling 64800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done 1668 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 3068 tasks      | elapsed:   48.2s
[Parallel(n_jobs=-1)]: Done 4868 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 7068 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 9668 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 12668 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 16068 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 19868 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 24068 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 28668 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 33668 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 39068 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done 44868 tas

{'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 180, 'scoring': 'roc_auc', 'subsample': 1.0}


In [15]:
# Get the index of the predicted table, so every prediction is labelled with its PassengerId
index = X_test.index
predictions = pd.DataFrame(predictions, columns=['Survived'], index=index)

# Save the DataFrame as a CSV file, writing the PassengerId index as the first column.
# to_csv returns None when it is given a path, so its result is not assigned to a variable.
predictions.to_csv('Predictions.csv', index=True)

In [16]:
# Check the layout of the saved predictions by printing the first five rows
print(predictions.head(n=5))

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 0
