In [77]:
# Import
import pandas as pd
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from lib.Utils.dataframe_computation import *

In [51]:
# Read train data
data = pd.read_csv("data/train.csv")

In [52]:
# Head
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
# Summary
get_my_info(data)

Unnamed: 0,Type,Count,Unique,Count_nan,Top,Freq,Mean,Median,Std,Min,Max
PassengerId,int64,891,891,0,891,1,446.0,446.0,257.354,1.0,891.0
Survived,int64,891,2,0,0,549,0.383838,0.0,0.486592,0.0,1.0
Pclass,int64,891,3,0,3,491,2.30864,3.0,0.836071,1.0,3.0
Name,object,891,891,0,"Dahlberg, Miss. Gerda Ulrika",1,,,,,
Sex,object,891,2,0,male,577,,,,,
Age,float64,891,88,177,24,30,29.6991,28.0,14.5265,0.42,80.0
SibSp,int64,891,7,0,0,608,0.523008,0.0,1.10274,0.0,8.0
Parch,int64,891,7,0,0,678,0.381594,0.0,0.806057,0.0,6.0
Ticket,object,891,681,0,CA. 2343,7,,,,,
Fare,float64,891,248,0,8.05,43,32.2042,14.4542,49.6934,0.0,512.329


In [98]:
# Delete irrelevant columns
train = data.drop(['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=False)

In [99]:
# Imputing Missing Values
train['Age'].fillna(train['Age'].mean(),inplace=True)
train['Embarked'].fillna(train['Embarked'].value_counts().index[0], inplace=True)

In [100]:
# Getting Dummy Variables and Dropping the Original Categorical Variables
dummy_classes = ['Sex','Embarked']
categorical_variables = train[dummy_classes]
dummies = pd.get_dummies(categorical_variables, drop_first=True)
train = train.drop(dummy_classes, axis=1)
train = pd.concat([train, dummies], axis=1)

In [101]:
get_my_info(train)

Unnamed: 0,Type,Count,Unique,Count_nan,Top,Freq,Mean,Median,Std,Min,Max
Pclass,int64,891,3,0,3.0,491,2.308642,3.0,0.836071,1.0,3.0
Age,float64,891,89,0,29.699118,177,29.699118,29.699118,13.002015,0.42,80.0
SibSp,int64,891,7,0,0.0,608,0.523008,0.0,1.102743,0.0,8.0
Parch,int64,891,7,0,0.0,678,0.381594,0.0,0.806057,0.0,6.0
Fare,float64,891,248,0,8.05,43,32.204208,14.4542,49.693429,0.0,512.3292
Sex_male,uint8,891,2,0,1.0,577,0.647587,1.0,0.47799,0.0,1.0
Embarked_Q,uint8,891,2,0,0.0,814,0.08642,0.0,0.281141,0.0,1.0
Embarked_S,uint8,891,2,0,1.0,646,0.725028,1.0,0.446751,0.0,1.0


In [103]:
# Standardization
sc = StandardScaler()
train = sc.fit_transform(train)

In [104]:
# Splitting data
y = data['Survived']
X = train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [105]:
# Train model
model = tree.DecisionTreeClassifier() # Decision tree
model = GradientBoostingClassifier(learning_rate=0.1,max_depth=3) # Gradient boosting
model = model.fit(X_train, y_train)

In [106]:
# Prediction
prediction = model.predict(X_test)

In [107]:
# Accuracy
accuracy_score(y_test, prediction)

0.8470149253731343

In [108]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       158
           1       0.93      0.68      0.79       110

    accuracy                           0.85       268
   macro avg       0.87      0.82      0.83       268
weighted avg       0.86      0.85      0.84       268



In [111]:
# Apply for test data
data_test = pd.read_csv("data/test.csv")
test = data_test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=False)
test['Age'].fillna(test['Age'].mean(),inplace=True)
test['Fare'].fillna(test['Fare'].mean(),inplace=True)
categorical_variables = test[dummy_classes]
dummies = pd.get_dummies(categorical_variables, drop_first=True)
test = test.drop(dummy_classes, axis=1)
test = pd.concat([test, dummies], axis=1)

In [112]:
get_my_info(test)

Unnamed: 0,Type,Count,Unique,Count_nan,Top,Freq,Mean,Median,Std,Min,Max
Pclass,int64,418,3,0,3.0,218,2.26555,3.0,0.841838,1.0,3.0
Age,float64,418,80,0,30.27259,86,30.27259,30.27259,12.634534,0.17,76.0
SibSp,int64,418,7,0,0.0,283,0.447368,0.0,0.89676,0.0,8.0
Parch,int64,418,8,0,0.0,324,0.392344,0.0,0.981429,0.0,9.0
Fare,float64,418,170,0,7.75,21,35.627188,14.4542,55.8405,0.0,512.3292
Sex_male,uint8,418,2,0,1.0,266,0.636364,1.0,0.481622,0.0,1.0
Embarked_Q,uint8,418,2,0,0.0,372,0.110048,0.0,0.313324,0.0,1.0
Embarked_S,uint8,418,2,0,1.0,270,0.645933,1.0,0.478803,0.0,1.0


In [113]:
test = sc.fit_transform(test)

In [114]:
# Prediction
prediction = model.predict(test)

In [115]:
# Save results
ids = data_test[['PassengerId']] # create a sub-dataset for submission file and saving it
results = ids.assign(Survived=prediction) # assign predictions to ids
results.to_csv("titanic-results.csv", index=False) # write the final dataset to a csv file.

In [116]:
# Using standardization
sc = StandardScaler()
test = sc.fit_transform(test)