In [145]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import tree, metrics
from sklearn.model_selection import cross_val_score

In [56]:
# Load the labelled training set (path is relative to the notebook's directory).
train_csv_path = "../data/train.csv"
train_df = pd.read_csv(filepath_or_buffer=train_csv_path)

1- Cleaning the data

In [57]:
# Drop the identifier-like columns, which are assumed to have no influence
# on the survival rate.
cleaned_train_df = train_df.drop(['PassengerId', 'Name', 'Ticket'], axis='columns')

In [58]:
# Count the missing values per remaining column.
cleaned_train_df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [59]:
# 'Cabin' and 'Age' have too many missing values to be usable as features,
# so both are dropped in one step.
cleaned_train_df2 = cleaned_train_df.drop(columns=['Cabin', 'Age'])

In [97]:
# Every column removed from the training frame above, kept in one list so the
# same columns can be dropped from the test frame.
columns_to_drop = [
    'PassengerId',
    'Name',
    'Ticket',
    'Cabin',
    'Age',
]

2- Turning qualitative features into quantitative ones

In [68]:
def sex_enum(sex: str):
    """Encode a passenger's sex as a number.

    Parameters
    ----------
    sex : str
        Raw value of the 'Sex' column.

    Returns
    -------
    int or float
        0 for 'male', 1 for 'female', and NaN for any other value
        (including a missing value).
    """
    if sex == 'male':
        return 0
    if sex == 'female':
        return 1
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
    return np.nan
    
def port_enum(port: str):
    """Encode the port of embarkation as a number.

    Parameters
    ----------
    port : str
        Raw value of the 'Embarked' column.

    Returns
    -------
    int or float
        0 for 'C', 1 for 'Q', 2 for 'S', and NaN for any other value
        (including a missing value).
    """
    if port == 'C':
        return 0
    if port == "Q":
        return 1
    if port == "S":
        return 2
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
    return np.nan

In [72]:
# Work on a copy so the cleaned frame stays untouched, then replace the two
# string-valued columns by their numeric codes.
quantitative_train_df = deepcopy(cleaned_train_df2)
quantitative_train_df['Sex'] = quantitative_train_df['Sex'].apply(sex_enum)
quantitative_train_df['Embarked'] = quantitative_train_df['Embarked'].apply(port_enum)

In [98]:
# (column name, encoder) pairs, recorded so the test frame can be encoded the
# same way as the training frame.
quantitative_transformations = [
    ('Sex', sex_enum),
    ('Embarked', port_enum),
]

3- Removing the data with NaN values

In [85]:
# Discard every row that still contains at least one missing value.
quantitative_train_df = quantitative_train_df.dropna(axis=0, how='any')

4- Preparing the model inputs

In [87]:
# The first column is 'Survived' (the target); every other column is a feature.
input_labels = quantitative_train_df.columns[1:]
output_label = ['Survived']
# `DataFrame.as_matrix` was removed in pandas 1.0; selecting the columns and
# taking `.values` yields the same arrays (the targets keep their (n, 1) shape).
train_inputs = quantitative_train_df[input_labels].values
train_outputs = quantitative_train_df[output_label].values

5- Building the model

In [147]:
# A fixed seed makes the fitted tree, and therefore the cross-validation scores
# and the submission, reproducible across kernel restarts. The value 0 is arbitrary.
clf = tree.DecisionTreeClassifier(random_state=0)

6- Training the model

In [148]:
# Hold-out split: the first three quarters of the rows are for fitting, the
# remaining quarter for validation. The training slice stops at
# `limit_validation` (exclusive) so that no row appears in both partitions.
limit_validation = len(train_inputs) * 3 // 4
model_input = train_inputs[:limit_validation]
exp_model_outputs = train_outputs[:limit_validation]
validation_inputs = train_inputs[limit_validation:]
exp_validation_outputs = train_outputs[limit_validation:]
# The hold-out fit is disabled; the model is scored with cross-validation instead.
# clf = clf.fit(model_input, exp_model_outputs)

In [149]:
# clf.score(model_input, exp_model_outputs)

In [153]:
# 10-fold cross-validated ROC AUC of the classifier on the training data.
scores = cross_val_score(
    estimator=clf,
    X=train_inputs,
    y=train_outputs,
    scoring='roc_auc',
    cv=10,
)
scores

array([0.69491979, 0.6986631 , 0.66604278, 0.8026738 , 0.89786096,
       0.80748663, 0.77673797, 0.81550802, 0.82566845, 0.76034858])

7- Testing the model

In [108]:
# Load the unlabelled test set (path is relative to the notebook's directory).
test_csv_path = "../data/test.csv"
test_df = pd.read_csv(filepath_or_buffer=test_csv_path)

In [109]:
# Remove the same columns that were removed from the training frame.
cleaned_test_df = test_df.drop(columns_to_drop, axis='columns')

In [110]:
# Encode the test frame with the recorded (column, encoder) pairs, working on
# a copy so the cleaned frame stays untouched.
quantitative_test_df = deepcopy(cleaned_test_df)
for column, encoder in quantitative_transformations:
    quantitative_test_df[column] = quantitative_test_df[column].apply(encoder)

In [112]:
# Count the missing values per column of the encoded test frame.
quantitative_test_df.isna().sum()

Pclass      0
Sex         0
SibSp       0
Parch       0
Fare        1
Embarked    0
dtype: int64

In [114]:
def nan_to_mean(el, mean):
    """Return `mean` when `el` is NaN, otherwise return `el` unchanged."""
    return mean if np.isnan(el) else el
    
# Replace the missing 'Fare' value(s) with the mean fare of the test frame.
mean_fare = np.mean(quantitative_test_df['Fare'])
quantitative_test_df['Fare'] = quantitative_test_df['Fare'].apply(nan_to_mean, args=(mean_fare,))

In [115]:
# `DataFrame.as_matrix` was removed in pandas 1.0; selecting the feature
# columns and taking `.values` yields the same array, in the training column order.
test_inputs = quantitative_test_df[input_labels].values

In [117]:
# `cross_val_score` only fits clones of `clf`, and the earlier hold-out fit is
# commented out, so on a fresh kernel `clf` is still unfitted at this point.
# Fit it on the full training set before predicting.
clf = clf.fit(train_inputs, train_outputs.ravel())
test_predictions = clf.predict(test_inputs)

8- Creating submission file

In [130]:
# Pair each prediction with the passenger id read from the test file itself,
# rather than rebuilding the ids from a hard-coded starting number. The
# predictions are in the same row order as `test_df`.
submission_df = pd.DataFrame({
    'Survived': test_predictions,
    'PassengerId': test_df['PassengerId'].values,
})
submission_df

Unnamed: 0,Survived,PassengerId
0,0,892
1,1,893
2,0,894
3,0,895
4,1,896
5,0,897
6,0,898
7,0,899
8,1,900
9,0,901


In [135]:
# Write the submission file with the two columns in this order: PassengerId, Survived.
submission_df[['PassengerId', 'Survived']].to_csv("../data/submission.csv", index=False)