In [1]:
from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training set from the working directory and preview its first five rows.
titanic_data = pd.read_csv(filepath_or_buffer="train.csv")
titanic_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count missing values in every column of the training frame.
titanic_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Target: the Survived column. Features: everything except the target and the
# identifier / free-text columns dropped below.
y = titanic_data["Survived"]
X = titanic_data.drop(
    columns=["PassengerId", "Survived", "Name", "Ticket", "Cabin"],
)

In [5]:
# One-hot encode the non-numeric columns, then replace missing ages with the
# median age of the (not yet split) training features.
X = pd.get_dummies(X)
X = X.assign(Age=X["Age"].fillna(X["Age"].median()))

In [6]:
# Baseline decision tree that splits on information gain (entropy criterion).
# NOTE(review): `clf` is rebound to a fresh classifier further down, before the grid search.
clf = tree.DecisionTreeClassifier(criterion='entropy')

In [7]:
# Fit the baseline tree on the complete feature matrix (no hold-out at this point).
clf.fit(X=X, y=y)

In [8]:
from sklearn.model_selection import train_test_split, cross_val_score

In [9]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Base estimator for the grid search below.
# random_state is pinned (same seed as the train/test split) because a decision
# tree permutes the features at every split: when several splits are equally
# good, the chosen one can differ between runs, which would make the selected
# model and the reported scores non-reproducible.
clf = tree.DecisionTreeClassifier(random_state=42)

In [12]:
# Hyper-parameter grid for the decision tree: only the split criterion varies,
# the depth and sample-count limits are pinned to a single value each.
params = dict(
    criterion=['entropy', 'gini'],
    max_depth=[7],
    min_samples_split=[3],
    min_samples_leaf=[2],
)

In [13]:
# 5-fold cross-validated search over the decision-tree grid defined above.
grid_search_cv_clf = GridSearchCV(estimator=clf, param_grid=params, cv=5)

In [14]:
# Run the search on the training portion only; the held-out rows stay unseen.
grid_search_cv_clf.fit(X=X_train, y=y_train)

In [15]:
# Keep a handle on the estimator the grid search selected.
best_clf = grid_search_cv_clf.best_estimator_

In [16]:
# Score of the selected tree on the held-out rows.
best_clf.score(X=X_test, y=y_test)

0.7864406779661017

In [39]:
from sklearn.metrics import precision_score, recall_score

In [40]:
# Macro-averaged precision of the tree's default class predictions on the held-out rows.
precision_score(y_test, best_clf.predict(X_test), average='macro')

0.793848308554191

In [41]:
# Macro-averaged recall of the tree's default class predictions on the held-out rows.
recall_score(y_test, best_clf.predict(X_test), average='macro')

0.7597619047619047

In [42]:
# Per-class probability estimates for the held-out rows (one column per class).
y_pred_proba = best_clf.predict_proba(X=X_test)

In [43]:
# Label a row 1 when the probability in column 1 exceeds 0.25, else 0.
y_pred = (y_pred_proba[:, 1] > 0.25).astype(int)

In [44]:
# Macro-averaged precision after applying the 0.25 probability cut-off.
precision_score(y_true=y_test, y_pred=y_pred, average='macro')

0.8033799533799534

In [45]:
# Macro-averaged recall after applying the 0.25 probability cut-off.
recall_score(y_true=y_test, y_pred=y_pred, average='macro')

0.8098809523809524

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [49]:
# Hyper-parameter grid for the random forest: number of trees x maximum depth.
params = {'n_estimators': [50, 100, 150],
          'max_depth': [2, 4, 6, 8, 10, 12]}
# The forest is seeded so that bootstrap sampling and feature sub-sampling are
# repeatable; without it the selected model, its test score and the feature
# importances change on every run. cv=5 is spelled out to match the
# decision-tree search above instead of relying on the library default.
gscv = GridSearchCV(RandomForestClassifier(random_state=42), params, cv=5)

In [50]:
# Run the random-forest search on the training portion only.
gscv.fit(X=X_train, y=y_train)

In [51]:
# Keep a handle on the random forest the grid search selected.
rf = gscv.best_estimator_

In [52]:
# Score of the selected forest on the held-out rows.
rf.score(X=X_test, y=y_test)

0.8203389830508474

In [53]:
# Importance values exposed by the fitted forest; paired with the training
# column names in the next cell.
feature_importances = rf.feature_importances_

In [54]:
# Pair every training column with its importance and list the most important first.
fi_df = (
    pd.DataFrame({
        'features': X_train.columns.tolist(),
        'feature_importances': feature_importances,
    })
    .sort_values(by='feature_importances', ascending=False)
)
fi_df

Unnamed: 0,features,feature_importances
5,Sex_female,0.232087
6,Sex_male,0.218353
4,Fare,0.16808
1,Age,0.126123
0,Pclass,0.123381
2,SibSp,0.042116
3,Parch,0.037745
9,Embarked_S,0.022063
7,Embarked_C,0.020773
8,Embarked_Q,0.009278


# Predict and export data

In [55]:
# Read the rows to predict from the working directory.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [56]:
# Drop the identifier / free-text columns, mirroring the training feature selection.
# Note: this rebinds X, which previously held the training features.
X = test.drop(
    columns=["PassengerId", "Name", "Ticket", "Cabin"],
)

In [57]:
# One-hot encode the non-numeric columns the same way as the training features.
X = pd.get_dummies(X)
# Align to the exact columns (and order) the model was fitted on: a category
# absent from this file becomes an all-zero column, and a category never seen
# in training is dropped, so predict() always receives the expected layout.
X = X.reindex(columns=X_train.columns, fill_value=0)
# Impute with statistics of the TRAINING data, not of this file, so these rows
# are preprocessed exactly like the rows the model learned from. Fare is
# filled as well because any remaining NaN in a numeric feature would reach
# the model unhandled.
X = X.fillna({'Age': titanic_data['Age'].median(),
              'Fare': titanic_data['Fare'].median()})

In [58]:
# Class predictions of the selected forest for the preprocessed test rows.
y_pred = rf.predict(X=X)

In [59]:
# One-column frame of predictions, indexed by the passenger ids from the test file.
survived = pd.Series(y_pred, index=test['PassengerId'], name='Survived').to_frame()

In [60]:
# Preview the first five predictions.
survived.head()

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0


In [61]:
# Write the predictions (with the PassengerId index) to the working directory.
survived.to_csv(path_or_buf='submission.csv')