# Титаник. Кто выживет?

https://www.kaggle.com/c/titanic/

In [3]:
import numpy as np
import pandas as pd

In [4]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

%config InlineBackend.figure_format = 'retina'
from pylab import rcParams
rcParams['figure.figsize'] = (9, 6)

### Данные

In [5]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [9]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [14]:
y_train = train.Survived
train.drop('Survived', axis=1, inplace=True)

# Подготовка данных

In [16]:
train['is_test'] = 0
test['is_test'] = 1

In [23]:
df = pd.concat([train, test])

In [24]:
df["isMale"] = df.Sex.replace({"male": 1, "female":0})
df.drop(["Sex", "Cabin", "Ticket", "Name", "PassengerId"], axis=1, inplace=True)

In [116]:
DEmbarked = df.Embarked.value_counts(normalize=True).to_dict()
df.Embarked = df.Embarked.replace(DEmbarked)

In [35]:
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,is_test,isMale
0,3,22.0,1,0,7.25,0.699311,0,1
1,1,38.0,1,0,71.2833,0.20658,0,0
2,3,26.0,0,0,7.925,0.699311,0,0
3,1,35.0,1,0,53.1,0.699311,0,0
4,3,35.0,0,0,8.05,0.699311,0,1


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 8 columns):
Pclass      1309 non-null int64
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Fare        1308 non-null float64
Embarked    1307 non-null float64
is_test     1309 non-null int64
isMale      1309 non-null int64
dtypes: float64(3), int64(5)
memory usage: 92.0 KB


In [113]:
# Заполнение пропущенных значений
from sklearn.preprocessing import Imputer
columns = df.columns
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)
imputer.fit(df)
df_imputed = imputer.transform(df)
df_imputed = pd.DataFrame(df_imputed, columns=columns)

In [114]:
df_imputed.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,is_test,isMale
0,3.0,22.0,1.0,0.0,7.25,0.699311,0.0,1.0
1,1.0,38.0,1.0,0.0,71.2833,0.20658,0.0,0.0
2,3.0,26.0,0.0,0.0,7.925,0.699311,0.0,0.0
3,1.0,35.0,1.0,0.0,53.1,0.699311,0.0,0.0
4,3.0,35.0,0.0,0.0,8.05,0.699311,0.0,1.0


In [115]:
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 8 columns):
Pclass      1309 non-null float64
Age         1309 non-null float64
SibSp       1309 non-null float64
Parch       1309 non-null float64
Fare        1309 non-null float64
Embarked    1309 non-null float64
is_test     1309 non-null float64
isMale      1309 non-null float64
dtypes: float64(8)
memory usage: 81.9 KB


In [78]:
X_train = df_imputed[df_imputed.is_test==0].drop('is_test', axis=1)
X_test = df_imputed[df_imputed.is_test==1].drop('is_test', axis=1)

In [79]:
X_train.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Embarked,isMale
0,3.0,22.0,1.0,0.0,7.25,0.699311,1.0
1,1.0,38.0,1.0,0.0,71.2833,0.20658,0.0
2,3.0,26.0,0.0,0.0,7.925,0.699311,0.0
3,1.0,35.0,1.0,0.0,53.1,0.699311,0.0
4,3.0,35.0,0.0,0.0,8.05,0.699311,1.0
5,3.0,29.881138,0.0,0.0,8.4583,0.094109,1.0
6,1.0,54.0,0.0,0.0,51.8625,0.699311,1.0
7,3.0,2.0,3.0,1.0,21.075,0.699311,1.0
8,3.0,27.0,0.0,2.0,11.1333,0.699311,0.0
9,2.0,14.0,1.0,0.0,30.0708,0.20658,0.0


In [80]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null float64
Age         891 non-null float64
SibSp       891 non-null float64
Parch       891 non-null float64
Fare        891 non-null float64
Embarked    891 non-null float64
isMale      891 non-null float64
dtypes: float64(7)
memory usage: 55.7 KB


### Обучение с кросс-валидацией

In [84]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [85]:
clf = DecisionTreeClassifier(max_depth=4)

In [87]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [88]:
clf.feature_importances_

array([0.20260669, 0.07429728, 0.04262917, 0.00787   , 0.07587536,
       0.        , 0.59672151])

In [90]:
X_train.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'isMale'], dtype='object')

In [98]:
depths = np.arange(1,10)
features_num = np.arange(1,7)
grid = {'max_depth': depths, 'max_features': features_num}
gridsearch = GridSearchCV(DecisionTreeClassifier(), grid, scoring='neg_log_loss', cv=5)

In [99]:
%%time
gridsearch.fit(X_train, y_train)

Wall time: 736 ms


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_features': array([1, 2, 3, 4, 5, 6])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0)

In [100]:
sorted(gridsearch.grid_scores_, key = lambda x: -x.mean_validation_score)



[mean: -0.47224, std: 0.10443, params: {'max_depth': 3, 'max_features': 5},
 mean: -0.47250, std: 0.02497, params: {'max_depth': 2, 'max_features': 6},
 mean: -0.48242, std: 0.02052, params: {'max_depth': 2, 'max_features': 5},
 mean: -0.48560, std: 0.13344, params: {'max_depth': 4, 'max_features': 4},
 mean: -0.50178, std: 0.06973, params: {'max_depth': 2, 'max_features': 3},
 mean: -0.50783, std: 0.01757, params: {'max_depth': 2, 'max_features': 4},
 mean: -0.51588, std: 0.02661, params: {'max_depth': 1, 'max_features': 5},
 mean: -0.51588, std: 0.02661, params: {'max_depth': 1, 'max_features': 6},
 mean: -0.55578, std: 0.05658, params: {'max_depth': 2, 'max_features': 1},
 mean: -0.55914, std: 0.05952, params: {'max_depth': 1, 'max_features': 2},
 mean: -0.55968, std: 0.10331, params: {'max_depth': 3, 'max_features': 6},
 mean: -0.56006, std: 0.09989, params: {'max_depth': 3, 'max_features': 2},
 mean: -0.56090, std: 0.11748, params: {'max_depth': 3, 'max_features': 4},
 mean: -0.56

In [101]:
gridsearch.best_params_

{'max_depth': 3, 'max_features': 5}

In [103]:
best_depth = gridsearch.best_params_["max_depth"]

# Предсказание

In [104]:
from sklearn.metrics import accuracy_score

In [105]:
clf = DecisionTreeClassifier(max_depth=best_depth)

In [106]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [107]:
y_pred = clf.predict(X_test)

In [108]:
y_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

### Сохраняем результат

In [109]:
submussion = 'PassengerId,Survived\n'
submussion += "\n".join(["{},{}".format(pid, prediction) for pid, prediction in zip(test.PassengerId, y_pred)])

In [118]:
with open('submission.csv', 'w') as file:
    file.write(submussion)

### Визуализируем дерево

In [119]:
from sklearn.tree import export_graphviz

def get_tree_dot_view(clf, feature_names=None, class_names=None):
    print(export_graphviz(clf, out_file=None, filled=True, feature_names=feature_names, class_names=class_names))

In [120]:
get_tree_dot_view(clf, list(X_train.columns), {0: "Not Survived", 1: "Survived"})

digraph Tree {
node [shape=box, style="filled", color="black"] ;
0 [label="isMale <= 0.5\ngini = 0.473\nsamples = 891\nvalue = [549, 342]\nclass = Not Survived", fillcolor="#e5813960"] ;
1 [label="Pclass <= 2.5\ngini = 0.383\nsamples = 314\nvalue = [81, 233]\nclass = Survived", fillcolor="#399de5a6"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="Age <= 2.5\ngini = 0.1\nsamples = 170\nvalue = [9, 161]\nclass = Survived", fillcolor="#399de5f1"] ;
1 -> 2 ;
3 [label="gini = 0.5\nsamples = 2\nvalue = [1, 1]\nclass = Not Survived", fillcolor="#e5813900"] ;
2 -> 3 ;
4 [label="gini = 0.091\nsamples = 168\nvalue = [8, 160]\nclass = Survived", fillcolor="#399de5f2"] ;
2 -> 4 ;
5 [label="Fare <= 23.35\ngini = 0.5\nsamples = 144\nvalue = [72, 72]\nclass = Not Survived", fillcolor="#e5813900"] ;
1 -> 5 ;
6 [label="gini = 0.484\nsamples = 117\nvalue = [48, 69]\nclass = Survived", fillcolor="#399de54e"] ;
5 -> 6 ;
7 [label="gini = 0.198\nsamples = 27\nvalue = [24, 3]\nclas