In [351]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.tree import export_graphviz
from graphviz import Source
import subprocess
from os import system
from sklearn import ensemble

**Функция для формирования csv-файла посылки на Kaggle:**

In [114]:
def write_to_submission_file(predicted_labels, out_file, train_num=891,
                    target='Survived', index_label="PassengerId"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(train_num + 1,
                                                  train_num + 1 +
                                                  predicted_labels.shape[0]),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

**Считываем обучающую и тестовую выборки.**

In [115]:
train_data = pd.read_csv("../../../datasets/titanic_train.csv") 
test_data = pd.read_csv("../../../datasets/titanic_test.csv")

In [116]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [117]:
y = train_data['Survived']

**Приведение данных**

In [118]:
train_data.Embarked.fillna(train_data.Embarked.mode()[0], inplace = True)
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(train_data['Fare'].median(), inplace=True)
test_data['Age'].fillna(train_data['Age'].median(), inplace=True)

In [119]:
test_data['Sex'] = LabelEncoder().fit_transform(test_data['Sex'])
train_data['Sex'] = LabelEncoder().fit_transform(train_data['Sex'])
test_data['Embarked'] = LabelEncoder().fit_transform(test_data['Embarked'])
train_data['Embarked'] = LabelEncoder().fit_transform(train_data['Embarked'])

In [120]:
train_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], 
              axis=1, inplace=True)
test_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], 
              axis=1, inplace=True)

In [121]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [122]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.5,0,0,7.8292,1
1,3,0,47.0,1,0,7.0,2
2,2,1,62.0,0,0,9.6875,1
3,3,1,27.0,0,0,8.6625,2
4,3,0,22.0,1,1,12.2875,2


## 1. Дерево решений без настройки параметров

**Обучите на имеющейся выборке дерево решений (DecisionTreeClassifier) максимальной глубины 2. Используйте параметр random_state=17 для воспроизводимости результатов.**

In [123]:
X = train_data.drop(['Survived',], axis=1)

In [124]:
tree=DecisionTreeClassifier(max_depth=2, random_state=17)
tree.fit(X, y)
print('Результат на обучающей выборке: ', tree.score(X, y))

Результат на обучающей выборке:  0.7957351290684624


**Сделайте с помощью полученной модели прогноз для тестовой выборки**

In [125]:
predictions = tree.predict(test_data)

**Сформируйте файл посылки и отправьте на Kaggle**

In [126]:
write_to_submission_file(predicted_labels=predictions,out_file='submission.csv')

Каков результат первой посылки (дерево решений без настройки параметров) в публичном рейтинге соревнования Titanic?
- **0.746**
- 0.756
- 0.766
- 0.776

**Отобразите дерево с помощью export_graphviz и dot.**

In [128]:
dotfile = open(r'Titanic_tree.dot', 'w')

graph = Source(export_graphviz(tree, out_file=dotfile, 
               rounded = True, proportion = False, 
                precision = 2, filled = True))
dotfile.close()
system("dot -Tpng Titanic_tree.dot -o Titanic_tree.png")

0

Сколько признаков задействуются при прогнозе деревом решений глубины 2?

- 2
- **3**
- 4
- 5

## 2. Дерево решений с настройкой параметров

**Обучите на имеющейся выборке дерево решений (DecisionTreeClassifier). Также укажите random_state=17. Максимальную глубину и минимальное число элементов в листе настройте на 5-кратной кросс-валидации с помощью GridSearchCV.**

In [132]:
# tree params for grid search
tree_params = {'max_depth': list(range(1, 5)), 
               'min_samples_leaf': list(range(1, 5))}

tree_grid = GridSearchCV(tree, tree_params,
                         cv=5, n_jobs=-1,
                        verbose=True)
tree_grid.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   15.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=17,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'max_depth': [1, 2, 3, 4],
                         'min_samples_leaf': [1, 2, 3, 4]},
             pre_dispatch='2*n_

In [134]:
tree_grid.best_params_

{'max_depth': 3, 'min_samples_leaf': 3}

In [135]:
tree_grid.best_score_

0.8114478114478114

Каковы лучшие параметры дерева, настроенные на кросс-валидации с помощью GridSearchCV?

- max_depth=2, min_samples_leaf=1
- max_depth=2, min_samples_leaf=4
- max_depth=3, min_samples_leaf=2
- **max_depth=3, min_samples_leaf=3**

Какой получилась средняя доля верных ответов на кросс-валидации для дерева решений с лучшим сочетанием гиперпараметров max_depth и min_samples_leaf?

- 0.77
- 0.79
- **0.81**
- 0.83

**Сделайте с помощью полученной модели прогноз для тестовой выборки.**

In [137]:
grid_predictions = tree_grid.predict(test_data)

**Сформируйте файл посылки и отправьте на Kaggle.**

In [138]:
write_to_submission_file(predicted_labels=grid_predictions,out_file='grid_submission.csv')


Каков результат второй посылки (дерево решений с настройкой гиперпараметров) в публичном рейтинге соревнования Titanic?

- 0.7499
- 0.7599
- 0.7699
- 0.7799

## 3

In [326]:
train_data = pd.read_csv("../../../datasets/titanic_train.csv") 
test_data = pd.read_csv("../../../datasets/titanic_test.csv")
y = train_data['Survived']

In [327]:
td = pd.concat([train_data, test_data], ignore_index=True, sort  = False)

In [328]:
td['Family'] = td.Parch + td.SibSp
td['Is_Alone'] = td.Family == 0
td['Fare_Category'] = pd.cut(td['Fare'], bins=[0,7.90,14.45,31.28,120], labels=['Low','Mid', 'High_Mid','High'])

In [329]:
td.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family,Is_Alone,Fare_Category
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,False,Low
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,False,High
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,True,Mid
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,False,High
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,True,Mid


In [330]:
td.Embarked.fillna(td.Embarked.mode()[0], inplace = True)
td.Fare_Category.fillna('High', inplace=True)
td.Cabin = td.Cabin.fillna('NA')

In [331]:
td.Embarked.unique()

array(['S', 'C', 'Q'], dtype=object)

In [332]:
td['Salutation'] = td.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
grp = td.groupby(['Sex', 'Pclass'])
grp.Age.apply(lambda x: x.fillna(x.median()))
td.Age.fillna(td.Age.median(), inplace = True)

In [333]:
td['Age_Range'] = pd.cut(td.Age, [0, 10, 20, 30, 40, 50, 60,70,80])

In [334]:
td.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family,Is_Alone,Fare_Category,Salutation,Age_Range
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,False,Low,Mr,"(20, 30]"
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,False,High,Mrs,"(30, 40]"
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,True,Mid,Miss,"(20, 30]"
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,False,High,Mrs,"(30, 40]"
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,True,Mid,Mr,"(30, 40]"


In [335]:
td = pd.concat([td,pd.get_dummies(td.Age_Range, prefix="Age_Range"), pd.get_dummies(td.Embarked, prefix="Emb"),pd.get_dummies(td.Fare_Category, prefix="Fare")], axis=1)
td['Sex'] = LabelEncoder().fit_transform(td['Sex'])
td['Is_Alone'] = LabelEncoder().fit_transform(td['Is_Alone'])

In [336]:
td.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Family', 'Is_Alone',
       'Fare_Category', 'Salutation', 'Age_Range', 'Age_Range_(0, 10]',
       'Age_Range_(10, 20]', 'Age_Range_(20, 30]', 'Age_Range_(30, 40]',
       'Age_Range_(40, 50]', 'Age_Range_(50, 60]', 'Age_Range_(60, 70]',
       'Age_Range_(70, 80]', 'Emb_C', 'Emb_Q', 'Emb_S', 'Fare_Low', 'Fare_Mid',
       'Fare_High_Mid', 'Fare_High'],
      dtype='object')

In [337]:
td.drop(['Fare','Cabin', 'Fare_Category','Name','Salutation', 'Ticket','Embarked', 'Age_Range', 'SibSp', 'Parch', 'Age'], axis=1, inplace=True)

In [338]:
td.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Family,Is_Alone,"Age_Range_(0, 10]","Age_Range_(10, 20]","Age_Range_(20, 30]","Age_Range_(30, 40]",...,"Age_Range_(50, 60]","Age_Range_(60, 70]","Age_Range_(70, 80]",Emb_C,Emb_Q,Emb_S,Fare_Low,Fare_Mid,Fare_High_Mid,Fare_High
0,1,0.0,3,1,1,0,0,0,1,0,...,0,0,0,0,0,1,1,0,0,0
1,2,1.0,1,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
2,3,1.0,3,0,0,1,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
3,4,1.0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,5,0.0,3,1,0,1,0,0,0,1,...,0,0,0,0,0,1,0,1,0,0


In [344]:
# Data to be predicted
X_to_be_predicted = td[td.Survived.isnull()]
X_to_be_predicted = X_to_be_predicted.drop(['Survived'], axis = 1)

#Training data
train_data = td
train_data = train_data.dropna()
y = train_data['Survived']
y = y.astype('int')
X = train_data.drop(['Survived'], axis = 1)
train_data.shape #891 x 28

(891, 21)

In [345]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Family', 'Is_Alone',
       'Age_Range_(0, 10]', 'Age_Range_(10, 20]', 'Age_Range_(20, 30]',
       'Age_Range_(30, 40]', 'Age_Range_(40, 50]', 'Age_Range_(50, 60]',
       'Age_Range_(60, 70]', 'Age_Range_(70, 80]', 'Emb_C', 'Emb_Q', 'Emb_S',
       'Fare_Low', 'Fare_Mid', 'Fare_High_Mid', 'Fare_High'],
      dtype='object')

In [346]:
tree_params = {'max_depth': list(range(1, 10)), 
               'min_samples_leaf': list(range(1, 10)),
               'criterion': ['gini', 'entropy']}

tree_grid = GridSearchCV(tree, tree_params,
                         cv=5, n_jobs=-1,
                        verbose=True)
tree_grid.fit(X, y)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed:    0.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=2,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=17,
                                              splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                 

In [347]:
tree_grid.best_params_

{'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 6}

In [348]:
tree_grid.best_score_

0.8249158249158249

In [349]:
new_grid_predictions = tree_grid.predict(X_to_be_predicted)

In [350]:
write_to_submission_file(predicted_labels=new_grid_predictions,out_file='new2_grid_submission.csv')

## 4.

In [368]:
rf = ensemble.RandomForestClassifier(random_state=11)
forest_params = { "n_estimators"      : list(range(1,100)),
                  "criterion"         : ["gini", "entropy"],
                  "max_depth"         : [5, 10, 20],
                  "bootstrap": [True, False]}
forest_grid = GridSearchCV(rf, forest_params,
                         cv=5, n_jobs=-1,
                        verbose=True)
forest_grid.fit(X, y)

Fitting 5 folds for each of 1188 candidates, totalling 5940 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.0s
[Parallel(n_jobs=-1)]: Done 848 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 1705 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 2761 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4393 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 5887 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 5940 out of 5940 | elapsed:  2.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=11,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'bo

In [369]:
forest_grid.best_params_

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'n_estimators': 68}

In [370]:
forest_grid.best_score_

0.8305274971941639

In [371]:
forest_predictions = forest_grid.predict(X_to_be_predicted)

In [372]:
write_to_submission_file(predicted_labels=forest_predictions,out_file='forest3_grid_submission.csv')