# Titanic: Machine Learning from Disaster

In [1]:
# Kaggle competition page: https://www.kaggle.com/c/titanic

In [2]:
import sklearn
import pandas as pd

In [3]:
# Load the training and test sets from their CSV files
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./titanic/train.csv', './titanic/test.csv')
)

In [4]:
# Preview the first five rows of the raw training set
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Предобработка данных

In [6]:
# Fill missing values with per-column medians computed on the training set.
# The same training medians are applied to the test set, so preprocessing
# never uses statistics derived from the test data.
#
# numeric_only=True is required on pandas >= 2.0, where DataFrame.median()
# raises a TypeError on frames containing text columns (Name, Sex, Ticket, ...)
# instead of silently skipping them. As a consequence, missing values in
# non-numeric columns are left as NaN by this step.
train_median = train.median(numeric_only=True)
train_imp = train.fillna(train_median)
test_imp = test.fillna(train_median)

In [7]:
# One-hot encode the categorical features. drop_first=True removes the first
# level of every feature, leaving k-1 indicator columns for k levels.
CATEGORY_COL = ['Sex', 'Pclass', 'Embarked']
train_dummies = train_imp.pipe(pd.get_dummies, columns=CATEGORY_COL, drop_first=True)
test_dummies = test_imp.pipe(pd.get_dummies, columns=CATEGORY_COL, drop_first=True)

In [8]:
# Inspect the first five rows of the encoded training frame
train_dummies.head(n=5)

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1,0,1


In [9]:
# Drop the columns that are not used as model features and split off the target
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']
TARGET_COL = 'Survived'
X_train = train_dummies.drop(DROP_COL + [TARGET_COL], axis=1)
y_train = train_dummies[TARGET_COL]

# Train and test were one-hot encoded independently, so their indicator
# columns are not guaranteed to match. Align the test matrix to the training
# columns (same set, same order): an indicator missing from the test set is
# filled with 0, and a column unseen during training is dropped.
X_test = (
    test_dummies
    .drop(DROP_COL, axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)

## Предсказания моделей для стекинга

In [37]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def cross_val_predict_proba(estimator, X_train, y_train, n_splits=4, random_state=42):
    """Return out-of-fold class-probability predictions for the training data.

    Every sample is predicted by a model that was fitted on the folds that do
    not contain it, which makes the result usable as training input for a
    second-level (stacking) model.

    Parameters
    ----------
    estimator : classifier exposing ``fit`` and ``predict_proba``
    X_train : feature matrix
    y_train : target labels
    n_splits : int, default 4
        Number of shuffled K-fold splits.
    random_state : int or None, default 42
        Seed for the fold shuffling. A fixed default keeps the generated
        predictions reproducible between runs; pass ``None`` to get a
        different random split on every call.

    Returns
    -------
    The result of ``cross_val_predict(..., method='predict_proba')``: one row
    per training sample and one column per class.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    return cross_val_predict(estimator, X_train, y_train, cv=kfold, method='predict_proba')

# Tune the random-forest hyperparameters with an exhaustive grid search
# (4-fold shuffled cross-validation, scored by accuracy).
param_grid_random_forest = {
    'n_estimators': [5, 10, 15, 20, 25],
    'max_features': [1, 3, 5, 7],
    'max_depth': [3, 5, 7],
}

# Both the forest and the fold shuffling are seeded; otherwise the selected
# best_params_ can change from one run of the notebook to the next.
random_forest_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_random_forest,
    scoring='accuracy',  # equivalent to make_scorer(accuracy_score)
    cv=KFold(n_splits=4, shuffle=True, random_state=42),
)

# The last expression displays the best parameter combination as cell output
random_forest_grid.fit(X_train, y_train).best_params_

{'max_depth': 5, 'max_features': 3, 'n_estimators': 20}

In [44]:
# Tune the gradient-boosting hyperparameters with an exhaustive grid search
# (4-fold shuffled cross-validation, scored by accuracy).
param_grid_gradient_boosting = {
    'n_estimators': [10, 50, 100],
    'max_features': [3, 5, 7, None],
    'max_depth': [3, 5, 7],
    'max_leaf_nodes': [3, 5, 30],
    'min_samples_split': [2, 10, 20],
}

# Both the booster and the fold shuffling are seeded; otherwise the selected
# best_params_ can change from one run of the notebook to the next.
gradient_boosting_grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid_gradient_boosting,
    scoring='accuracy',  # equivalent to make_scorer(accuracy_score)
    cv=KFold(n_splits=4, shuffle=True, random_state=42),
)

# The last expression displays the best parameter combination as cell output
gradient_boosting_grid.fit(X_train, y_train).best_params_

{'max_depth': 5,
 'max_features': 5,
 'max_leaf_nodes': 30,
 'min_samples_split': 20,
 'n_estimators': 50}

In [45]:
# Initialise the base models with the hyperparameters selected by the grid
# searches. random_state is fixed so that the stacking features (and hence the
# final submission) are reproducible between runs.
rf_estimator = RandomForestClassifier(
    max_depth=5,
    max_features=3,
    n_estimators=20,
    random_state=42,
)
gb_estimator = GradientBoostingClassifier(
    max_depth=5,
    max_features=5,
    max_leaf_nodes=30,
    min_samples_split=20,
    n_estimators=50,
    random_state=42,
)

# Cross-validated probability predictions of both ensembles on the training
# set: these become the training features of the second-level model.
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)

# Keep only column 1 of each probability matrix (the second class) and place
# the two models side by side: one row per sample, one column per base model.
X_train_stack = np.stack([rf_train_pred[:, 1], gb_train_pred[:, 1]], axis=1)

# For the test set, refit each ensemble on the full training data and predict
rf_test_pred = rf_estimator.fit(X_train, y_train).predict_proba(X_test)
gb_test_pred = gb_estimator.fit(X_train, y_train).predict_proba(X_test)

X_test_stack = np.stack([rf_test_pred[:, 1], gb_test_pred[:, 1]], axis=1)

## Объединяем предсказания ансамблей с помощью логистической регрессии

In [72]:
from sklearn.linear_model import LogisticRegression

# Second-level model: logistic regression over the two base-model
# probabilities. The penalty type and the inverse regularisation strength C
# are tuned by grid search (4-fold shuffled CV, scored by accuracy).
param_grid_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [1, 10, 100, 1000, 5000, 10000],
}

# solver='liblinear' is pinned explicitly because it supports both the 'l1'
# and the 'l2' penalty. The default solver in scikit-learn >= 0.22 ('lbfgs')
# rejects 'l1', which would make every L1 candidate in the grid fail to fit.
# The solver and the fold shuffling are seeded for reproducibility.
logreg_grid = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    param_grid_logreg,
    scoring='accuracy',  # equivalent to make_scorer(accuracy_score)
    cv=KFold(n_splits=4, shuffle=True, random_state=42),
)

logreg_grid.fit(X_train_stack, y_train)

# predict() on the fitted search delegates to the best configuration found
predicted = logreg_grid.predict(X_test_stack)

## Формируем файл для отправки

In [74]:
# Write the submission file: a header line followed by one
# "PassengerId,Survived" row per test passenger
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    submission_rows = zip(test['PassengerId'], predicted)
    out.writelines('%s,%s\n' % row for row in submission_rows)