# Titanic: Machine Learning from Disaster

In [1]:
# https://www.kaggle.com/c/titanic

In [2]:
import sklearn
import pandas as pd

In [3]:
# Load the training and test sets from their CSV files
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./homework/train.csv', './homework/test.csv')
)

In [4]:
# Preview the first five rows of the training set
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Предобработка данных

In [5]:
# Fill missing values with the per-column medians computed on the training
# set only, so the test set is imputed with training statistics.
# numeric_only=True is required because the frame still contains text columns
# (Name, Sex, Ticket, Cabin, Embarked): pandas >= 2.0 raises a TypeError when
# asked for the median of such columns, while older versions silently skipped
# them. Non-numeric columns are therefore left with their missing values.
train_median = train.median(numeric_only=True)
train_imp = train.fillna(train_median)
test_imp = test.fillna(train_median)

In [6]:
# One-hot encode the categorical features
CATEGORY_COL = ['Sex', 'Pclass', 'Embarked']

# The category levels are taken from the training set only and applied to both
# frames. Encoding train and test independently would yield different dummy
# columns (or a different dropped baseline level) whenever a level is missing
# from one of them, silently misaligning the feature matrices.
CATEGORY_LEVELS = {
    col: sorted(train_imp[col].dropna().unique()) for col in CATEGORY_COL
}


def _with_training_levels(frame):
    """Return a copy of `frame` whose CATEGORY_COL columns are categoricals.

    Each column uses the levels observed in the training set; values outside
    those levels (and missing values) become NaN and are therefore encoded as
    an all-zero dummy row.
    """
    return frame.assign(**{
        col: pd.Categorical(frame[col], categories=levels)
        for col, levels in CATEGORY_LEVELS.items()
    })


train_dummies = pd.get_dummies(_with_training_levels(train_imp),
                               columns=CATEGORY_COL, drop_first=True)
test_dummies = pd.get_dummies(_with_training_levels(test_imp),
                              columns=CATEGORY_COL, drop_first=True)

In [7]:
# Preview the first five rows after one-hot encoding
train_dummies.head(n=5)

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,0,1,0,1


In [8]:
# Separate the target and discard the columns that are not used as features
DROP_COL = ['PassengerId', 'Name', 'Ticket', 'Cabin']
TARGET_COL = 'Survived'
y_train = train_dummies[TARGET_COL]
X_train = train_dummies.drop(columns=DROP_COL + [TARGET_COL])
X_test = test_dummies.drop(columns=DROP_COL)

## Предсказания моделей для стекинга

In [11]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

def cross_val_predict_proba(estimator, X_train, y_train):
    """Return cross-validated `predict_proba` outputs for `estimator`.

    The rows are split with a shuffled 4-fold KFold using a fixed seed
    (12345); the result of `cross_val_predict` is returned unchanged.
    """
    splitter = KFold(n_splits=4, shuffle=True, random_state=12345)
    out_of_fold = cross_val_predict(
        estimator,
        X_train,
        y_train,
        cv=splitter,
        method='predict_proba',
    )
    return out_of_fold

# TODO: tune the hyperparameter grids for the ensembles
def choose_best_estimator(estimator, param_grid, X_train, y_train, random_state=12345):
    """Grid-search `param_grid` for `estimator` and return the fitted search.

    Parameters
    ----------
    estimator : the model to tune.
    param_grid : dict mapping parameter names to the values to try.
    X_train, y_train : data the search is fitted on.
    random_state : seed for the shuffled 4-fold split. A fixed default keeps
        the folds, and hence the selected parameters, reproducible between
        runs; pass None to restore an unseeded split.

    Returns
    -------
    The fitted GridSearchCV object (scored by accuracy).
    """
    kfold = KFold(n_splits=4, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(estimator, param_grid,
                               scoring=make_scorer(accuracy_score),
                               cv=kfold)
    return grid_search.fit(X_train, y_train)

# Hyperparameter grids explored by the grid search
rf_param_grid = {'max_depth': range(1, 10, 1), 'n_estimators': range(1, 100, 10)}
gb_param_grid = {'n_estimators': range(1, 100, 10)}

# Fixed (not searched) hyperparameters; LEARNING_RATE is shared by both
# boosting models, the remaining ones apply to gradient boosting only
LEARNING_RATE = 0.05
MAX_DEPTH = 5
SUBSAMPLE = 0.75
MAX_FEATURES = 0.5
LEAF = 8
# Seed for the stochastic ensembles so that re-running the notebook
# reproduces the same models and predictions
ENSEMBLE_SEED = 12345

# Tune every ensemble with a grid search over its parameter grid
rf_estimator = choose_best_estimator(
    RandomForestClassifier(max_features="sqrt", random_state=ENSEMBLE_SEED),
    rf_param_grid, X_train, y_train)
gb_estimator = choose_best_estimator(
    GradientBoostingClassifier(learning_rate=LEARNING_RATE,
                               max_depth=MAX_DEPTH,
                               subsample=SUBSAMPLE,
                               max_features=MAX_FEATURES,
                               min_samples_leaf=LEAF,
                               random_state=ENSEMBLE_SEED),
    gb_param_grid, X_train, y_train)
# AdaBoost is tuned over the same n_estimators grid as gradient boosting
ada_estimator = choose_best_estimator(
    AdaBoostClassifier(learning_rate=LEARNING_RATE, random_state=ENSEMBLE_SEED),
    gb_param_grid, X_train, y_train)

# Cross-validated class probabilities of each ensemble on the training set
rf_train_pred = cross_val_predict_proba(rf_estimator, X_train, y_train)
gb_train_pred = cross_val_predict_proba(gb_estimator, X_train, y_train)
ada_train_pred = cross_val_predict_proba(ada_estimator, X_train, y_train)

# Column 1 of each prediction becomes one meta-feature for the stacking model
X_train_stack = np.stack([rf_train_pred[:, 1], gb_train_pred[:, 1], ada_train_pred[:, 1]], axis=1)

# Test-set probabilities: choose_best_estimator returns already-fitted
# searches, so they are used directly without another fit
rf_test_pred = rf_estimator.predict_proba(X_test)
gb_test_pred = gb_estimator.predict_proba(X_test)
ada_test_pred = ada_estimator.predict_proba(X_test)

X_test_stack = np.stack([rf_test_pred[:, 1], gb_test_pred[:, 1], ada_test_pred[:, 1]], axis=1)

In [12]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# Shapes of the cross-validated probability arrays
print(*(pred.shape for pred in (rf_train_pred, gb_train_pred)))
def cross_val_score_get(estimator, X_train, y_train, random_state=12345):
    """Return the per-fold accuracy of `estimator` under 4-fold cross-validation.

    Parameters
    ----------
    estimator : the model to evaluate.
    X_train, y_train : data to cross-validate on.
    random_state : seed for the shuffled KFold split. A fixed default makes
        the reported scores reproducible between runs; pass None to restore
        an unseeded split.

    Returns
    -------
    Array with one accuracy score per fold.
    """
    kfold = KFold(n_splits=4, shuffle=True, random_state=random_state)
    score = cross_val_score(estimator,
                            X_train, y_train,
                            scoring=make_scorer(accuracy_score),
                            cv=kfold)
    return score

# Cross-validated accuracy of each tuned ensemble on the original features
for tuned_estimator in (rf_estimator, gb_estimator, ada_estimator):
    print(cross_val_score_get(tuned_estimator, X_train, y_train))

(891, 2) (891, 2)
[ 0.86098655  0.79820628  0.79372197  0.86036036]
[ 0.81165919  0.84753363  0.83856502  0.8018018 ]
[ 0.8161435   0.76233184  0.75784753  0.84234234]


## Объединяем предсказания ансамблей с помощью логистической регрессии

In [14]:
from sklearn.linear_model import LogisticRegression

# Sanity-check the shapes of the stacked feature matrices
print(X_test_stack.shape, X_train_stack.shape)

# TODO: tune the LogisticRegression hyperparameters
lr_params = {'C': range(1, 10000, 1000)}
logreg = choose_best_estimator(
    LogisticRegression(),
    lr_params,
    X_train_stack,
    y_train,
)

# Final class predictions for the test set from the stacking model
predicted = logreg.predict(X_test_stack)

# Cross-validated accuracy of the stacking model on the training meta-features
print(cross_val_score_get(logreg, X_train_stack, y_train))

(418, 3) (891, 3)
[ 0.86995516  0.82511211  0.85201794  0.78378378]


## Формируем файл для отправки

In [15]:
# Write one "PassengerId,Survived" line per test passenger after the header
with open('submission.txt', 'w') as out:
    out.write('PassengerId,Survived\n')
    out.writelines(
        '%s,%s\n' % (passenger, y)
        for passenger, y in zip(test['PassengerId'], predicted)
    )