# Building the model

The model is an ensemble of:

* a decision tree that tries to capture the ["women and children first"](https://en.wikipedia.org/wiki/Women_and_children_first) protocol,

* support vector machines, and

* a random forest classifier.

The predicted class is decided with majority voting. 

Decision trees are used to impute missing `Fare` and `Age` values before building the model.

In [1]:
import random
import pprint

import numpy as np
import pandas as pd
import patsy

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.grid_search import GridSearchCV

from util import get_random_seed, ModelImputer, ColumnSelector

## Random seeds

In [2]:
# random_seed = get_random_seed()
# print('np.random.seed:', random_seed)
np.random.seed(762272023)

# random_seed = get_random_seed()
# print('random.seed:', random_seed)
random.seed(152070533)

## Load training data

In [3]:
train_data = pd.read_csv('clean_data/train.csv')

formula = 'Survived ~ Embarked + Class + Cabin + Fare + Title + Sex + Age + Relatives - 1'
y_train, X_train = patsy.dmatrices(formula, train_data)

Save the indices of the columns used as parameters in the model pipeline below.

In [4]:
columns = pd.DataFrame({'Column': X_train.design_info.column_names})
Age = columns.index[columns['Column'] == 'Age'].tolist()[0]
Sex = columns.index[columns['Column'] == 'Sex[T.male]'].tolist()[0]
Fare = columns.index[columns['Column'] == 'Fare'].tolist()[0]

## Build the model pipeline

In [5]:
ensemble = VotingClassifier(voting='hard', estimators=[
    ('women_and_children', make_pipeline(ColumnSelector([Sex, Age]), 
                                         DecisionTreeClassifier())),
    ('svm', SVC(kernel='rbf')),
    ('rf', RandomForestClassifier()), 
])

complete_columns = [i for i in range(columns.shape[1]) if i not in (Fare, Age)]

pipeline = Pipeline([
    ('fare_imputer', ModelImputer(DecisionTreeRegressor(), Fare, 
                                  complete_columns)),
    ('age_imputer', ModelImputer(DecisionTreeRegressor(), Age, 
                                 complete_columns + [Fare])),
    ('ensemble', ensemble),
])

## Estimate the model parameters

In [6]:
param_grid = {
    'ensemble__svm__C': np.logspace(-2, 4, 10),
    'ensemble__svm__gamma': np.logspace(-4, 2, 10),
    'ensemble__rf__n_estimators': [5, 11, 17, 23],
}

model = GridSearchCV(pipeline, param_grid, cv=10, n_jobs=-1, verbose=1)
model.fit(np.asarray(X_train), np.asarray(y_train.ravel()))

print('Best score:', model.best_score_)
print('Best weights:', pprint.pformat(model.best_params_), sep='\n')

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 744 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 1134 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1833 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 2557 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3246 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed:  4.3min finished


Best score: 0.838020247469
Best weights:
{'ensemble__rf__n_estimators': 17,
 'ensemble__svm__C': 2154.4346900318824,
 'ensemble__svm__gamma': 0.0001}


## Create a submission

In [7]:
test_data = pd.read_csv('clean_data/test.csv')

X_test = patsy.build_design_matrices([X_train.design_info], test_data)[0]
y_test = model.predict(X_test)

submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 
                           'Survived': y_test}, dtype=int)
submission.to_csv('submission/submission.csv', index=False)