# Building the model

Build a random forest model. Decision trees are used to impute missing `Fare` and `Age` values.

In [1]:
%matplotlib inline

import random
import itertools

import numpy as np
import pandas as pd
import patsy
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV

from util import get_random_seed, ModelImputer

## Random seeds

In [2]:
random_seed = get_random_seed()
print('np.random.seed:', random_seed)
np.random.seed(random_seed)

random_seed = get_random_seed()
print('random.seed:', random_seed)
random.seed(random_seed)

np.random.seed: 4134178650
random.seed: 1945908556


## Load training data

In [3]:
train_data = pd.read_csv('clean_data/train.csv')

formula = 'Survived ~ Embarked + Class + Cabin + Fare + Sex + Age + Relatives - 1'
y_train, X_train = patsy.dmatrices(formula, train_data, NA_action='raise')

Print the indices of the variables (used as `ModelImputer` parameters below).

In [4]:
pd.DataFrame({'Variable': X_train.design_info.column_names})

Unnamed: 0,Variable
0,Embarked[Cherbourg]
1,Embarked[Queenstown]
2,Embarked[Southampton]
3,Class[T.second]
4,Class[T.third]
5,Cabin[T.B]
6,Cabin[T.C]
7,Cabin[T.D]
8,Cabin[T.E]
9,Cabin[T.F]


## Build the model pipeline

In [5]:
pipeline = Pipeline([
    ('fare_imputer', ModelImputer(DecisionTreeRegressor(), 14, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])),
    ('age_imputer', ModelImputer(DecisionTreeRegressor(), 15, [13, 14, 16])),
    ('forest', RandomForestClassifier()),
])

## Estimate the model parameters

In [6]:
param_grid = {'forest__n_estimators': list(range(10, 101, 10))}

model = GridSearchCV(pipeline, param_grid=param_grid, cv=10, n_jobs=4, verbose=1)
model.fit(np.asarray(X_train), np.asarray(y_train.ravel()))

print('Best accuracy:', model.best_score_)
print('Best parameters:', model.best_params_)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done  50 jobs       | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  94 out of 100 | elapsed:    2.7s remaining:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    3.0s finished


Best accuracy: 0.817772778403
Best parameters: {'forest__n_estimators': 40}


## Create a submission

In [7]:
test_data = pd.read_csv('clean_data/test.csv')

X_test = patsy.build_design_matrices([X_train.design_info], test_data, NA_action='raise')[0]
y_test = model.predict(X_test)

submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_test}, dtype=int)
submission.to_csv('submission/submission.csv', index=False)