In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

# RF Model 1

#### Load data

In [3]:
# Read the training and test CSVs from the local data/ directory.
train, test = (pd.read_csv(os.path.join('data', csv_name))
               for csv_name in ('train.csv', 'test.csv'))

#### Transform variables

In [5]:
# Feature-engineering steps, applied strictly in this order. Every helper
# takes the (train, test) pair and returns the transformed pair, so the
# steps chain. What each helper does is defined in feature_process_helper
# and is not visible from this notebook.
transform_steps = (
    feature_process_helper.names,
    feature_process_helper.age_impute,
    feature_process_helper.cabin,
    feature_process_helper.embarked_impute,
    feature_process_helper.dummies,
    feature_process_helper.drop,
)
for apply_step in transform_steps:
    train, test = apply_step(train, test)

#### Tune hyper-parameters

In [8]:
# Base estimator passed to the grid search; the searched hyper-parameters
# (criterion, min_samples_leaf, min_samples_split, n_estimators) are set
# per candidate by GridSearchCV.
# max_features='sqrt' is the setting that 'auto' selected for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, so the
# explicit name is used to keep this cell working on current releases.
rf1 = RandomForestClassifier(max_features='sqrt',
                             oob_score=True,
                             random_state=1,  # fixed seed for reproducible forests
                             n_jobs=-1)       # use all available cores

In [12]:
# Candidate values for each tuned hyper-parameter. The search is exhaustive
# over the cross product: 2 * 3 * 5 * 4 = 120 combinations.
param_grid = dict(
    criterion=["gini", "entropy"],
    min_samples_leaf=[1, 5, 10],
    min_samples_split=[2, 4, 10, 12, 16],
    n_estimators=[50, 100, 400, 700],
)

# Column 1 of train is used as the target; columns 2 onward are the features.
features, target = train.iloc[:, 2:], train.iloc[:, 1]

# Score every candidate by accuracy under 3-fold cross-validation.
gs = GridSearchCV(rf1, param_grid, scoring='accuracy', cv=3, n_jobs=-1)
gs = gs.fit(features, target)

In [14]:
# Show the best mean cross-validated accuracy, then the parameter
# combination that achieved it. gs.cv_results_ holds the full per-candidate
# table if more detail is needed.
for best_attr in (gs.best_score_, gs.best_params_):
    print(best_attr)

0.826038159371
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


#### Fit model

In [20]:
# Refit on the full training set with the parameters reported as best by
# the grid search (criterion, n_estimators, min_samples_split,
# min_samples_leaf).
# max_features='sqrt' is the setting that 'auto' selected for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
rf1 = RandomForestClassifier(criterion='gini',
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             oob_score=True,   # needed for rf1.oob_score_ below
                             random_state=1,
                             n_jobs=-1)
# Column 1 of train is the target; columns 2 onward are the features.
rf1.fit(train.iloc[:, 2:], train.iloc[:, 1])
# Out-of-bag accuracy estimate. The call form of print works on both
# Python 2 and Python 3 (the statement form is a SyntaxError on Python 3)
# and matches the print(...) calls used elsewhere in this notebook.
print("%.4f" % rf1.oob_score_)

0.8238


#### Obtain cross-validation score with optimal hyperparameters

In [16]:
# Cross-validated accuracy of the tuned model. cv is pinned to 3 folds so
# the result is directly comparable with the cv=3 grid search and does not
# depend on scikit-learn's default, which changed from 3 to 5 folds in 0.22.
scores1 = cross_val_score(rf1, train.iloc[:, 2:], train.iloc[:, 1], cv=3, n_jobs=-1)
scores1.mean()

0.82603815937149283

#### Inspect feature ranking

In [21]:
# Pair each feature column name with its importance from the fitted forest
# and show the 20 most important features, highest first.
feature_names = train.iloc[:, 2:].columns
ranking = pd.DataFrame({'variable': feature_names,
                        'importance': rf1.feature_importances_},
                       columns=['variable', 'importance'])
ranking.sort_values(by='importance', ascending=False).head(20)

Unnamed: 0,variable,importance
1,Fare,0.142931
20,Name_Title_Mr.,0.131256
7,Sex_male,0.111903
2,Name_Len,0.107163
0,Age,0.098652
8,Sex_female,0.090083
4,Pclass_3,0.063281
21,Name_Title_Mrs.,0.037778
22,Name_Title_Miss.,0.035874
12,Cabin_Letter_n,0.034054


#### Generate submission file

In [33]:
# Fill missing test-set fares with the mean fare of the training set.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# is deprecated in pandas 2.x and has no effect under copy-on-write.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [34]:
# Predict with the fitted forest on every test column except column 0,
# which is kept out of the feature matrix.
test_features = test.iloc[:, 1:]
predictions = rf1.predict(test_features)

In [38]:
# Wrap the raw predictions in a one-column frame named 'Survived'.
predictions = pd.DataFrame(data=predictions, columns=['Survived'])

In [40]:
# Put the first test column next to the predictions. concat(axis=1) aligns
# on index labels, so the test column's index is reset to 0..n-1 first; this
# keeps the join positional (row i of test with prediction i) even if the
# preprocessing left test with a non-default index.
predictions = pd.concat((test.iloc[:, 0].reset_index(drop=True), predictions), axis=1)

In [44]:
# Write the submission as a comma-separated file without the row index.
submission_path = os.path.join('submission_files', 'y_test1.csv')
predictions.to_csv(submission_path, sep=",", index=False)