In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

# RF Model 1

#### Load data

In [2]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

#### Transform variables

In [3]:
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.dummies(train, test)
train, test = feature_process_helper.drop(train, test)

#### Tune hyper-parameters

In [4]:
rf1 = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

In [6]:
param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700]}

gs = GridSearchCV(estimator=rf1,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [7]:
print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.826038159371
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


#### Fit model

In [9]:
rf1 = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf1.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf1.oob_score_ 

0.8238


#### Obtain cross-validation score with optimal hyperparameters

In [16]:
scores1 = cross_val_score(rf1, train.iloc[:, 2:], train.iloc[:, 1], n_jobs=-1)
scores1.mean()

0.82603815937149283

#### Inspect feature ranking

In [21]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf1.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
1,Fare,0.142931
20,Name_Title_Mr.,0.131256
7,Sex_male,0.111903
2,Name_Len,0.107163
0,Age,0.098652
8,Sex_female,0.090083
4,Pclass_3,0.063281
21,Name_Title_Mrs.,0.037778
22,Name_Title_Miss.,0.035874
12,Cabin_Letter_n,0.034054


#### Generate submission file

In [None]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf1.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test1.csv'), sep=",", index = False)

# RF Model 2

#### Load data

In [4]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

#### Transform variables

In [5]:
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Cabin_Letter', 'Name_Title',
                                                                    'SibSp', 'Parch'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

#### Tune hyperparameters

In [6]:
rf1 = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

In [7]:
param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700]}

gs = GridSearchCV(estimator=rf1,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [8]:
print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.824915824916
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


#### Fit model

In [9]:
rf2 = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf2.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf2.oob_score_ 

0.8238


#### Obtain cross-validation score with optimal hyperparameters

In [10]:
scores2 = cross_val_score(rf1, train.iloc[:, 2:], train.iloc[:, 1], n_jobs=-1)
scores2.mean()

0.80359147025813693

#### Inspect feature ranking

In [12]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf2.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
20,Name_Title_Mr.,0.116731
7,Sex_male,0.115737
8,Sex_female,0.112738
2,Name_Len,0.105312
1,Fare,0.103418
0,Age,0.084431
4,Pclass_3,0.062422
22,Name_Title_Miss.,0.035632
21,Name_Title_Mrs.,0.032156
5,Pclass_1,0.024577


#### Generate submission file

In [13]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf2.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test1.csv'), sep=",", index = False)

In [15]:
train.columns

Index([u'PassengerId', u'Survived', u'Age', u'Fare', u'Name_Len',
       u'Age_Null_Flag', u'Pclass_3', u'Pclass_1', u'Pclass_2', u'Sex_male',
       u'Sex_female', u'Embarked_S', u'Embarked_C', u'Embarked_Q',
       u'Cabin_Letter_n', u'Cabin_Letter_C', u'Cabin_Letter_E',
       u'Cabin_Letter_G', u'Cabin_Letter_D', u'Cabin_Letter_A',
       u'Cabin_Letter_B', u'Cabin_Letter_F', u'Name_Title_Mr.',
       u'Name_Title_Mrs.', u'Name_Title_Miss.', u'Name_Title_Master.',
       u'Name_Title_Rev.', u'Name_Title_Dr.', u'Name_Title_Ms.',
       u'Name_Title_Col.', u'SibSp_1', u'SibSp_0', u'SibSp_3', u'SibSp_4',
       u'SibSp_2', u'SibSp_5', u'SibSp_8', u'Parch_0', u'Parch_1', u'Parch_2',
       u'Parch_5', u'Parch_3', u'Parch_4', u'Parch_6'],
      dtype='object')

# Model 3

In [10]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))

In [11]:
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.dummies(train, test)
train, test = feature_process_helper.drop(train, test)

In [12]:
y_train = train.iloc[:, 1]

In [13]:
del train['Survived']

In [15]:
rf3 = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

In [16]:
param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700]}

gs = GridSearchCV(estimator=rf3,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train, y_train)

In [17]:
print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.83164983165
{'min_samples_split': 10, 'n_estimators': 400, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [20]:
rf3 = RandomForestClassifier(min_samples_split=10, 
                             n_estimators=400, 
                             criterion='entropy', 
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf3.fit(train, y_train)
print "%.4f" % rf3.oob_score_ 

0.8249


In [21]:
scores3 = cross_val_score(rf3, train, y_train, n_jobs=-1)
scores3.mean()

0.83164983164983164

In [22]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf3.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Age_Null_Flag,0.133448
3,Pclass_3,0.107066
0,Fare,0.10365
1,Name_Len,0.103143
21,Name_Title_Miss.,0.099552
9,Embarked_C,0.092503
8,Embarked_S,0.087784
5,Pclass_2,0.052513
13,Cabin_Letter_E,0.033119
6,Sex_male,0.026176


In [32]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf3.predict(test)
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test3.csv'), sep=",", index = False)