In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

## RF 18

#### Load data & transform variables

In [11]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'SibSp', 'Parch'])

#### Tune hyper-parameters

In [5]:
rf14 = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf14,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [6]:
print(gs.best_score_)
print(gs.best_params_)
#print(gs.cv_results_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


#### Fit model

In [7]:
rf14 = RandomForestClassifier(criterion='gini', 
                             n_estimators=400,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf14.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf14.oob_score_ 

0.8350


#### Obtain cross-validation score with optimal hyperparameters

In [4]:
scores1 = cross_val_score(rf14, train.iloc[:, 2:], train.iloc[:, 1], n_jobs=-1)
scores1.mean()

0.83838383838383834

#### Inspect feature ranking

In [8]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf14.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.113751
1,Fare,0.107715
33,Name_Title_Mr.,0.105001
0,Age,0.102882
11,Sex_male,0.086788
12,Sex_female,0.078397
7,Ticket_Len,0.03797
8,Pclass_3,0.036294
35,Name_Title_Miss.,0.029315
41,Fam_Size_Big,0.024041


#### Generate submission file

In [9]:
predictions = rf14.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test18.csv'), sep=",", index = False)

Leaderboard score: 0.77990  

## RF 19

In [27]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [18]:
np.shape(train)

(891, 37)

In [19]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [20]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [28]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8215


In [None]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

In [29]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_testss.csv'), sep=",", index = False)

score: 0.79904

## RF 20

In [24]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.lda(train, test, train.iloc[:, 1])
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [25]:
np.shape(train)

(891, 36)

In [26]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [27]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [28]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8283


In [29]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
10,Sex_male,0.140371
11,Sex_female,0.138155
5,0,0.134577
23,Name_Title_Mr.,0.106221
0,Name_Len,0.08159
7,Pclass_3,0.053532
6,Ticket_Len,0.040853
15,Cabin_Letter_n,0.040356
33,Fam_Size_Big,0.03024
31,Fam_Size_Nuclear,0.028742


In [30]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test20.csv'), sep=",", index = False)

In [33]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [34]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test22.csv'), sep=",", index = False)

## RF 23

In [36]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [37]:
np.shape(train)

(891, 34)

In [38]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [39]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 5}


In [40]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [41]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
9,Sex_female,0.177362
21,Name_Title_Mr.,0.14072
1,Fare,0.106389
8,Sex_male,0.090796
0,Age,0.075417
2,Name_Len,0.06748
5,Pclass_3,0.061468
31,Fam_Size_Big,0.034148
6,Pclass_1,0.033386
13,Cabin_Letter_n,0.031946


In [42]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test23.csv'), sep=",", index = False)

score: 0.80383

## RF 24

In [44]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
for i in [train, test]:
    i['Fam_Size'] = i['SibSp']+i['Parch']
    i['Ticket_Len'] = i['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'SibSp', 'Parch'])

In [45]:
np.shape(train)

(891, 32)

In [46]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [47]:
print(gs.best_score_)
print(gs.best_params_)

0.829405162738
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


In [48]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8227


In [49]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
9,Sex_male,0.116926
10,Sex_female,0.115533
1,Fare,0.113527
22,Name_Title_Mr.,0.101934
0,Age,0.095598
2,Name_Len,0.092359
6,Pclass_3,0.058144
4,Fam_Size,0.052039
5,Ticket_Len,0.044522
14,Cabin_Letter_n,0.035665


In [50]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test24.csv'), sep=",", index = False)

score: 0.77

## RF 24

In [53]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [54]:
np.shape(train)

(891, 43)

In [55]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [56]:
print(gs.best_score_)
print(gs.best_params_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 700, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [57]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=700,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8384


In [58]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
1,Fare,0.12746
0,Age,0.1252
2,Name_Len,0.122841
30,Name_Title_Mr.,0.082411
8,Sex_male,0.074802
9,Sex_female,0.072543
4,Ticket_Len,0.043962
5,Pclass_3,0.036189
22,Cabin_Letter_n,0.024189
38,Fam_Size_Nuclear,0.022195


In [59]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test24.csv'), sep=",", index = False)

score: 0.77033

## RF 25

In [None]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [60]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=500,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [61]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test25.csv'), sep=",", index = False)

score: 0.81340

## RF 26

In [62]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8350


In [63]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test26.csv'), sep=",", index = False)

score: 0.77990

## RF 27

In [4]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [5]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [6]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [9]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=500,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8283


In [10]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test28.csv'), sep=",", index = False)

score: .77

## RF 29

In [20]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [25]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [26]:
print(gs.best_score_)
print(gs.best_params_)

0.83950617284
{'min_samples_split': 4, 'n_estimators': 700, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [23]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=700,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [24]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test34.csv'), sep=",", index = False)

score: 0.81340

## RF 25

In [None]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [60]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=500,
                             min_samples_split=12,
                             min_samples_leaf=5,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [61]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test25.csv'), sep=",", index = False)

score: 0.81340

## RF 36

In [32]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(train['Age'].mean())
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [18]:
np.shape(train)

(891, 37)

In [33]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [34]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 16, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [36]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=1000,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [37]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
23,Name_Title_Mr.,0.112752
1,Fare,0.111009
10,Sex_male,0.100946
11,Sex_female,0.100348
2,Name_Len,0.094834
0,Age,0.084043
6,Ticket_Len,0.050457
7,Pclass_3,0.049251
15,Cabin_Letter_n,0.032286
25,Name_Title_Miss.,0.030424


In [38]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test36.csv'), sep=",", index = False)

score: 0.79904

## RF 37

In [39]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [40]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [41]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [42]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8272


In [43]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test37.csv'), sep=",", index = False)

score: .77

## RF 38

In [46]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [51]:
np.shape(train)

(891, 49)

In [47]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [48]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 12, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [49]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=1000,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [52]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
36,Name_Title_Mr.,0.124768
10,Sex_female,0.123612
9,Sex_male,0.114358
1,Name_Len,0.097597
0,Fare,0.096677
6,Pclass_3,0.045402
38,Name_Title_Miss.,0.033037
5,Ticket_Len,0.032207
28,Cabin_Letter_n,0.0304
37,Name_Title_Mrs.,0.029873


In [50]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test38.csv'), sep=",", index = False)

score: 0.80861

## RF 39

In [54]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
#test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [51]:
np.shape(train)

(891, 49)

In [55]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [56]:
print(gs.best_score_)
print(gs.best_params_)

0.846240179574
{'min_samples_split': 12, 'n_estimators': 100, 'criterion': 'gini', 'min_samples_leaf': 1}


In [57]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=100,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [58]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
31,Name_Title_Mr.,0.12775
9,Sex_male,0.117048
10,Sex_female,0.109127
1,Name_Len,0.103448
0,Fare,0.099746
6,Pclass_3,0.042916
32,Name_Title_Mrs.,0.03182
5,Ticket_Len,0.031763
23,Cabin_Letter_n,0.031427
33,Name_Title_Miss.,0.031422


In [59]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test39.csv'), sep=",", index = False)

score: 0.80383

## RF 40

In [75]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
#train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [76]:
np.shape(train)

(891, 46)

In [77]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [78]:
print(gs.best_score_)
print(gs.best_params_)

0.845117845118
{'min_samples_split': 12, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [79]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8361


In [80]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
7,Sex_female,0.139713
33,Name_Title_Mr.,0.114777
1,Name_Len,0.09893
6,Sex_male,0.092425
0,Fare,0.090744
3,Pclass_3,0.045756
2,Ticket_Len,0.038454
34,Name_Title_Mrs.,0.038107
35,Name_Title_Miss.,0.032036
25,Cabin_Letter_n,0.031778


In [81]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test40.csv'), sep=",", index = False)

score: 0.80383

## RF 41

In [2]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
del train['Age']
del test['Age']
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [3]:
np.shape(train)

(891, 49)

In [4]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [5]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 12, 'n_estimators': 1000, 'criterion': 'gini', 'min_samples_leaf': 1}


In [7]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=1000,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8339


In [8]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
36,Name_Title_Mr.,0.124768
10,Sex_female,0.123612
9,Sex_male,0.114358
1,Name_Len,0.097597
0,Fare,0.096677
6,Pclass_3,0.045402
38,Name_Title_Miss.,0.033037
5,Ticket_Len,0.032207
28,Cabin_Letter_n,0.0304
37,Name_Title_Mrs.,0.029873


In [9]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test41.csv'), sep=",", index = False)

score: 0.79904

## RF 42

In [22]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)

In [23]:
from sklearn.preprocessing import StandardScaler
def scale(train, test, cols=['Age', 'Fare']):
    for i in cols:
        sc = StandardScaler()
        train[i] = sc.fit_transform(train[i].reshape(-1, 1))
        test[i] = sc.transform(test[i].reshape(-1, 1))
    return train, test

In [24]:
train, test = scale(train, test)

In [27]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = scale(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)

train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [28]:
np.shape(train)

(891, 51)

In [29]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [30]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [31]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8260


In [32]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.117427
0,Age,0.110091
1,Fare,0.107424
38,Name_Title_Mr.,0.097366
11,Sex_male,0.076799
12,Sex_female,0.07153
8,Pclass_3,0.04785
40,Name_Title_Miss.,0.038935
7,Ticket_Len,0.03587
30,Cabin_Letter_n,0.032825


In [33]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test42.csv'), sep=",", index = False)

score: 0.76

## RF 43

In [34]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
#train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [35]:
np.shape(train)

(891, 51)

In [36]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [37]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [38]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [39]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,Name_Len,0.129119
1,Fare,0.122111
0,Age,0.115754
38,Name_Title_Mr.,0.084287
12,Sex_female,0.06626
11,Sex_male,0.062517
7,Ticket_Len,0.04347
8,Pclass_3,0.042318
40,Name_Title_Miss.,0.034178
30,Cabin_Letter_n,0.025585


In [40]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test43.csv'), sep=",", index = False)

score: 0.77

## RF 44

In [42]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
#train, test = feature_process_helper.age_impute(train, test)
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.titles_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket', 'Age'])

In [43]:
np.shape(train)

(891, 47)

In [44]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [45]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 10, 'n_estimators': 100, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [46]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=100,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [47]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
0,Fare,0.131729
1,Name_Len,0.12517
36,Name_Title_Mr.,0.110004
10,Sex_female,0.084183
9,Sex_male,0.084101
6,Pclass_3,0.047644
5,Ticket_Len,0.039481
28,Cabin_Letter_n,0.035526
37,Name_Title_Mrs.,0.031796
38,Name_Title_Miss.,0.030978


In [48]:
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test44.csv'), sep=",", index = False)

score: 0.76

## RF 45

In [66]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [69]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [67]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [75]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
12,Sex_female,0.111215
11,Sex_male,0.109769
33,Name_Title_Mr.,0.109746
1,Fare,0.088209
2,Name_Len,0.087904
0,Age,0.078651
8,Pclass_3,0.043268
35,Name_Title_Miss.,0.031292
7,Ticket_Len,0.031079
34,Name_Title_Mrs.,0.028852


In [57]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test45.csv'), sep=",", index = False)

score:  0.79426

## RF 46

In [60]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train['Ticket_Lett'] = train['Ticket'].apply(lambda x: str(x)[0])
test['Ticket_Lett'] = test['Ticket'].apply(lambda x: str(x)[0])
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [61]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [62]:
print(gs.best_score_)
print(gs.best_params_)

0.840628507295
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [63]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [64]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test46.csv'), sep=",", index = False)

score:  0.77

## RF 47

In [118]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [77]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [78]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [79]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:23, 0]

In [119]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [112]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [113]:
print(gs.best_score_)
print(gs.best_params_)

0.846240179574
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [115]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8316


In [116]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
3,Fare,0.129091
5,Age,0.11407
4,Name_Len,0.103566
2,Name_Title_Mr.,0.097082
0,Sex_female,0.096812
1,Sex_male,0.096616
6,Pclass_3,0.046606
8,Ticket_Len,0.045846
10,Cabin_Letter_n,0.032464
13,Pclass_1,0.025647


In [120]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test47.csv'), sep=",", index = False)

score:  

## RF 48

In [126]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [127]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [128]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [129]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:30, 0]

In [130]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [131]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [132]:
print(gs.best_score_)
print(gs.best_params_)

0.841750841751
{'min_samples_split': 4, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [133]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=4,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8328


In [134]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
2,Name_Title_Mr.,0.137337
5,Age,0.11189
4,Name_Len,0.110997
3,Fare,0.103998
1,Sex_male,0.092999
0,Sex_female,0.069961
6,Pclass_3,0.050742
8,Ticket_Len,0.040917
12,Fam_Size_Nuclear,0.024748
11,Fam_Size_Big,0.023612


In [135]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test48.csv'), sep=",", index = False)

score:  

## RF 49

In [136]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked', 'Ticket_Lett',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
#train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [137]:
len(train.columns)

46

In [54]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [55]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 700, 'criterion': 'gini', 'min_samples_leaf': 1}


In [138]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=700,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [140]:
feats = pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False).iloc[:10, 0]

In [141]:
train = pd.concat((train.iloc[:, :2], train[feats]), axis=1)
test = pd.concat((test.iloc[:, 0], test[feats]), axis=1)

In [142]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [144]:
print(gs.best_score_)
print(gs.best_params_)

0.838383838384
{'min_samples_split': 10, 'n_estimators': 50, 'criterion': 'gini', 'min_samples_leaf': 1}


In [145]:
rf = RandomForestClassifier(criterion='gini', 
                             n_estimators=50,
                             min_samples_split=10,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8373


In [148]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
3,Fare,0.179483
2,Name_Title_Mr.,0.140746
1,Sex_male,0.13494
5,Age,0.132249
4,Name_Len,0.12142
0,Sex_female,0.096419
6,Pclass_3,0.085237
8,Ticket_Len,0.068108
7,Name_Title_Miss.,0.022077
9,Name_Title_Mrs.,0.019321


In [149]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test49.csv'), sep=",", index = False)

score:  

## RF 50

In [151]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [152]:
len(train.columns)

37

In [153]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [154]:
print(gs.best_score_)
print(gs.best_params_)

0.83164983165
{'min_samples_split': 16, 'n_estimators': 50, 'criterion': 'entropy', 'min_samples_leaf': 1}


In [155]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=50,
                             min_samples_split=16,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8215


In [156]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
11,Sex_male,0.121257
24,Name_Title_Mr.,0.115977
1,Fare,0.098159
0,Age,0.087345
2,Name_Len,0.08623
12,Sex_female,0.077304
8,Pclass_3,0.048533
7,Ticket_Len,0.046897
25,Name_Title_Mrs.,0.03704
9,Pclass_1,0.030009


In [157]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test50.csv'), sep=",", index = False)

score:  

## RF 51

In [158]:
train = pd.read_csv(os.path.join('data', 'train.csv'))
test = pd.read_csv(os.path.join('data', 'test.csv'))
train, test = feature_process_helper.names(train, test)
train, test = feature_process_helper.age_impute(train, test)
train, test = feature_process_helper.cabin_num(train, test)
train, test = feature_process_helper.cabin(train, test)
train, test = feature_process_helper.embarked_impute(train, test)
train, test = feature_process_helper.fam_size(train, test)
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train['Ticket_Len'] = train['Ticket'].apply(lambda x: len(x))
#test['Ticket_Len'] = test['Ticket'].apply(lambda x: len(x))
#train, test = feature_process_helper.ticket_grouped(train, test)
train, test = feature_process_helper.dummies(train, test, columns = ['Pclass', 'Sex', 'Embarked',
                                                                     'Cabin_Letter', 'Name_Title', 'Fam_Size'])
train, test = feature_process_helper.drop(train, test, bye = ['Ticket'])

In [159]:
len(train.columns)

36

In [160]:
rf = RandomForestClassifier(max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

param_grid = { "criterion"   : ["gini", "entropy"],
             "min_samples_leaf" : [1,5,10],
             "min_samples_split" : [2, 4, 10, 12, 16],
             "n_estimators": [50, 100, 400, 700, 1000]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

gs = gs.fit(train.iloc[:, 2:], train.iloc[:, 1])

In [161]:
print(gs.best_score_)
print(gs.best_params_)

0.832772166105
{'min_samples_split': 12, 'n_estimators': 400, 'criterion': 'gini', 'min_samples_leaf': 1}


In [162]:
rf = RandomForestClassifier(criterion='entropy', 
                             n_estimators=400,
                             min_samples_split=12,
                             min_samples_leaf=1,
                             max_features='auto',
                             oob_score=True,
                             random_state=1,
                             n_jobs=-1)
rf.fit(train.iloc[:, 2:], train.iloc[:, 1])
print "%.4f" % rf.oob_score_ 

0.8294


In [163]:
pd.concat((pd.DataFrame(train.iloc[:, 2:].columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
1,Fare,0.130523
2,Name_Len,0.110326
0,Age,0.107603
23,Name_Title_Mr.,0.098299
10,Sex_male,0.095867
11,Sex_female,0.094893
7,Pclass_3,0.049144
15,Cabin_Letter_n,0.032542
25,Name_Title_Miss.,0.029018
24,Name_Title_Mrs.,0.028046


In [164]:
test['Fare'].fillna(train['Fare'].mean(), inplace = True)
predictions = rf.predict(test.iloc[:, 1:])
predictions = pd.DataFrame(predictions, columns=['Survived'])
predictions = pd.concat((test.iloc[:, 0], predictions), axis = 1)
predictions.to_csv(os.path.join('submission_files', 'y_test51.csv'), sep=",", index = False)

score:  