In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
%matplotlib inline



# Model 36

#### Load data and transform variables

In [2]:
# Read the raw feature tables and the training labels from the working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
# Drop the 'id' column so only the label column(s) remain in y_train.
y_train = y_train.drop('id', axis=1)

# Feature-engineering steps from feature_process_helper (defined outside this
# notebook). Each takes and returns the (train, test) pair; they are applied
# strictly in the order listed.
for transform in (feature_process_helper.dates,
                  feature_process_helper.dates2,
                  feature_process_helper.construction,
                  feature_process_helper.bools,
                  feature_process_helper.locs):
    X_train, X_test = transform(X_train, X_test)

# Log-transform population in both frames.
# NOTE(review): np.log maps 0 to -inf; presumably the earlier helpers leave no
# zero populations -- confirm against feature_process_helper.
for frame in (X_train, X_test):
    frame['population'] = np.log(frame['population'])

for transform in (feature_process_helper.removal2,
                  feature_process_helper.small_n2):
    X_train, X_test = transform(X_train, X_test)

# The lda helper additionally receives the labels and the three geographic columns.
X_train, X_test = feature_process_helper.lda(
    X_train, X_test, y_train, cols=['gps_height', 'latitude', 'longitude'])
X_train, X_test = feature_process_helper.dummies(X_train, X_test)

In [3]:
# Number of feature columns in the processed training frame.
X_train.shape[1]

538

#### Tune hyperparameters
(Note: `min_samples_split` was tuned separately, and 6 was found to be the optimal value.)

In [9]:
# Base forest for the search. min_samples_split=6 is fixed here, and
# random_state=1 keeps the runs repeatable.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' is deprecated
# and later removed in scikit-learn, so the explicit spelling keeps this cell
# runnable on both old and new versions without changing the model.
rf = RandomForestClassifier(criterion='gini',
                            max_features='sqrt',
                            min_samples_split=6,
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)

# Only the number of trees is searched in this cell.
param_grid = {"n_estimators": [500, 750, 1000]}

# 2-fold cross-validated grid search scored on accuracy.
gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

# y_train is a one-column DataFrame; flatten it to the 1-D label array that
# scikit-learn expects.
gs = gs.fit(X_train, y_train.values.ravel())

In [10]:
# Report, in order: the best mean cross-validated accuracy, the parameter
# setting that achieved it, and the full per-candidate results dictionary.
for summary in (gs.best_score_, gs.best_params_, gs.cv_results_):
    print(summary)

0.801397306397
{'n_estimators': 1000}
{'std_train_score': array([ 0.0007768 ,  0.00074312,  0.00067579]), 'rank_test_score': array([3, 2, 1]), 'mean_score_time': array([  7.80649996,  21.57499993,  13.17700005]), 'std_test_score': array([ 0.00115149,  0.00106731,  0.00099995]), 'split1_train_score': array([ 0.92976667,  0.92996869,  0.92980034]), 'split0_test_score': array([ 0.80202687,  0.80202687,  0.80239723]), 'mean_test_score': array([ 0.80087542,  0.8009596 ,  0.80139731]), 'split0_train_score': array([ 0.92821307,  0.92848244,  0.92844877]), 'params': ({'n_estimators': 500}, {'n_estimators': 750}, {'n_estimators': 1000}), 'std_fit_time': array([ 1.36600018,  1.45599997,  0.09599996]), 'std_score_time': array([ 0.72249985,  0.11399996,  1.38400006]), 'param_n_estimators': masked_array(data = [500 750 1000],
             mask = [False False False],
       fill_value = ?)
, 'mean_train_score': array([ 0.92898987,  0.92922556,  0.92912456]), 'mean_fit_time': array([ 221.14199996,  3

#### Fit the model

In [11]:
# Final model: same settings as the tuned base forest, with n_estimators=1000
# as selected by the grid search.
# max_features='sqrt' is the explicit equivalent of the deprecated 'auto'
# setting for classifiers, so the fitted model is unchanged.
rf36 = RandomForestClassifier(criterion='gini',
                              min_samples_split=6,
                              n_estimators=1000,
                              max_features='sqrt',
                              oob_score=True,
                              random_state=1,
                              n_jobs=-1)

# Fit on the full training set; labels are flattened from a one-column frame.
rf36.fit(X_train, y_train.values.ravel())

# Out-of-bag score of the fitted forest. The call form of print works on both
# Python 2 and Python 3 and matches the print(...) calls used elsewhere in
# this notebook.
print("%.4f" % rf36.oob_score_)

0.8119


#### Inspect feature importance

In [12]:
# Pair every training column with its importance from the fitted forest, then
# show the ten most important features (highest importance first).
importance_table = pd.DataFrame(
    {'variable': X_train.columns, 'importance': rf36.feature_importances_},
    columns=['variable', 'importance'],
)
importance_table.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,variable,importance
1,1,0.077294
517,quantity_group_dry,0.074886
0,0,0.073866
2,date_recorded,0.048087
8,construction_year,0.043055
5,population,0.039204
515,quantity_group_enough,0.030397
535,waterpoint_type_other,0.025788
471,extraction_type_other,0.021459
481,extraction_type_class_other,0.019897


#### Generate submission file

In [13]:
# Predict labels for the processed test features.
predictions = rf36.predict(X_test)

# y_test.csv is used as the submission template: the name of its second column
# is reused for the predictions, and its existing 'status_group' column is
# replaced by them.
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns=[y_test.columns[1]])
del y_test['status_group']

# pd.concat(axis=1) aligns on the index and would silently pad with NaN if the
# template and the predictions had different row counts, so fail loudly instead.
assert len(pred) == len(y_test), (
    "prediction count (%d) does not match template rows (%d)"
    % (len(pred), len(y_test)))
y_test = pd.concat((y_test, pred), axis=1)

# Create the output directory if needed so the write cannot fail after the
# long model fit merely because the folder is missing.
submission_dir = 'submission_files'
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)
y_test.to_csv(os.path.join(submission_dir, 'y_test36.csv'), sep=",", index=False)