In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

### Random Forest 1

#### Load data

In [23]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

In [24]:
y_train = pd.read_csv('y_train.csv')
del y_train['id']

In [25]:
del X_train['funder']

#### Construct dataset

In [26]:
X_train, X_test = feature_process_helper.dates(X_train, X_test)
X_train, X_test = feature_process_helper.construction(X_train, X_test)
X_train, X_test = feature_process_helper.bools(X_train, X_test)
X_train, X_test = feature_process_helper.locs(X_train, X_test)
X_train, X_test = feature_process_helper.removal(X_train, X_test)
X_train, X_test = feature_process_helper.dummies(X_train, X_test)

In [27]:
rf2 = RandomForestClassifier(criterion='entropy',
                                n_estimators=500,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf2 = rf2.fit(X_train, y_train.values.ravel())
print "%.4f" % rf2.oob_score_ 

0.8081


#### Tune parameters

In [11]:
rf = RandomForestClassifier(n_estimators=50,
                            max_features='auto',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)

Find the right information gain criterion.

In [13]:
param_grid = { "criterion"   : ["gini", "entropy"],}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train.values.ravel())
#scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
#print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

In [14]:
print(gs.best_score_)

0.794191919192


In [15]:
print(gs.best_params_)

{'criterion': 'gini'}


In [17]:
print(gs.grid_scores_)

[mean: 0.79419, std: 0.00238, params: {'criterion': 'gini'}, mean: 0.79306, std: 0.00159, params: {'criterion': 'entropy'}]




Find the right number of minimum samples to split.

In [18]:
param_grid = { "min_samples_split" : [2, 4, 10]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train.values.ravel())
#scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
#print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

In [19]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.grid_scores_)

0.799494949495
{'min_samples_split': 10}
[mean: 0.79419, std: 0.00238, params: {'min_samples_split': 2}, mean: 0.79872, std: 0.00196, params: {'min_samples_split': 4}, mean: 0.79949, std: 0.00139, params: {'min_samples_split': 10}]




Find the right number of minimum samples in a leaf.

In [22]:
rf = RandomForestClassifier(criterion='gini',
                            min_samples_split=10,
                            n_estimators=50,
                            max_features='auto',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)

In [23]:
param_grid = {"min_samples_leaf" : [1,5,10]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train.values.ravel())
#scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
#print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

In [24]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.grid_scores_)

0.799494949495
{'min_samples_leaf': 1}
[mean: 0.79949, std: 0.00139, params: {'min_samples_leaf': 1}, mean: 0.75559, std: 0.00112, params: {'min_samples_leaf': 5}, mean: 0.73729, std: 0.00270, params: {'min_samples_leaf': 10}]




In [25]:
rf = RandomForestClassifier(criterion='gini',
                            n_estimators=50,
                            max_features='auto',
                            oob_score=True,
                            random_state=1,
                            n_jobs=-1)

In [26]:
param_grid = {"min_samples_split" : [10, 12, 16]}

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train.values.ravel())
#scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
#print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

In [27]:
print(gs.best_score_)
print(gs.best_params_)
print(gs.grid_scores_)

0.8
{'min_samples_split': 12}
[mean: 0.79949, std: 0.00139, params: {'min_samples_split': 10}, mean: 0.80000, std: 0.00024, params: {'min_samples_split': 12}, mean: 0.79759, std: 0.00066, params: {'min_samples_split': 16}]




#### Run model with 1000 trees

In [15]:
rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=1000,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf = rf.fit(X_train, y_train.values.ravel())
print "%.4f" % rf.oob_score_ 

0.8092


In [22]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,longitude,0.094178
3,latitude,0.09322
1,gps_height,0.060972
6,population,0.045696
0,date_recorded,0.04287
3731,quantity_group_dry,0.041938
9,construction_year,0.03338
3729,quantity_group_enough,0.023267
3750,waterpoint_type_other,0.014815
11,month_recorded,0.014388


#### Generate submission file

In [16]:
predictions = rf.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test1.csv', sep=",", index = False)

#### Run model with 500 trees

In [17]:
rf2 = RandomForestClassifier(criterion='entropy',
                                n_estimators=500,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf2 = rf2.fit(X_train, y_train.values.ravel())
print "%.4f" % rf2.oob_score_ 

0.8088


In [21]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variable']), 
           pd.DataFrame(rf2.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:20]

Unnamed: 0,variable,importance
2,longitude,0.094282
3,latitude,0.09304
1,gps_height,0.060916
6,population,0.045731
0,date_recorded,0.042778
3731,quantity_group_dry,0.042771
9,construction_year,0.033192
3729,quantity_group_enough,0.023271
3750,waterpoint_type_other,0.01488
11,month_recorded,0.01448


#### Generate submission file

In [18]:
predictions = rf2.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test2.csv', sep=",", index = False)

#### Run model with 500 trees

In [None]:
corrs = joblib.load(os.path.join('pickles', 'rf1_corr_dict.pkl'))

In [7]:
dump = [i.split('&')[1] for i in corrs.keys()]
keep = [i for i in X_train.columns if i not in dump]

In [8]:
X_train = X_train[keep]
X_test = X_test[keep]

In [10]:
[i for i in X_train.columns if i.split('_')[0] == 'amntsh']

['amntsh_[0.2, 50]',
 'amntsh_(50, 250]',
 'amntsh_(250, 1000]',
 'amntsh_(1000, 350000]']

In [12]:
del X_train['amntshnull']
del X_train['amntsh_[0.2, 50]']
del X_train['amntsh_(50, 250]']
del X_train['amntsh_(250, 1000]']
del X_train['amntsh_(1000, 350000]']

In [5]:
np.shape(X_train)

(59400, 3754)

In [5]:
rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf = rf.fit(X_train, y_train.values.ravel())
print "%.4f" % rf.oob_score_ 

0.8044


In [11]:
pd.concat((pd.DataFrame(X_train.columns, columns = ['variable']), 
           pd.DataFrame(rf.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:10]

Unnamed: 0,variable,importance
2,longitude,0.093689
3,latitude,0.092972
1,gps_height,0.060926
6,population,0.045784
0,date_recorded,0.042727
3731,quantity_group_dry,0.041852
9,construction_year,0.038812
3729,quantity_group_enough,0.02335
3750,waterpoint_type_other,0.015246
11,month_recorded,0.014349


#### Save model to pickle

In [None]:
joblib.dump(rf, os.path.join('pickles', 'rf1.pkl'), 9)

#### Get model cross validation score

In [77]:
scores = cross_val_score(rf, X_train, y_train.values.ravel(), n_jobs=-1)
scores.mean()

0.80035353535353548

#### Save cross validation score to pickle

In [78]:
joblib.dump(scores, os.path.join('pickles', 'rf1_cv.pkl'), 9)

['pickles\\rf1_cv.pkl']

In [73]:
def meaningful():
    global X_train, X_test
    status = pd.get_dummies(y_train['status_group'])
    good_cols = []
    for i in X_train.columns[12:]:
        if status[X_train[i] == 1]['functional'].mean() > (status['functional'].mean() + .0510):
            good_cols.append(i)
        elif status[X_train[i] == 1]['functional needs repair'].mean() > (status['functional needs repair'].mean() + .0510):
            good_cols.append(i)
        elif status[X_train[i] == 1]['non functional'].mean() > (status['non functional'].mean() + .0510):
            good_cols.append(i)
    X_train2 = pd.concat((X_train.iloc[:, :12], X_train[good_cols]), axis = 1)
    X_test2 = pd.concat((X_test.iloc[:, :12], X_test[good_cols]), axis = 1)
    return X_train2, X_test2

In [74]:
X_train2, X_test2 = meaningful()

### Random Forest  2

In [14]:
X_train, X_test = feature_process_helper.dates(X_train, X_test)
X_train, X_test = feature_process_helper.construction(X_train, X_test)
X_train, X_test = feature_process_helper.bools(X_train, X_test)
X_train, X_test = feature_process_helper.locs(X_train, X_test)
X_train, X_test = feature_process_helper.removal(X_train, X_test)
X_train, X_test = feature_process_helper.dummies(X_train, X_test)

In [76]:
corrs = joblib.load(os.path.join('pickles', 'rf1_corr_dict.pkl'))

In [77]:
dump = [i.split('&')[1] for i in corrs.keys()]
keep = [i for i in X_train.columns if i not in dump]

In [78]:
X_train2 = X_train2[keep]
X_test2 = X_test2[keep]

In [79]:
len(X_train2.columns)

3556

In [None]:
rf2 = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf2 = rf2.fit(X_train2, y_train.values.ravel())
print "%.4f" % rf2.oob_score_ 

In [25]:
predictions = rf2.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test1.csv', sep=",", index = False)

In [123]:
smalls = []
for i in X_train.columns:
    if sum(X_train[i]) < 50:
        smalls.append(i)

In [127]:
keep = [i for i in X_train.columns if i not in smalls]

In [128]:
X_train = X_train[keep]
X_test = X_test[keep]

In [3]:
X_train = pd.read_csv('X_train2.csv')

In [7]:
np.shape(X_train)

(59400, 684)

In [85]:
param_range = [2, 50, 100]

param_grid = [{'min_samples_split': param_range}]

gs = GridSearchCV(estimator=rf2,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

ValueError: Cannnot send %Id bytes over connection