In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import feature_process_helper
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

### Random Forest 1

#### Load data

In [2]:
X_train = pd.read_csv('X_train.csv')
y_train = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

In [3]:
y_train = pd.read_csv('y_train.csv')
del y_train['id']

#### Construct dataset

In [4]:
X_train, X_test = feature_process_helper.dates(X_train, X_test)
X_train, X_test = feature_process_helper.construction(X_train, X_test)
X_train, X_test = feature_process_helper.bools(X_train, X_test)
X_train, X_test = feature_process_helper.locs(X_train, X_test)
X_train, X_test = feature_process_helper.removal(X_train, X_test)
X_train, X_test = feature_process_helper.dummies(X_train, X_test)

#### Tune parameters

In [5]:
rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)

In [15]:
param_range = [2, 50, 100]

param_grid = [{'min_samples_split': param_range}]

gs = GridSearchCV(estimator=rf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=5,
                  n_jobs=-1)

gs = gs.fit(X_train, y_train.values.ravel())
#scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
#print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

In [16]:
print gs.best_score_

0.802441077441


#### Run model with 1000 trees

In [8]:
rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=1000,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf = rf.fit(X_train, y_train.values.ravel())
print "%.4f" % rf.oob_score_ 

0.8092


In [6]:
corrs = joblib.load(os.path.join('pickles', 'rf1_corr_dict.pkl'))

In [7]:
dump = [i.split('&')[1] for i in corrs.keys()]
keep = [i for i in X_train.columns if i not in dump]

In [8]:
X_train = X_train[keep]
X_test = X_test[keep]

In [9]:
np.shape(X_train)

(59400, 3753)

In [10]:
rf = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf = rf.fit(X_train, y_train.values.ravel())
print "%.4f" % rf.oob_score_ 

0.8068


#### Save model to pickle

In [None]:
joblib.dump(rf, os.path.join('pickles', 'rf1.pkl'), 9)

#### Get model cross validation score

In [77]:
scores = cross_val_score(rf, X_train, y_train.values.ravel(), n_jobs=-1)
scores.mean()

0.80035353535353548

#### Save cross validation score to pickle

In [78]:
joblib.dump(scores, os.path.join('pickles', 'rf1_cv.pkl'), 9)

['pickles\\rf1_cv.pkl']

#### Generate submission file

In [9]:
predictions = rf.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test1.csv', sep=",", index = False)

In [73]:
def meaningful():
    global X_train, X_test
    status = pd.get_dummies(y_train['status_group'])
    good_cols = []
    for i in X_train.columns[12:]:
        if status[X_train[i] == 1]['functional'].mean() > (status['functional'].mean() + .0510):
            good_cols.append(i)
        elif status[X_train[i] == 1]['functional needs repair'].mean() > (status['functional needs repair'].mean() + .0510):
            good_cols.append(i)
        elif status[X_train[i] == 1]['non functional'].mean() > (status['non functional'].mean() + .0510):
            good_cols.append(i)
    X_train2 = pd.concat((X_train.iloc[:, :12], X_train[good_cols]), axis = 1)
    X_test2 = pd.concat((X_test.iloc[:, :12], X_test[good_cols]), axis = 1)
    return X_train2, X_test2

In [74]:
X_train2, X_test2 = meaningful()

### Random Forest  2

In [14]:
X_train, X_test = feature_process_helper.dates(X_train, X_test)
X_train, X_test = feature_process_helper.construction(X_train, X_test)
X_train, X_test = feature_process_helper.bools(X_train, X_test)
X_train, X_test = feature_process_helper.locs(X_train, X_test)
X_train, X_test = feature_process_helper.removal(X_train, X_test)
X_train, X_test = feature_process_helper.dummies(X_train, X_test)

In [76]:
corrs = joblib.load(os.path.join('pickles', 'rf1_corr_dict.pkl'))

In [77]:
dump = [i.split('&')[1] for i in corrs.keys()]
keep = [i for i in X_train.columns if i not in dump]

In [78]:
X_train2 = X_train2[keep]
X_test2 = X_test2[keep]

In [79]:
len(X_train2.columns)

3556

In [None]:
rf2 = RandomForestClassifier(criterion='entropy',
                                n_estimators=100,
                                max_features='auto',
                                oob_score=True,
                                random_state=1,
                                n_jobs=-1)
                            
rf2 = rf2.fit(X_train2, y_train.values.ravel())
print "%.4f" % rf2.oob_score_ 

In [25]:
predictions = rf2.predict(X_test)
y_test = pd.read_csv('y_test.csv')
pred = pd.DataFrame(predictions, columns = [y_test.columns[1]])
del y_test['status_group']
y_test = pd.concat((y_test, pred), axis = 1)
y_test.to_csv('y_test1.csv', sep=",", index = False)

In [123]:
smalls = []
for i in X_train.columns:
    if sum(X_train[i]) < 50:
        smalls.append(i)

In [127]:
keep = [i for i in X_train.columns if i not in smalls]

In [128]:
X_train = X_train[keep]
X_test = X_test[keep]

In [3]:
X_train = pd.read_csv('X_train2.csv')

In [7]:
np.shape(X_train)

(59400, 684)

In [85]:
param_range = [2, 50, 100]

param_grid = [{'min_samples_split': param_range}]

gs = GridSearchCV(estimator=rf2,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2,
                  n_jobs=-1)

scores = cross_val_score(gs, X_train, y_train.values.ravel(), scoring='accuracy', cv=5)
print('CV accuracy: %.4f +/- %.4f' % (np.mean(scores), np.std(scores)))

ValueError: Cannnot send %Id bytes over connection