In [1]:
from data_loader_df import *
import sys
import numpy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pylab as plt

In [2]:
train_file_path = "../data/adult.data"
test_file_path = "../data/adult.test"
train_X, train_Y, _, _, test_X, test_Y = load_all_data(train_file_path, test_file_path, valid_rate=0, is_df=False, norm=False, one_hot=False)

In [43]:
print train_X[0]
print test_X[0]
print train_X.shape, test_X.shape

[62 ' Local-gov' 68268 ' HS-grad' 9 ' Married-civ-spouse'
 ' Transport-moving' ' Husband' ' White' ' Male' 0 0 40 ' United-States']
[25 ' Private' 226802 ' 11th' 7 ' Never-married' ' Machine-op-inspct'
 ' Own-child' ' Black' ' Male' 0 0 40 ' United-States']
(30162, 14) (15060, 14)


In [3]:
#For trees, we should use label encoding rather than one hot to transfrom category feature
#preprocess: combine all the data to do label encoding
dataset = np.concatenate((train_X, test_X), axis=0)
#trasfrom all features if needed
for j in xrange(14):#14 features in total
    if type(dataset[0][j]) == str:
        labelencoder = LabelEncoder()
        dataset[:, j] = labelencoder.fit_transform(dataset[:, j])

In [45]:
#Test! if data is well transformed
print dataset[0]
print dataset.shape

[62 1 68268 11 9 2 13 0 4 1 0 0 40 38]
(45222, 14)


In [4]:
#split dataset back to train, validation, test
train_X, test_X = dataset[:30162], dataset[30162:]
print train_X[0]
print test_X[0]
print train_X.shape, test_X.shape

[62 1 68268 11 9 2 13 0 4 1 0 0 40 38]
[25 2 226802 1 7 4 6 3 2 1 0 0 40 38]
(30162, 14) (15060, 14)


In [47]:
#Test random forest model: have a taste with some default setting
rf = RandomForestClassifier(n_estimators=100) #number of weak estimators, popular setting is 100
#cv_results = cross_val_score(rf,train_X,train_Y,cv=5)
#print("CV validation accuracy", cv_results.mean()*100)
rf.fit(train_X,train_Y)
prediction_rf = rf.predict(test_X)
print ("Accuracy: %f " % (100*metrics.accuracy_score(test_Y, prediction_rf)))
print ("Auc: %f" % metrics.roc_auc_score(test_Y, prediction_rf))

Accuracy: 84.920319 
Auc: 0.770834


In [5]:
#tune parameters
#the model contains parameters of: n_estimators, max_features, min_samples_leaf
val_max_features = range(2, 16, 2)#2,4,6,...14, 14 means use all of them
val_n_estimators = range(20, 200, 30)
val_min_samples_leaf = range(10, 60, 10)

rf=RandomForestClassifier()
kf=KFold(n_splits=5)## 5-cross-validation
param_grid=dict(n_estimators=val_n_estimators,max_features=val_max_features,min_samples_leaf=val_min_samples_leaf)
grid=GridSearchCV(estimator=rf,param_grid=param_grid,cv=kf)
gres=grid.fit(train_X,train_Y)
print("Best",gres.best_score_)
print("params",gres.best_params_)

('Best', 0.8611166368277966)
('params', {'max_features': 8, 'n_estimators': 110, 'min_samples_leaf': 10})


In [None]:
import pickle
with open('rf.pickle', 'w') as f:
    pickle.dump([gres.best_score_, gres.best_params_], f)

In [6]:
rf=RandomForestClassifier(n_estimators=110,max_features=8,min_samples_leaf=10)
rf.fit(train_X,train_Y)

prediction = rf.predict(test_X)
print("Test dataset: Accuracy: %f " % (100*metrics.accuracy_score(test_Y, prediction)))
print ("Auc: %f" % metrics.roc_auc_score(test_Y, prediction))

pred_train = rf.predict(train_X)
print("Train dataset: Accuracy: %f " % (100*metrics.accuracy_score(train_Y, pred_train)))

Test dataset: Accuracy: 85.697211 
Auc: 0.774526
Train dataset: Accuracy: 89.148598 


In [7]:
print ('feature significances：%s' % rf.feature_importances_)

feature significances：[0.08619185 0.01654611 0.07174874 0.0232381  0.16037789 0.08173318
 0.04000347 0.23266985 0.00260084 0.00426537 0.18373738 0.04534729
 0.04883964 0.0027003 ]


In [34]:
# Test Decision Tree model: have a taste with some default setting
dt = DecisionTreeClassifier()
#cv_res=cross_val_score()
dt.fit(train_X,train_Y)
prediction = dt.predict(test_X)
print("Accuracy: %f " % (100*accuracy_score(test_Y, prediction)))

Accuracy: 80.073041 


In [38]:
#compute auc
metrics.roc_auc_score(test_Y, prediction)

0.7370639036924248

In [8]:
#tune parameters for decision tree
#parameters: max_depth, min_samples_leaf, max_features
val_max_features = range(2, 16, 2)#2,4,6,...14, 14 means use all of them
val_max_depth = range(4, 64, 8)
val_min_samples_leaf = range(10, 60, 10)

dt = DecisionTreeClassifier()
kf=KFold(n_splits=5)## 5-cross-validation
param_grid=dict(max_depth=val_max_depth,max_features=val_max_features,min_samples_leaf=val_min_samples_leaf)
grid=GridSearchCV(estimator=dt,param_grid=param_grid,cv=kf)
gres=grid.fit(train_X,train_Y)
print("Best",gres.best_score_)
print("params",gres.best_params_)

('Best', 0.8536569193024335)
('params', {'max_features': 8, 'max_depth': 44, 'min_samples_leaf': 50})


In [9]:
import pickle
with open('dt.pickle', 'w') as f2:
    pickle.dump([gres.best_score_, gres.best_params_], f2)

In [12]:
dt = DecisionTreeClassifier(max_depth=44, max_features=8, min_samples_leaf=50)
dt.fit(train_X,train_Y)
prediction = dt.predict(test_X)

print("Test dataset: Accuracy: %f " % (100*metrics.accuracy_score(test_Y, prediction)))
print ("Auc: %f" % metrics.roc_auc_score(test_Y, prediction))

pred_train = dt.predict(train_X)
print("Train dataset: Accuracy: %f " % (100*metrics.accuracy_score(train_Y, pred_train)))

Test dataset: Accuracy: 84.800797 
Auc: 0.767673
Train dataset: Accuracy: 85.988993 
