In [1]:
# init
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load data
X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
Xte = np.genfromtxt('data/X_test.txt', delimiter=None)

np.random.seed(0)
X,Y = ml.shuffleData(X,Y)

In [3]:
def calcError(prediction, real):
    err_count = 0.
    for i in range(len(prediction)):
        if prediction[i] != real[i]:
            err_count+=1
    return err_count / len(prediction)

In [18]:
# Setting Xtr, Ytr, Xva, Yva
Xtr = X[:190000, :]
Ytr = Y[:190000]
Xva = X[190000:, :]
Yva = Y[190000:]

In [None]:
# init learner
# Kaggle上传的是 max depth = 20, max_feature = 2
regr = RandomForestRegressor(max_depth=20, 
                             random_state=0, 
                             n_estimators = 100, 
                             max_features = 1)

In [5]:
# learn
regr.fit(Xtr, Ytr)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [4]:
def convert(regress_list):
    result = []
    for i in regress_list:
        if i < 0.5:
            result.append(0)
        else:
            result.append(1)
    return result

In [7]:
for data_split in range(5):
    Xtr = X[:150000 + data_split*10000, ]
    Ytr = Y[:150000 + data_split*10000]
    Xva = X[150000 + data_split*10000:, ]
    Yva = Y[150000 + data_split*10000:]
    
    for depth in range(20, 45):
        for feature in range(1, 15):
            learner = RandomForestRegressor(max_depth= depth, random_state=0, n_estimators = 100, max_features = feature)
            learner.fit(Xtr, Ytr)
            
            Ytr_hat = convert(learner.predict(Xtr))       
            Yva_hat = convert(learner.predict(Xva))         
            validation_auc = roc_auc_score(Yva_hat, Yva)

            print "depth =", depth, "feature =", feature, "ValAUC =", validation_auc, "ValErr =", calcError(Yva_hat, Yva), "TraErr =", calcError(Ytr_hat, Ytr)
print "Done"

150000 Train Data depth = 20 feature = 1 ValAUC = 0.718990003173 ValErr = 0.26306 TraErr = 0.148546666667
150000 Train Data depth = 20 feature = 2 ValAUC = 0.714809982881 ValErr = 0.2627 TraErr = 0.128886666667
150000 Train Data depth = 20 feature = 3 ValAUC = 0.712362559135 ValErr = 0.26344 TraErr = 0.120186666667
150000 Train Data depth = 20 feature = 4 ValAUC = 0.710297956802 ValErr = 0.2645 TraErr = 0.115133333333
150000 Train Data depth = 20 feature = 5 ValAUC = 0.70850017686 ValErr = 0.26562 TraErr = 0.11068
150000 Train Data depth = 20 feature = 6 ValAUC = 0.707756689206 ValErr = 0.2657 TraErr = 0.10768
150000 Train Data depth = 20 feature = 7 ValAUC = 0.706515358852 ValErr = 0.2664 TraErr = 0.105513333333
150000 Train Data depth = 20 feature = 8 ValAUC = 0.706143916933 ValErr = 0.26664 TraErr = 0.102953333333
150000 Train Data depth = 20 feature = 9 ValAUC = 0.705421943565 ValErr = 0.2669 TraErr = 0.100506666667
150000 Train Data depth = 20 feature = 10 ValAUC = 0.703861380925 

150000 Train Data depth = 25 feature = 10 ValAUC = 0.696014332162 ValErr = 0.27154 TraErr = 0.0588666666667
150000 Train Data depth = 25 feature = 11 ValAUC = 0.696101189113 ValErr = 0.27148 TraErr = 0.0585866666667
150000 Train Data depth = 25 feature = 12 ValAUC = 0.695445702308 ValErr = 0.27196 TraErr = 0.0578133333333
150000 Train Data depth = 25 feature = 13 ValAUC = 0.696408935476 ValErr = 0.27116 TraErr = 0.0571933333333
150000 Train Data depth = 25 feature = 14 ValAUC = 0.694992554875 ValErr = 0.27222 TraErr = 0.0561733333333
150000 Train Data depth = 26 feature = 1 ValAUC = 0.703438775291 ValErr = 0.2664 TraErr = 0.07486
150000 Train Data depth = 26 feature = 2 ValAUC = 0.69888160891 ValErr = 0.26938 TraErr = 0.06588
150000 Train Data depth = 26 feature = 3 ValAUC = 0.697794505263 ValErr = 0.2701 TraErr = 0.0616333333333
150000 Train Data depth = 26 feature = 4 ValAUC = 0.697134606733 ValErr = 0.27054 TraErr = 0.0595466666667
150000 Train Data depth = 26 feature = 5 ValAUC = 0

150000 Train Data depth = 31 feature = 6 ValAUC = 0.690752261154 ValErr = 0.27552 TraErr = 0.0495
150000 Train Data depth = 31 feature = 7 ValAUC = 0.692523021478 ValErr = 0.27406 TraErr = 0.0493866666667
150000 Train Data depth = 31 feature = 8 ValAUC = 0.691698842579 ValErr = 0.27474 TraErr = 0.0494
150000 Train Data depth = 31 feature = 9 ValAUC = 0.692598064908 ValErr = 0.274 TraErr = 0.0492733333333
150000 Train Data depth = 31 feature = 10 ValAUC = 0.691726320454 ValErr = 0.27472 TraErr = 0.0493333333333
150000 Train Data depth = 31 feature = 11 ValAUC = 0.689828772625 ValErr = 0.27628 TraErr = 0.04922
150000 Train Data depth = 31 feature = 12 ValAUC = 0.6908255189 ValErr = 0.27546 TraErr = 0.0491666666667
150000 Train Data depth = 31 feature = 13 ValAUC = 0.690679481738 ValErr = 0.27558 TraErr = 0.04908
150000 Train Data depth = 31 feature = 14 ValAUC = 0.691407756662 ValErr = 0.27498 TraErr = 0.0491133333333
150000 Train Data depth = 32 feature = 1 ValAUC = 0.694725195863 ValEr

KeyboardInterrupt: 

In [7]:
Yte_hat = regr.predict(Xte)

In [9]:
Yte_hat = regr.predict(Xte)
Yte = np.vstack((np.arange(Xte.shape[0]), Yte_hat)).T
np.savetxt('Y_submit_random_forest.txt',Yte,'%d, %.2f',header='ID,Prob1',comments='',delimiter=',')

In [11]:
Yva_hat = convert(regr.predict(Xva))

In [12]:
validation_auc = roc_auc_score(Yva_hat, Yva)
print "Validation AUC", validation_auc

Validation AUC 0.697342651327


In [13]:
print "Validation Error", calcError(Yva_hat, Yva)

Validation Error 0.2708


In [15]:
Ytr_hat = convert(regr.predict(Xtr))

In [16]:
print "Train Error", calcError(Ytr_hat, Ytr)

Train Error 0.0574631578947
