In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
adult = pd.read_csv('./data/adult_reconstruction.numeric.csv')

In [6]:
adult.head()

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation
0,0,1,0,0,4,9,2,2,5,4,0,39,49100,13
1,0,0,0,0,4,15,1,0,3,4,1,39,11500,3
2,0,0,0,0,4,1,0,4,3,4,1,39,2600,8
3,1,2,0,0,4,11,0,2,0,1,1,1,38997,12
4,0,0,0,0,4,9,2,4,1,4,1,0,41400,4


In [7]:
adult['label'] = adult['income'] > 50000
adult['label'] = adult['label'].astype('int')

In [8]:
adult.head()

Unnamed: 0,hours-per-week,age,capital-gain,capital-loss,workclass,education,education-num,marital-status,relationship,race,gender,native-country,income,occupation,label
0,0,1,0,0,4,9,2,2,5,4,0,39,49100,13,0
1,0,0,0,0,4,15,1,0,3,4,1,39,11500,3,0
2,0,0,0,0,4,1,0,4,3,4,1,39,2600,8,0
3,1,2,0,0,4,11,0,2,0,1,1,1,38997,12,0
4,0,0,0,0,4,9,2,4,1,4,1,0,41400,4,0


In [9]:
X = adult.drop(columns=['label', 'income'])  
y = adult['label'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
model = RandomForestClassifier(n_estimators = 80, max_depth = 3, min_samples_leaf = 4)

In [12]:
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=3, min_samples_split=4, n_estimators=80)

In [13]:
y_pred = model.predict(X_test)

In [14]:
accuracy_score(y_pred, y_test)

0.8084546678086382

In [15]:
max_depth = np.random.randint(1,8,3)
min_samples_leaf = [1,2,3,4]
n_estimators = [80,100,120]

In [16]:
model_collection = {}

for i in max_depth:
    for j in min_samples_leaf:
        for k in n_estimators:
            model = RandomForestClassifier(n_estimators = k, max_depth = i, min_samples_leaf = j)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_train)
            model_collection["i: {}, j: {}, k: {}".format(i,j,k)] = accuracy_score(y_pred, y_train)

In [17]:
model_collection

{'i: 2, j: 1, k: 80': 0.7576777193197113,
 'i: 2, j: 1, k: 100': 0.7576777193197113,
 'i: 2, j: 1, k: 120': 0.7576777193197113,
 'i: 2, j: 2, k: 80': 0.7576777193197113,
 'i: 2, j: 2, k: 100': 0.7576777193197113,
 'i: 2, j: 2, k: 120': 0.7576777193197113,
 'i: 2, j: 3, k: 80': 0.7576777193197113,
 'i: 2, j: 3, k: 100': 0.7576777193197113,
 'i: 2, j: 3, k: 120': 0.7576777193197113,
 'i: 2, j: 4, k: 80': 0.7576777193197113,
 'i: 2, j: 4, k: 100': 0.7576777193197113,
 'i: 2, j: 4, k: 120': 0.7576777193197113,
 'i: 4, j: 1, k: 80': 0.816897100208002,
 'i: 4, j: 1, k: 100': 0.815673559280558,
 'i: 4, j: 1, k: 120': 0.8146947265386028,
 'i: 4, j: 2, k: 80': 0.814755903584975,
 'i: 4, j: 2, k: 100': 0.817508870671724,
 'i: 4, j: 2, k: 120': 0.814755903584975,
 'i: 4, j: 3, k: 80': 0.8162241526979077,
 'i: 4, j: 3, k: 100': 0.8156123822341858,
 'i: 4, j: 3, k: 120': 0.8143888413067417,
 'i: 4, j: 4, k: 80': 0.815673559280558,
 'i: 4, j: 4, k: 100': 0.8164688608833965,
 'i: 4, j: 4, k: 120': 0.

Wat is mis met deze aanpak?

In [18]:
y_pred

array([1, 0, 0, ..., 0, 1, 0])

## Cross validation with SearchGridCV

In [19]:
rf = RandomForestClassifier(criterion = 'entropy')

In [20]:
param_grid = {'max_depth': [2,10,None], 
              'min_samples_leaf': [2,4,6,8],
              'max_features': ['auto','sqrt']}

In [21]:
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    refit=True,
    return_train_score=True
)

In [22]:
grid_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(criterion='entropy'),
             param_grid={'max_depth': [2, 10, None],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [2, 4, 6, 8]},
             return_train_score=True, scoring='accuracy')

In [59]:
cv_results_df = pd.DataFrame(grid_rf.cv_results_)
cv_results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_leaf,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.395643,0.018999,0.030218,0.000313,2.0,auto,2,"{'max_depth': 2, 'max_features': 'auto', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
1,0.400851,0.030291,0.030509,0.00101,2.0,auto,4,"{'max_depth': 2, 'max_features': 'auto', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
2,0.386351,0.009115,0.030313,0.000451,2.0,auto,6,"{'max_depth': 2, 'max_features': 'auto', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
3,0.397188,0.012959,0.033539,0.005184,2.0,auto,8,"{'max_depth': 2, 'max_features': 'auto', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
4,0.401247,0.024631,0.034749,0.00469,2.0,sqrt,2,"{'max_depth': 2, 'max_features': 'sqrt', 'min_...",0.764201,0.764201,...,0.764532,0.000741,17,0.764163,0.764163,0.764163,0.765858,0.764201,0.76451,0.000674
5,0.395,0.018637,0.030753,0.000682,2.0,sqrt,4,"{'max_depth': 2, 'max_features': 'sqrt', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
6,0.382448,0.002087,0.031297,0.001177,2.0,sqrt,6,"{'max_depth': 2, 'max_features': 'sqrt', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
7,0.392202,0.019919,0.030897,0.000641,2.0,sqrt,8,"{'max_depth': 2, 'max_features': 'sqrt', 'min_...",0.764201,0.764201,...,0.764171,6e-05,18,0.764163,0.764163,0.764163,0.764163,0.764201,0.764171,1.5e-05
8,0.771575,0.042487,0.066195,0.002857,10.0,auto,2,"{'max_depth': 10, 'max_features': 'auto', 'min...",0.830948,0.828989,...,0.833268,0.003554,10,0.846203,0.846429,0.84334,0.845261,0.845073,0.845261,0.001093
9,0.760774,0.054218,0.069721,0.006197,10.0,auto,4,"{'max_depth': 10, 'max_features': 'auto', 'min...",0.830646,0.827934,...,0.832876,0.003226,12,0.843905,0.843491,0.839574,0.841984,0.841269,0.842045,0.001566


In [23]:
grid_rf.best_score_

0.8367334639144192

In [None]:
grid_rf.

In [61]:
best_model = grid_rf.best_estimator_

RandomForestClassifier(criterion='entropy', min_samples_leaf=4)

In [62]:
grid_rf.best_params_

{'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 4}