### Tuning, training and testing random forest models for predicting college enrollment

Load the required packages.

In [None]:
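# If the packages are not installed, install them first (run once).
# Package names must be strings, and scikit-learn is published on PyPI as
# 'scikit-learn' (it is imported as `sklearn`). itertools is part of the
# standard library and does not need to be installed.
# import subprocess, sys
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'numpy'])
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pandas'])
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'scikit-learn'])
# subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'matplotlib'])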

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

import itertools

In [None]:
# Read the comma-separated file into a NumPy array, report its shape, and
# leave the array as the cell's last expression.
X = np.loadtxt(fname='dat1_collen.csv', delimiter=',')
print('Dimension of X is {}'.format(np.shape(X)))
X[:]

In [None]:
# Read the comma-separated file into a NumPy array, report its shape, and
# leave the array as the cell's last expression.
collen = np.loadtxt(fname='collen.csv', delimiter=',')
print('Dimension of collen is {}'.format(np.shape(collen)))
collen[:]

In [None]:
# Keep columns 1 through 35 of `dat` as X (this overwrites the X loaded from
# 'dat1_collen.csv' earlier in the notebook), then report its shape.
# NOTE(review): `dat` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel -- confirm which array `dat` is meant
# to be (presumably the one read from 'dat1_collen.csv'; verify).
X = dat[:,1:36]
print('Dimension of X is {}'.format(X.shape))
X[0:]

In [None]:
# Read the comma-separated weights file and show its first 20 entries.
weights = np.loadtxt(fname='weights4_collen.csv', delimiter=',')
weights[:20]

##### Applying nested cross-validation here (5-fold)

In [None]:
# Outer loop of the nested cross-validation: a shuffled, stratified 5-fold
# split of (X, collen) with a fixed seed so the folds are reproducible.
# train_indices[k] / test_indices[k] hold the row indices of outer fold k.
skf = StratifiedKFold(n_splits=5, random_state=666, shuffle=True)
outer_folds = list(skf.split(X, collen))
train_indices = [train_index for train_index, _ in outer_folds]
test_indices = [test_index for _, test_index in outer_folds]

In [None]:
def cross_val_scores_weighted(model, X, y, weights, cv=5, metrics=None, random_state=66):
    """Run stratified k-fold cross-validation with per-sample weights.

    For every fold, a fresh clone of ``model`` is fitted on the training rows
    using the matching slice of ``weights`` as ``sample_weight``; every metric
    is then evaluated on the held-out rows, again passing the matching slice
    of ``weights`` as ``sample_weight``.

    Parameters
    ----------
    model : estimator
        Unfitted estimator; it is cloned for each fold and never fitted itself.
        Its ``fit`` must accept a ``sample_weight`` keyword.
    X, y, weights : array-like
        Features, labels and sample weights. They are indexed with integer
        index arrays (``X[train_index]``), so NumPy arrays are expected.
    cv : int, default 5
        Number of stratified folds.
    metrics : iterable of callables, optional
        Each is called as ``metric(y_true, y_pred, sample_weight=...)``.
        Defaults to ``[sklearn.metrics.accuracy_score]``.
    random_state : int, default 66
        Seed for the shuffled ``StratifiedKFold`` split.

    Returns
    -------
    list of list
        One inner list per metric (in the order given), each holding one score
        per fold.
    """
    # Import explicitly instead of relying on `sklearn.base` having been loaded
    # as a side effect of the file's other sklearn imports.
    from sklearn.base import clone

    # Resolve the default here rather than in the signature to avoid a shared
    # mutable default argument.
    if metrics is None:
        metrics = [sklearn.metrics.accuracy_score]
    else:
        metrics = list(metrics)

    skf = StratifiedKFold(n_splits=cv, random_state=random_state, shuffle=True)
    scores = [[] for _ in metrics]
    for train_index, test_index in skf.split(X, y):
        model_clone = clone(model)
        model_clone.fit(X[train_index], y[train_index],
                        sample_weight=weights[train_index])
        y_test = y[test_index]
        weights_test = weights[test_index]
        y_pred = model_clone.predict(X[test_index])
        for metric_scores, metric in zip(scores, metrics):
            metric_scores.append(metric(y_test, y_pred, sample_weight=weights_test))
    return scores

In [None]:
#parameter ranges for model-tuning
ntrees = [10, 50, 100, 200,300,400,500, 600, 700, 800, 900, 1000]  #number of trees
depths = range(1, 36)
ntree_depth_combo = list(itertools.product(ntrees, depths))

### train_test 1 (CV1)

In [None]:
# Row indices of the first outer fold (training split, held-out split).
train1, test1 = train_indices[0], test_indices[0]

#### tuning maximum depth and number of trees

In [None]:
# Grid search on the fold-1 training split: for every (n_estimators, max_depth)
# pair, store the mean of the weighted inner 5-fold CV scores.
cv1_accuracy = []
for ntree, depth in ntree_depth_combo:
    print(ntree, depth)
    rf = RandomForestClassifier(n_estimators=ntree, max_depth=depth, random_state=666)
    scores = cross_val_scores_weighted(rf, X[train1], collen[train1], weights[train1], cv=5)
    cv1_accuracy.append(np.mean(scores))

# First grid entry that reaches the highest mean score.
best_position = cv1_accuracy.index(max(cv1_accuracy))
optimal_tune = ntree_depth_combo[best_position]
print(optimal_tune)


In [None]:
# Plot inner-CV accuracy against max_depth, one curve per number of trees.
# ntree_depth_combo is ordered with depth varying fastest, so every consecutive
# run of len(depths) scores belongs to one n_estimators value. The chunk size
# must equal len(depths); otherwise the x and y lengths passed to ax.plot differ.
n_depths = len(depths)
cv1_acc_seg = [cv1_accuracy[x:x + n_depths] for x in range(0, len(cv1_accuracy), n_depths)]
print(optimal_tune, max(cv1_accuracy))
line = ['-', '--', ':']
fig = plt.figure(figsize=(20,5))
ax = plt.subplot(111)
for i, acc in enumerate(cv1_acc_seg):
    # Cycle through the available line styles.
    ax.plot(depths, acc, line[i % len(line)])
# Shrink the axes horizontally to leave room for the legend on the right.
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(ntrees, loc='upper center', bbox_to_anchor=(1.1, 0.8), shadow=True, ncol=1)
plt.ylabel('5-fold cross validation accuracy (training data)')
plt.show()


#plt.legend(ntrees, loc='lower right', ncol = 2)
#plt.xticks(range(len(cv1_accuracy)), ntree_depth_combo, size='small')

#plt.show()

#### train & test: using the best maximum depth & number of trees

In [None]:
# Fit a forest with 400 trees and max depth 7 on the fold-1 training split
# (weighted), then score it on the fold-1 held-out split using the test weights.
rf1 = RandomForestClassifier(n_estimators=400, max_depth=7, random_state=666).fit(
    X[train1], collen[train1], sample_weight=weights[train1]
)
rf1.score(X[test1], collen[test1], sample_weight=weights[test1])

In [None]:
# Weighted ROC AUC on the fold-1 held-out split, using the second column of
# predict_proba as the score.
proba1 = rf1.predict_proba(X[test1])
roc_auc_score(y_true=list(map(int, collen[test1])),
              y_score=proba1[:, 1],
              sample_weight=weights[test1])

### train_test 2 (CV2)

In [None]:
# Row indices of the second outer fold (training split, held-out split).
train2, test2 = train_indices[1], test_indices[1]

#### tuning maximum depth and number of trees

In [None]:
# Grid search on the fold-2 training split: for every (n_estimators, max_depth)
# pair, store the mean of the weighted inner 5-fold CV scores.
cv2_accuracy = []
for ntree, depth in ntree_depth_combo:
    print(ntree, depth)
    rf = RandomForestClassifier(n_estimators=ntree, max_depth=depth, random_state=666)
    scores = cross_val_scores_weighted(rf, X[train2], collen[train2], weights[train2], cv=5)
    cv2_accuracy.append(np.mean(scores))

# First grid entry that reaches the highest mean score.
best_position = cv2_accuracy.index(max(cv2_accuracy))
optimal_tune = ntree_depth_combo[best_position]
print(optimal_tune)


In [None]:
# Plot inner-CV accuracy against max_depth, one curve per number of trees.
# ntree_depth_combo is ordered with depth varying fastest, so every consecutive
# run of len(depths) scores belongs to one n_estimators value.
n_depths = len(depths)
cv2_acc_seg = [cv2_accuracy[x:x + n_depths] for x in range(0, len(cv2_accuracy), n_depths)]
print(optimal_tune, max(cv2_accuracy))
line = ['-', '--', ':']
fig = plt.figure(figsize=(20,5))
ax = plt.subplot(111)
for i, acc in enumerate(cv2_acc_seg):
    # Cycle through the available line styles.
    ax.plot(depths, acc, line[i % len(line)])
# Shrink the axes horizontally to leave room for the legend on the right.
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(ntrees, loc='upper center', bbox_to_anchor=(1.1, 0.8), shadow=True, ncol=1)
plt.ylabel('5-fold cross validation accuracy (training data)')
plt.show()

#### train & test: using the best maximum depth & number of trees

In [None]:
# Fit a forest with 800 trees and max depth 10 on the fold-2 training split
# (weighted), then score it on the fold-2 held-out split using the test weights.
rf2 = RandomForestClassifier(n_estimators=800, max_depth=10, random_state=666).fit(
    X[train2], collen[train2], sample_weight=weights[train2]
)
rf2.score(X[test2], collen[test2], sample_weight=weights[test2])

In [None]:
# Weighted ROC AUC on the fold-2 held-out split, using the second column of
# predict_proba as the score.
proba2 = rf2.predict_proba(X[test2])
roc_auc_score(y_true=list(map(int, collen[test2])),
              y_score=proba2[:, 1],
              sample_weight=weights[test2])

### train_test 3 (CV3)

In [None]:
# Row indices of the third outer fold (training split, held-out split).
train3, test3 = train_indices[2], test_indices[2]

#### tuning maximum depth and number of trees

In [None]:
# Grid search on the fold-3 training split: for every (n_estimators, max_depth)
# pair, store the mean of the weighted inner 5-fold CV scores.
cv3_accuracy = []
for ntree, depth in ntree_depth_combo:
    print(ntree, depth)
    rf = RandomForestClassifier(n_estimators=ntree, max_depth=depth, random_state=666)
    scores = cross_val_scores_weighted(rf, X[train3], collen[train3], weights[train3], cv=5)
    cv3_accuracy.append(np.mean(scores))

# First grid entry that reaches the highest mean score.
best_position = cv3_accuracy.index(max(cv3_accuracy))
optimal_tune = ntree_depth_combo[best_position]
print(optimal_tune)


In [None]:
# Plot inner-CV accuracy against max_depth, one curve per number of trees.
# ntree_depth_combo is ordered with depth varying fastest, so every consecutive
# run of len(depths) scores belongs to one n_estimators value.
n_depths = len(depths)
cv3_acc_seg = [cv3_accuracy[x:x + n_depths] for x in range(0, len(cv3_accuracy), n_depths)]
print(optimal_tune, max(cv3_accuracy))
line = ['-', '--', ':']
fig = plt.figure(figsize=(20,5))
ax = plt.subplot(111)
for i, acc in enumerate(cv3_acc_seg):
    # Cycle through the available line styles.
    ax.plot(depths, acc, line[i % len(line)])
# Shrink the axes horizontally to leave room for the legend on the right.
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(ntrees, loc='upper center', bbox_to_anchor=(1.1, 0.8), shadow=True, ncol=1)
plt.ylabel('5-fold cross validation accuracy (training data)')
plt.show()

#### train & test: using the best maximum depth & number of trees

In [None]:
# Fit a forest with 900 trees and max depth 10 on the fold-3 training split
# (weighted), then score it on the fold-3 held-out split using the test weights.
rf3 = RandomForestClassifier(n_estimators=900, max_depth=10, random_state=666).fit(
    X[train3], collen[train3], sample_weight=weights[train3]
)
rf3.score(X[test3], collen[test3], sample_weight=weights[test3])

In [None]:
# Weighted ROC AUC on the fold-3 held-out split, using the second column of
# predict_proba as the score.
proba3 = rf3.predict_proba(X[test3])
roc_auc_score(y_true=list(map(int, collen[test3])),
              y_score=proba3[:, 1],
              sample_weight=weights[test3])

### train_test 4 (CV4)

In [None]:
# Row indices of the fourth outer fold (training split, held-out split).
train4, test4 = train_indices[3], test_indices[3]

#### tuning maximum depth and number of trees

In [None]:
# Grid search on the fold-4 training split: for every (n_estimators, max_depth)
# pair, store the mean of the weighted inner 5-fold CV scores.
cv4_accuracy = []
for ntree, depth in ntree_depth_combo:
    print(ntree, depth)
    rf = RandomForestClassifier(n_estimators=ntree, max_depth=depth, random_state=666)
    scores = cross_val_scores_weighted(rf, X[train4], collen[train4], weights[train4], cv=5)
    cv4_accuracy.append(np.mean(scores))

# First grid entry that reaches the highest mean score.
best_position = cv4_accuracy.index(max(cv4_accuracy))
optimal_tune = ntree_depth_combo[best_position]
print(optimal_tune)


In [None]:
# Plot inner-CV accuracy against max_depth, one curve per number of trees.
# ntree_depth_combo is ordered with depth varying fastest, so every consecutive
# run of len(depths) scores belongs to one n_estimators value.
n_depths = len(depths)
cv4_acc_seg = [cv4_accuracy[x:x + n_depths] for x in range(0, len(cv4_accuracy), n_depths)]
print(optimal_tune, max(cv4_accuracy))
line = ['-', '--', ':']
fig = plt.figure(figsize=(20,5))
ax = plt.subplot(111)
for i, acc in enumerate(cv4_acc_seg):
    # Cycle through the available line styles.
    ax.plot(depths, acc, line[i % len(line)])
# Shrink the axes horizontally to leave room for the legend on the right.
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(ntrees, loc='upper center', bbox_to_anchor=(1.1, 0.8), shadow=True, ncol=1)
plt.ylabel('5-fold cross validation accuracy (training data)')
plt.show()

#### train & test: using the best maximum depth & number of trees

In [None]:
# Fit a forest with 400 trees and max depth 11 on the fold-4 training split
# (weighted), then score it on the fold-4 held-out split using the test weights.
rf4 = RandomForestClassifier(n_estimators=400, max_depth=11, random_state=666).fit(
    X[train4], collen[train4], sample_weight=weights[train4]
)
rf4.score(X[test4], collen[test4], sample_weight=weights[test4])

In [None]:
# Weighted ROC AUC on the fold-4 held-out split, using the second column of
# predict_proba as the score.
proba4 = rf4.predict_proba(X[test4])
roc_auc_score(y_true=list(map(int, collen[test4])),
              y_score=proba4[:, 1],
              sample_weight=weights[test4])

### train_test 5 (CV5)

In [None]:
# Row indices of the fifth outer fold (training split, held-out split).
train5, test5 = train_indices[4], test_indices[4]

#### tuning maximum depth and number of trees

In [None]:
# Grid search on the fold-5 training split: for every (n_estimators, max_depth)
# pair, store the mean of the weighted inner 5-fold CV scores.
cv5_accuracy = []
for ntree, depth in ntree_depth_combo:
    print(ntree, depth)
    rf = RandomForestClassifier(n_estimators=ntree, max_depth=depth, random_state=666)
    scores = cross_val_scores_weighted(rf, X[train5], collen[train5], weights[train5], cv=5)
    cv5_accuracy.append(np.mean(scores))

# First grid entry that reaches the highest mean score.
best_position = cv5_accuracy.index(max(cv5_accuracy))
optimal_tune = ntree_depth_combo[best_position]
print(optimal_tune)


In [None]:
# Plot inner-CV accuracy against max_depth, one curve per number of trees.
# ntree_depth_combo is ordered with depth varying fastest, so every consecutive
# run of len(depths) scores belongs to one n_estimators value.
n_depths = len(depths)
cv5_acc_seg = [cv5_accuracy[x:x + n_depths] for x in range(0, len(cv5_accuracy), n_depths)]
print(optimal_tune, max(cv5_accuracy))
line = ['-', '--', ':']
fig = plt.figure(figsize=(20,5))
ax = plt.subplot(111)
for i, acc in enumerate(cv5_acc_seg):
    # Cycle through the available line styles.
    ax.plot(depths, acc, line[i % len(line)])
# Shrink the axes horizontally to leave room for the legend on the right.
chartBox = ax.get_position()
ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
ax.legend(ntrees, loc='upper center', bbox_to_anchor=(1.1, 0.8), shadow=True, ncol=1)
plt.ylabel('5-fold cross validation accuracy (training data)')
plt.show()

#### train & test: using the best maximum depth & number of trees

In [None]:
# Fit a forest with 700 trees and max depth 20 on the fold-5 training split
# (weighted), then score it on the fold-5 held-out split using the test weights.
rf5 = RandomForestClassifier(n_estimators=700, max_depth=20, random_state=666).fit(
    X[train5], collen[train5], sample_weight=weights[train5]
)
rf5.score(X[test5], collen[test5], sample_weight=weights[test5])

In [None]:
# Weighted ROC AUC on the fold-5 held-out split, using the second column of
# predict_proba as the score.
proba5 = rf5.predict_proba(X[test5])
roc_auc_score(y_true=list(map(int, collen[test5])),
              y_score=proba5[:, 1],
              sample_weight=weights[test5])