In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

## 3950 Assignment 1: Part 2

Sample solution

In [18]:
# Label printed next to the final combined score in the testing cell at the bottom.
name = "Muh Name"

In [19]:
# Read the training set and discard the "id" column before modelling.
df = pd.read_csv("training.csv").drop(columns=["id"])
df.head()

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
0,0,0.66,0.106,0.434,0.387,0.903,0.661,0.158,0.291,0.21,...,0.015,0.377,0.479,0.05,0.395,0.123,0.833,0.461,0.99,0.105
1,1,0.844,0.813,0.03,0.939,0.721,0.287,0.539,0.874,0.787,...,0.112,0.048,0.088,0.86,0.56,0.346,0.511,0.883,0.858,0.599
2,0,0.56,0.567,0.568,0.434,0.414,0.18,0.448,0.888,0.023,...,0.874,0.236,0.599,0.602,0.005,0.493,0.122,0.395,0.782,0.943
3,0,0.681,0.245,0.909,0.785,0.738,0.57,0.692,0.411,0.182,...,0.219,0.691,0.261,0.031,0.968,0.353,0.798,0.104,0.944,0.09
4,0,0.846,0.431,0.805,0.237,0.465,0.642,0.219,0.102,0.795,...,0.704,0.242,0.089,0.605,0.577,0.043,0.686,0.07,0.666,0.572


Run a quick trial to see how a default random forest scores and how deep its trees grow.

In [20]:
# Baseline: an untuned random forest on a 70/30 split, to see the score and
# tree depth we get before any tuning.
X_trial = np.array(df.drop(columns=["target"]))
y_trial = np.array(df["target"]).reshape(-1, 1)
X_trainT, X_testT, y_trainT, y_testT = train_test_split(
    X_trial, y_trial.ravel(), test_size=0.3
)

trial_forrest = RandomForestClassifier()
trial_pipe = [("scale", StandardScaler()), ("forest", trial_forrest)]
pipe = Pipeline(trial_pipe)
# Scaling lives inside the pipeline, so the scaler is fit only on the data
# passed to fit() and the test split does not leak into training.
pipe.fit(X_trainT, y_trainT)
print("Score:", pipe.score(X_testT, y_testT))

# trial_forrest is the same object that sits in the pipeline, so after
# pipe.fit() its individual fitted trees can be inspected directly.
trial_depths = [fitted_tree.tree_.max_depth for fitted_tree in trial_forrest.estimators_]
print("Avg Depth:", np.mean(trial_depths))

Score: 0.6133333333333333
Avg Depth: 8.23


Create a model using grid search to tune the hyperparameters (HPs). The training set is very small, so evaluating many combinations should be pretty fast.

I'm going to scale the data, but I suspect that will not have a massive impact, since tree-based models split on thresholds and are largely insensitive to feature scale.

In [21]:
# Tunable pipeline: standardize the features, then fit a random forest.
# The step names ("scaler", "forest") are what the grid-search keys refer to.
scaler = StandardScaler()
estimator = RandomForestClassifier(verbose=0, n_jobs=-1)
pipe = Pipeline(
    steps=[
        ("scaler", scaler),
        ("forest", estimator),
    ]
)

In [22]:
# Fresh 70/30 hold-out split used to sanity-check the tuned model afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    X_trial, y_trial.ravel(), test_size=0.3
)

# Search space. The "forest__" prefix routes each setting to the "forest"
# step of the pipeline. The split criterion is not part of the search.
rf_para = dict(
    forest__min_samples_split=[3, 4, 5, 6],
    forest__max_depth=[4, 5, 6, 7],
    forest__n_estimators=[200],
    forest__max_samples=[0.4, 0.5, 0.6, 0.7],
    forest__ccp_alpha=[5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3],
)

# 5-fold cross-validated search over every combination in the grid.
clf = GridSearchCV(pipe, param_grid=rf_para, cv=5, n_jobs=-1)
clf.fit(X_train, y_train.ravel())
best = clf.best_estimator_


In [23]:
# Hold-out score of the tuned pipeline, followed by the winning configuration.
print(best.score(X_test, y_test), best, sep="\n")

0.56
Pipeline(steps=[('scaler', StandardScaler()),
                ('forest',
                 RandomForestClassifier(ccp_alpha=0.0005, max_depth=5,
                                        max_samples=0.7, min_samples_split=3,
                                        n_estimators=200, n_jobs=-1))])


### Retrain Model

We can use all the data to train the model now that we've determined the best "settings". These are the hyperparameters that the grid search above found to be the best. Because there is random variation, the exact values may vary a bit from run to run; if we wanted them to be constant, we'd need to set a random seed above. Since this model uses all the data, we should expect it to be an improvement over the model above, which was trained on only 70% of the data.

In [28]:
# Retrain on the full training set with the hyperparameters the grid search
# selected. They are read from clf.best_params_ rather than copied by hand:
# the search is not seeded, so the winning values can change between runs and
# a hard-coded copy would silently go stale.
pipe2 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("forest", RandomForestClassifier(n_jobs=-1, verbose=0)),
    ]
)
# best_params_ keys use the same "forest__" prefix as this pipeline's step name.
pipe2.set_params(**clf.best_params_)
# fit() returns the fitted pipeline; `best` is what the testing cell scores.
best = pipe2.fit(X_trial, y_trial.ravel())

# Testing

Please leave the stuff below as-is in your file. 

This will take your best model and score it with the test data. 

In [29]:
# Load the held-out test data and drop the "id" column, mirroring the
# preparation applied to the training data.
test_df = pd.read_csv("testing.csv")
test_df = test_df.drop(columns={"id"})
# Split into the label column (reshaped to a single-column array) and the feature matrix.
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))

# `best` is whichever fitted model was last assigned to that name in the cells above.
preds = best.predict(test_X)

# NOTE(review): ROC AUC is computed here from the output of predict(), not
# from predict_proba() scores.
roc_score = roc_auc_score(test_y, preds)
acc_score = accuracy_score(test_y, preds)

# Report both metrics, then the author name alongside the mean of the two.
print(roc_score)
print(acc_score)
print(name, np.mean([roc_score, acc_score]))

0.645103640978653
0.6445569620253164
Muh Name 0.6448303015019847
