# Model tuning

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score


## Perform a broad random search, then fine-tune with a grid search

In [None]:
# Load the preliminary cleaned data; the first CSV column is used as the index.
# Forward slashes keep the path usable on every OS (the previous backslash
# path only resolved on Windows, unlike the other paths in this notebook).
df = pd.read_csv("../data/processed/prelim_clean2.csv", index_col=[0])

# Pickled collection of column labels for the "res" feature set.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
with open('../models/01final_features_res.sav', 'rb') as features_file:
    res_features = pickle.load(features_file)

# Keep only the columns of df that are listed in res_features.
subset = df[df.columns.intersection(res_features)]


In [None]:
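# For demo only (kept commented out): split off 10% of df, blank out roughly
# 0.5% of its cells at random, and save the result to ../data/processed/test.csv.
# X_train, X_test = train_test_split(df,test_size=0.1, random_state=1)
# result_NaN = X_test.mask(np.random.random(X_test.shape)<0.005)
# result_NaN.to_csv('../data/processed/test.csv', index=False)
# X_test.target==1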

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    subset,
    df['target'],
    test_size=0.3,
    random_state=1,
)

In [None]:
# Baseline: an untuned, seeded random forest trained on the training split.
base_model = RandomForestClassifier(random_state=1)
base_model.fit(X_train, y_train)
y_pred = base_model.predict(X_test)

# Report the model's score, then precision and recall, on the hold-out split.
print(
    base_model.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

In [None]:
# Candidate values for the randomized hyper-parameter search.

# Tree counts: 10 evenly spaced values from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Features considered at every split.
max_features = ['log2', 'sqrt']
# Tree depth limits: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [*np.linspace(10, 110, num=11).astype(int).tolist(), None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space, keyed by RandomForestClassifier argument name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Tune a random forest with a randomized search over random_grid.
# The estimator itself is seeded so that re-running this cell reproduces the
# same result; the search's random_state only fixes which candidates are drawn.
rf = RandomForestClassifier(random_state=1)
# 100 sampled candidates, each evaluated with 3-fold cross-validation,
# using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    random_state=1,
    n_jobs=-1,
)
# Fit the random search model on the training split.
rf_random.fit(X_train, y_train)

## Best params and results from forest

In [None]:
# Winning hyper-parameter combination from the random search,
# left as the last expression so the notebook displays it.
best_params = rf_random.best_params_
best_params

In [None]:
# Evaluate the fitted random search on the hold-out split:
# its score first, then precision and recall of its predictions.
y_pred = rf_random.predict(X_test)
print(
    rf_random.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

## Grid search

In [None]:
# Narrow parameter grid, chosen around the results of the random search.
param_grid = {
    'bootstrap': [True],
    'max_features': [5, 6, 8, 9],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [1000, 1200, 1500, 1800],
}
# Base model to tune; seeded so the exhaustive search is reproducible
# (an unseeded forest gives different scores on every run).
rf = RandomForestClassifier(random_state=1)
# Exhaustive search over param_grid with 5-fold cross-validation on all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fit the grid search on the training split.
grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is silently discarded,
# so print the winning combination explicitly.
print(grid_search.best_params_)
# Keep a handle on the best estimator found by the search.
best_grid = grid_search.best_estimator_


In [None]:
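# Uncomment to persist the fitted grid search. The "Final RF performance"
# section below loads this file, so it must exist before that section is run.
#pickle.dump(grid_search, open('../models/rf_grid_search_sub.sav', 'wb'))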

In [None]:
# Evaluate the fitted grid search on the hold-out split:
# its score first, then precision and recall of its predictions.
y_pred = grid_search.predict(X_test)
print(
    grid_search.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

# Final RF performance

In [None]:
# Load the previously saved grid search object (despite the variable name, it is
# used below for its best_params_). The context manager closes the file handle.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
with open('../models/rf_grid_search_sub.sav', 'rb') as search_file:
    model_features = pickle.load(search_file)

In [None]:
# Build a forest from the best hyper-parameters of the saved grid search.
# A seed is supplied as a default so the final, persisted model is reproducible;
# a random_state stored in best_params_ (if any) takes precedence.
best_rf = RandomForestClassifier(**{'random_state': 1, **model_features.best_params_})

In [None]:
# Fit the best model on the training split.
best_rf.fit(X_train, y_train)
# Persist the fitted model; the context manager flushes and closes the file
# deterministically instead of leaving that to garbage collection.
with open('../models/rf_best.sav', 'wb') as model_file:
    pickle.dump(best_rf, model_file)
# Predictions on the hold-out split, used by the evaluation cells below.
y_pred = best_rf.predict(X_test)


In [None]:
# Accuracy, precision and recall of the final model's hold-out predictions.
print(
    accuracy_score(y_test, y_pred),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)
# Number of test rows predicted as class 1 (displayed as the cell output).
sum(y_pred == 1)

In [None]:
# Plot the precision-recall curve of the final model on the hold-out split.
from sklearn.metrics import precision_recall_curve

# plot_precision_recall_curve was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer PrecisionRecallDisplay.from_estimator and fall back to the old
# helper only on versions that do not provide it.
try:
    from sklearn.metrics import PrecisionRecallDisplay
    plot_pr_curve = PrecisionRecallDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_precision_recall_curve as plot_pr_curve

plot_pr_curve(best_rf, X_test, y_test)

In [None]:
# Re-evaluate the final model with a custom decision threshold.
threshold = .48  # TODO: choose via Youden index?
predicted_proba = best_rf.predict_proba(X_test)
# Column 1 is the second entry of best_rf.classes_ - presumably target == 1.
predicted = (predicted_proba[:, 1] >= threshold).astype('int')
# Score the thresholded predictions. The previous version scored the stale
# y_pred, so changing the threshold never changed the printed metrics.
print(accuracy_score(y_test, predicted))
print(precision_score(y_test, predicted))
print(recall_score(y_test, predicted))
# Number of test rows flagged as class 1 at this threshold.
sum(predicted == 1)

## Same procedure, now assessing the broad feature subset

In [None]:
# Reload the preliminary cleaned data; the first CSV column is used as the index.
# Forward slashes keep the path usable on every OS (the previous backslash
# path only resolved on Windows, unlike the other paths in this notebook).
df = pd.read_csv("../data/processed/prelim_clean2.csv", index_col=[0])

# Pickled collection of column labels for the "broad" feature set
# (the variable keeps the name res_features used in the first section).
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
with open('../models/01final_features_broad.sav', 'rb') as features_file:
    res_features = pickle.load(features_file)

# Keep only the columns of df that are listed in res_features.
subset = df[df.columns.intersection(res_features)]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    subset,
    df['target'],
    test_size=0.3,
    random_state=1,
)

In [None]:
# Baseline on the broad subset: an untuned, seeded random forest.
base_model = RandomForestClassifier(random_state=1)
base_model.fit(X_train, y_train)
y_pred = base_model.predict(X_test)

# Report the model's score, then precision and recall, on the hold-out split.
print(
    base_model.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

In [None]:
# Candidate values for the randomized hyper-parameter search (broad subset).

# Tree counts: 10 evenly spaced values from 200 to 2000.
n_estimators = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()
# Features considered at every split.
max_features = ['log2', 'sqrt']
# Tree depth limits: 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [*np.linspace(10, 110, num=11).astype(int).tolist(), None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space, keyed by RandomForestClassifier argument name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Tune a random forest on the broad subset with a randomized search.
# The estimator itself is seeded so that re-running this cell reproduces the
# same result; the search's random_state only fixes which candidates are drawn.
rf = RandomForestClassifier(random_state=1)
# 100 sampled candidates, each evaluated with 3-fold cross-validation,
# using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    random_state=1,
    n_jobs=-1,
)
# Fit the random search model on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Winning hyper-parameter combination from the random search,
# left as the last expression so the notebook displays it.
best_params = rf_random.best_params_
best_params

In [None]:
# Evaluate the fitted random search on the hold-out split:
# its score first, then precision and recall of its predictions.
y_pred = rf_random.predict(X_test)
print(
    rf_random.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

In [None]:
# Narrow parameter grid, chosen around the results of the random search.
param_grid = {
    'bootstrap': [True],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 2, 3, 4],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [500, 800, 1000, 1500],
}
# Base model to tune; seeded so the exhaustive search is reproducible
# (an unseeded forest gives different scores on every run).
rf = RandomForestClassifier(random_state=1)
# Exhaustive search over param_grid with 5-fold cross-validation on all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Fit the grid search on the training split.
grid_search.fit(X_train, y_train)
# A bare expression in the middle of a cell is silently discarded,
# so print the winning combination explicitly.
print(grid_search.best_params_)
# Keep a handle on the best estimator found by the search.
best_grid = grid_search.best_estimator_


In [None]:
# Evaluate the fitted grid search on the hold-out split:
# its score first, then precision and recall of its predictions.
y_pred = grid_search.predict(X_test)
print(
    grid_search.score(X_test, y_test),
    precision_score(y_test, y_pred),
    recall_score(y_test, y_pred),
    sep="\n",
)

Conclusion: let's definitely use the subset.