# Run a Random Forest Model using grid search

This script is designed to take a dataset that only contains categorical features (one-hot encoded) as dummy variables

Steps in modeling portion of the script:
1. Start with randomized search to narrow the search for the optimal hyperparameters 
2. Take the results from the randomized search to create a range of values to be run in the grid search
3. Use the "best model" from grid search as the model for fitting, evaluating, and getting feature importances

## Import necessary packages

In [66]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from pprint import pprint
import seaborn as sns
import pandas as pd
import numpy as np
import time
import os

%matplotlib inline

### Define working directory and file input names

In [67]:
# define filenames and CWD as variable
working_directory = r"C:\Users\nick_simmons\Mitel"
input_file_name = 'opp_dummies_dataset.csv'
featimp_corr_output_name = 'feature_importances_corr_output.csv'

full_input_path = os.path.join(working_directory, input_file_name)
full_feature_importances_output_file_path = os.path.join(working_directory, featimp_corr_output_name)

In [68]:
# ensure you are working in the right directory
os.chdir(working_directory)
print(f'Your working directory is: {os.getcwd()}\n')
print(f'Input file name: {input_file_name}\n')
# print(f'Binarized output file name: {binarized_output_file_name}')

Your working directory is: C:\Users\nick_simmons\Mitel

Input file name: opp_dummies_dataset.csv



## Import data from .csv as a Pandas DataFrame, define target and features

In [69]:
df = pd.read_csv(input_file_name)

In [71]:
# define target y, and features df X
X = df.drop('opp_won', axis=1)
y = df['opp_won']

## Create test and training sets 
### Training Dataset = 70% Training Dataset = 30%

In [72]:
# create train and test sets - stratify=y means the balance of target variables classes will be equal across all test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
# check one to make sure it worked
# X_train.head(5)

# Random Search

### Define hyperparameters values for Randomized Search

In [74]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


### Use random search to find best set of hyper parameters to use in grid search

In [75]:
# init a Random Forest Classifier model
clf=RandomForestClassifier()

# Random search of parameters, using 5 fold cross validation, 
# search across 100 different combinations, and use all available cores

rf_random = RandomizedSearchCV(estimator = clf, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 5, 
                               verbose=False, 
                               random_state=42, 
                               n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)



RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=False)

# Grid Search

### Use random search results to create new params for grid search

In [76]:
# USE VALUES FROM RANDOM SEARCH TO INFORM CHOICES OF HYPERPARAMETERS IN GRID SEARCH

param_grid = {
    'bootstrap': [True],
    'max_depth': [10],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 5],
    'min_samples_split': [2, 5],
    'n_estimators': [300, 500, 2000]
}

# Instantiate grid_rf using clf as the estimator
grid_rf = GridSearchCV(estimator=clf,
                       param_grid=param_grid,
                       scoring='neg_mean_squared_error',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

# set start time so you can measure how long it takes for the grid search to complete
start_time = time.time()

# fit the data to the grid
grid_rf.fit(X_train, y_train)

total_time = time.time()-start_time
print(f'It took {total_time} seconds to run this model')

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.3s


It took 8.44884967803955 seconds to run this model


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    8.2s finished


### Get the best model from grid search and test accuracy

In [77]:
# Extract best model from 'grid_rf'
best_model = grid_rf.best_estimator_
# Predict the test set labels
y_pred = best_model.predict(X_test)
# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print the test set RMSE
print(f'Test set RMSE of rf: {rmse_test}')
# model accuracy
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Test set RMSE of rf: 0.5962847939999439
Accuracy: 0.6444444444444445


# Feature importances

In [79]:
# Get feature importances from the model
feature_importances = pd.DataFrame(best_model.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

# get the feature names and values from feature importances
feature_imp_df = feature_importances.copy()
feature_imp_labels_list = feature_imp_df.index.tolist()
feature_imp_values_list = feature_imp_df.values.tolist()

if len(feature_imp_labels_list) == len(feature_imp_values_list):
    print(f'Labels and values are the same length: {len(feature_imp_labels_list)}')

# create a flat list from list of lists (list of list comes from convert numpy array using tolist())
feature_imp_values_list = [item for sublist in feature_imp_values_list for item in sublist]

# feature_importances.to_csv(full_feature_importances_output_file_path)    

Labels and values are the same length: 41


### Create correlation matrix and grab the first column to show the relationship between the features and target variable sorted by feature importance

In [80]:
# convert list to set so we can search if stage is already in the list or not
features_labels_set = set(feature_imp_labels_list)
# if stage is not in the set, then add it to the first position of the feature_imp_labels_list
if 'Stage' not in features_labels_set:
    feature_imp_labels_list[:0] = ['Stage']


In [81]:
# create a new df with columns from feature labels

# copy df to avoid changing the original
df_copy = df.copy()

# create a new df by reordering the feature names according to feature importance rankings 
corr_df_in = df_copy[feature_imp_labels_list]

# create a correlation matrix for every feature in the dataset
corr_df = corr_df_in.corr()

# drop target out so the first value isn't 1 (stage correlated with itself)
featimp_corr_df = pd.DataFrame(corr_df.iloc[1:, 0])

# rename the column for clarification
featimp_corr_df.rename(index=str, columns={'Stage':'Correlations with Stage'}, inplace=True)

# add a the feature importances values list as a column to the new corr df
featimp_corr_df['Feature Importance'] = feature_imp_values_list

# reorder columns
featimp_corr_df = featimp_corr_df[['Feature Importance', 'Correlations with Stage']]

featimp_corr_df.to_csv(full_feature_importances_output_file_path, mode='x')

KeyError: "['Stage'] not in index"

### Feature Importances Top Ten plot

In [None]:
## Create bar graph of top 10 importance variables
fi_top_ten = feature_importances.iloc[0:10]
# print(fi_top_ten)
# print(fi_top_ten.iloc[:,0])
fi_top10_values = fi_top_ten.iloc[:,0].tolist()
fi_top10_labels = fi_top_ten.index.tolist()

In [None]:
# Creating a bar plot
sns.barplot(x=fi_top10_values, y=fi_top10_labels)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Important Features")
plt.legend()
plt.show()