<b>Import libraries</b>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

<b>Import data and handle missing values</b>

In [None]:
# Read the training file, then separate the target column from the features.
df = pd.read_csv('smote_train.csv')
df_target = df['historical_data']
df_feat = df.drop(columns=['Unnamed: 0', 'historical_data'])

# Missing values in the three categorical columns become the literal category
# "NULL"; every other missing value becomes 0.
categorical_cols = ['column_1', 'column_2', 'column_3']
df_feat[categorical_cols] = df_feat[categorical_cols].fillna("NULL")
df_feat = df_feat.fillna(0)

# Visual check that no nulls remain: the heatmap is one uniform colour when
# isnull() is False everywhere.
sns.heatmap(df_feat.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# One-hot encode the three categorical columns; the remaining columns are
# carried over unchanged.
df_feat2 = pd.get_dummies(
    data=df_feat,
    columns=['column_1', 'column_2', 'column_3'],
)

<b>Train/test split</b>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_feat2,
    df_target,
    test_size=0.20,
    random_state=99,
)

<b>Create, train, and evaluate basic model</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest with 500 trees. Seeding the forest makes the fitted model,
# and everything derived from it further down (metrics, per-tree accuracy
# curve, feature importances), repeatable across "Restart & Run All".
rfc = RandomForestClassifier(n_estimators=500, random_state=99)
rfc.fit(X_train, y_train)

In [None]:
# Predicted class labels of the baseline forest for the held-out rows
rfc_pred = rfc.predict(X=X_test)

In [None]:
# Evaluate the baseline forest: confusion matrix on the held-out rows
from sklearn.metrics import classification_report, confusion_matrix

baseline_confusion = confusion_matrix(y_true=y_test, y_pred=rfc_pred)
print(baseline_confusion)

In [None]:
# Text report of per-class metrics for the baseline forest on the held-out rows
baseline_report = classification_report(y_true=y_test, y_pred=rfc_pred)
print(baseline_report)

<b>Find stable range of trees for later use in hyperparameter tuning</b>

In [None]:
# The forest fits its sub-estimators on a validated float32 array rather than
# on the DataFrame, so recent scikit-learn versions warn about feature names
# for every tree that is handed a DataFrame. Converting once up front avoids
# 500 warnings and 500 repeated conversions; the values each tree sees are
# the same float32 values either way.
X_test_array = X_test.to_numpy(dtype=np.float32)

# One entry per tree, each with a leading axis of length 1 so the per-tree
# probability matrices can be stacked along that axis afterwards.
predictions = [
    tree.predict_proba(X_test_array)[None, :]
    for tree in rfc.estimators_
]

In [None]:
# Join the per-tree results along the leading (tree) axis
predictions = np.concatenate(predictions, axis=0)

In [None]:
# Running mean over the tree axis: entry k is the average of the first k + 1
# per-tree probability matrices.
tree_counts = np.arange(1, predictions.shape[0] + 1)
running_total = np.cumsum(predictions, axis=0)
cum_mean = running_total / tree_counts[:, None, None]

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the forest truncated to its first 1, 2, ..., n trees.
# np.argmax returns the *column index* of the most probable class, not the
# class label itself; the probability columns follow rfc.classes_, so the
# index is mapped through it before comparing with y_test. Without the
# mapping the score is only right when the labels happen to be 0..k-1.
scores = [
    accuracy_score(y_test, rfc.classes_[np.argmax(pred, axis=1)])
    for pred in cum_mean
]

In [None]:
# Accuracy as a function of forest size. scores[0] is the accuracy with a
# single tree, so the x values start at 1; plotting scores alone would label
# every point with one tree too few.
num_trees = range(1, len(scores) + 1)

plt.figure(figsize=(10, 6))
plt.plot(num_trees, scores, linewidth=3)
plt.xlabel('num_trees')
plt.ylabel('accuracy');

<b>Evaluate feature importances and drop useless ones</b>

In [None]:
# Feature names, in the column order of the encoded feature frame
feature_list = list(df_feat2.columns)

# Importance of each feature as reported by the fitted baseline forest
importances = list(rfc.feature_importances_)

# (feature, importance) pairs, most important first. The importances are kept
# at full precision: rounding before sorting makes every feature below 0.005
# tie at 0.0 (so the ranking among them is arbitrary) and makes any later
# cumulative sum of these values inaccurate. Rounding is applied only for
# display below.
feature_importances = sorted(
    zip(feature_list, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print the ranking with importances rounded to two decimals
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, round(importance, 2)))

In [None]:
# Apply the 'fivethirtyeight' style sheet before the figure is created
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(15, 6))

# One bar position per feature
x_values = list(range(len(importances)))

# Bars follow the original column order of the feature frame
ax.bar(x_values, importances, orientation='vertical', color='r', edgecolor='k', linewidth=1.2)

# Put the feature name under each bar, written vertically
ax.set_xticks(x_values)
ax.set_xticklabels(feature_list, rotation='vertical')

# Axis labels and title
ax.set_ylabel('Importance')
ax.set_xlabel('Variable')
ax.set_title('Variable Importances');

In [None]:
# Unpack the ranked (feature, importance) pairs into two parallel lists
sorted_features = [name for name, _ in feature_importances]
sorted_importances = [value for _, value in feature_importances]

# Running total of importance, starting from the top-ranked feature
cumulative_importances = np.cumsum(sorted_importances)

# Line chart of the running total; x_values comes from the bar-chart cell
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(x_values, cumulative_importances, 'g-')

# Dashed reference line at 0.95 cumulative importance
ax.hlines(y=0.95, xmin=0, xmax=len(sorted_importances), color='r', linestyles='dashed')

# Feature names along the x axis, written vertically
ax.set_xticks(x_values)
ax.set_xticklabels(sorted_features, rotation='vertical')

# Axis labels and title
ax.set_xlabel('Variable')
ax.set_ylabel('Cumulative Importance')
ax.set_title('Cumulative Importances');

In [None]:
# Tabular form of the ranked (feature, importance) pairs
df_feat_importance = pd.DataFrame(
    data=feature_importances,
    columns=['Name', 'Importance'],
)

In [None]:
# Put the names of the first 26 rows of the ranking on the system clipboard
top_feature_names = df_feat_importance['Name'].head(26)
top_feature_names.to_clipboard()

In [None]:
# Feature columns kept for the tuned model. The list is written once and
# applied to both splits so the two frames share the same columns and order.
important_features = [
    'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
    'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
    'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15',
    'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20',
    'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25',
    'feature_26',
]

X_train_imp = X_train[important_features]

X_test_imp = X_test[important_features]

<b>Create randomized grid for hyperparameter tuning and choose best model</b>

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

In [None]:
# Number of trees in the forest: 10 evenly spaced values from 100 to 200
n_estimators = [int(x) for x in np.linspace(start=100, stop=200, num=10)]
# Number of features to consider at every split. For a classifier 'auto' was
# only an alias of 'sqrt' (so the old grid held the same option twice), and
# current scikit-learn releases reject it; 'log2' gives a second, distinct option.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Parameter lists the randomized search samples from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

In [None]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; seeding it makes each candidate's fit repeatable (the
# search's own random_state only controls which combinations are sampled).
rf = RandomForestClassifier(random_state=42)
# Randomized search: 50 sampled parameter combinations (n_iter), each scored
# with 3-fold cross validation, using all available cores
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model on the reduced feature set
rf_random.fit(X_train_imp, y_train)

In [None]:
# Show the hyperparameter combination the search selected as best
best_params = rf_random.best_params_
best_params

In [None]:
# Predict the held-out rows with the best estimator found by the search
best_random = rf_random.best_estimator_
random_predictions = best_random.predict(X=X_test_imp)

In [None]:
# Accuracy of the tuned model on the held-out rows
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=random_predictions)

In [None]:
# Confusion matrix of the tuned model on the held-out rows
tuned_confusion = confusion_matrix(y_true=y_test, y_pred=random_predictions)
print(tuned_confusion)

In [None]:
# Text report of per-class metrics for the tuned model on the held-out rows
tuned_report = classification_report(y_true=y_test, y_pred=random_predictions)
print(tuned_report)

<b>Make predictions on test set using trained model from randomized grid search</b>

In [None]:
# Read the file to be scored and drop the 'Unnamed: 0' column.
df_test = pd.read_csv('test_norm.csv')
df_feat_test = df_test.drop(columns=['Unnamed: 0'])

# Same missing-value handling as for the training data: "NULL" for the three
# categorical columns, 0 everywhere else.
test_categorical_cols = ['column_1', 'column_2', 'column_3']
df_feat_test[test_categorical_cols] = df_feat_test[test_categorical_cols].fillna("NULL")
df_feat_test = df_feat_test.fillna(0)

# Visual check that no nulls remain: the heatmap is one uniform colour when
# isnull() is False everywhere.
sns.heatmap(df_feat_test.isnull(), yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# One-hot encode the same three categorical columns as in the training data
df_feat_test2 = pd.get_dummies(df_feat_test, columns=['column_1', 'column_2', 'column_3'])

# The training and scoring files are encoded separately, so a category that
# is absent from this file produces no dummy column here and a plain column
# selection would raise KeyError. Align on exactly the columns, in exactly
# the order, that the tuned model was fitted on (X_train_imp); a dummy column
# missing from this file means the category never occurs in it, so 0 is the
# correct fill value.
# NOTE(review): a missing non-dummy column would also be filled with 0; the
# message below makes that visible so it can be checked.
model_columns = list(X_train_imp.columns)
missing_columns = [col for col in model_columns if col not in df_feat_test2.columns]
if missing_columns:
    print('Columns missing from the scoring data, filled with 0:', missing_columns)

df_feat_test3 = df_feat_test2.reindex(columns=model_columns, fill_value=0)

In [None]:
# Predicted class labels and per-class probabilities for the scoring data
test_predictions = best_random.predict(X=df_feat_test3)
test_proba = best_random.predict_proba(X=df_feat_test3)

In [None]:
# Wrap the model output in frames with explicit column names. With the
# default integer names the prediction column and the first probability
# column are both called 0, which is ambiguous once they are placed side by
# side. Probability columns follow the order of best_random.classes_.
# The index of df_test is reused so the rows line up when the frames are joined.
df_pred = pd.DataFrame(test_predictions, index=df_test.index, columns=['prediction'])
df_proba = pd.DataFrame(
    test_proba,
    index=df_test.index,
    columns=['proba_{}'.format(label) for label in best_random.classes_],
)

In [None]:
# Place the predictions and probabilities next to the original scoring rows
output_frames = [df_test, df_pred, df_proba]
test_final = pd.concat(output_frames, axis='columns')

In [None]:
# Write the scored rows to an Excel workbook in the working directory
output_path = 'YYYYMMDD RF SMOTE Holiday SR Predictions.xlsx'
test_final.to_excel(output_path)