# ML Model Selection Template - Multi-Class Classifier
This is a notebook template to be used for model selection for Multi-Class Classification problems.

## How to Use:
1. Import the dataset and set this as ```df```
2. Execute the notebook
3. Inspect the table of results at the bottom and pick the top N performing models
4. Put these models into an ensemble and run the ```Ensemble``` section of the notebook
5. When happy with the chosen model, you have the option to serialize it for export

## Table of Contents
1. Import dataset
2. Set Hyper Parameters
3. Model selection
4. Ensemble model
5. Serialize the model for output

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, f1_score
import scikitplot as skplt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
import dill
import gzip

# Import dataset

In [None]:
# Import Dataset
from sklearn.datasets import make_classification

In [None]:
# Synthetic demo data: 5000 rows, 10 features (6 informative), 5 classes.
dt = make_classification(n_samples=5000, n_features=10, n_informative=6,
                         n_classes=5, n_clusters_per_class=2, random_state=0)
# make_classification returned a (features, labels) pair; place both side by
# side in one frame with the labels as the last column.
dt = pd.concat([pd.DataFrame(part) for part in dt], axis=1)
dt.columns = [f'feature_{idx}' for idx in range(10)] + ['target']

# Set Hyper Parameters

In [None]:
# ---------------------------------------------------------------
# Notebook configuration -- adapt these values to your own data
# ---------------------------------------------------------------
# Working dataframe: replace `dt` with the name of your dataset
df = dt
# Name of the column holding the target label
targetLabel = 'target'
# Columns holding continuous variables
colCORR = [f'feature_{idx}' for idx in range(10)]

# AFTER FIRST RUN
# There is an option to select an ensemble of models.
# Use this to put together a soft voting ensemble of the top N performers

# AFTER choosing the best model
# Serialize model for output

# Model selection

In [None]:
# (rows, columns) of the working dataframe
df.shape

In [None]:
# Check data: list the column names of the working dataframe
df.columns.tolist()

In [None]:
# Preview the first rows of the working dataframe
df.head()

In [None]:
# Are there strong linear relationships between the continuous columns?
sns.set(style="white")
f, ax = plt.subplots(figsize=(20, 20))

# Pairwise correlation matrix of the columns listed in colCORR
corr = df[colCORR].corr()
# Boolean mask hiding the upper triangle (diagonal included), so each pair of
# columns is drawn only once.  The builtin `bool` is used because the
# `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Diverging colormap centred on zero correlation
cmap = sns.diverging_palette(220, 20, n=100, as_cmap=True)

sns.heatmap(corr, mask=mask, cmap=cmap, annot = True, vmax=0.75, vmin=-0.75, center=0,
            square=True, linewidths=.5, annot_kws={"size":10}, cbar_kws={"shrink": .5})
plt.show()

In [None]:
# Is the dataset unbalanced?  Bar chart of the number of rows per class.
# `X` keeps the (label, count) pairs, as in the original cell.
X = Counter(df[targetLabel]).items()
# Look counts up by their own label.  The previous version indexed the counts
# with range(len(...)) while taking the x positions from the Counter's
# insertion order, which paired bars with the wrong class and raised KeyError
# for labels that are not exactly 0..n-1.
class_counts = dict(X)
class_labels = sorted(class_counts)
plt.bar(class_labels, [class_counts[label] for label in class_labels])
plt.title("Frequency of {}".format(targetLabel))
plt.xticks(class_labels)
plt.show()

In [None]:
# Print the share of rows belonging to each class, in sorted label order.
# A plain loop replaces the list comprehension that was used only for its
# print side effect.  The format spec is '.2%': the previous '0.00%' is parsed
# as precision 0, which rounded every share to a whole percent and hid small
# imbalances between classes.
n_rows = df.shape[0]
for class_label in sorted(set(df[targetLabel])):
    n_in_class = df[df[targetLabel] == class_label].shape[0]
    print('Class {} : {:.2%}'.format(class_label, n_in_class / n_rows))

In [None]:
class multiClassClassifierFit:
    """Wrap a single classifier together with an optional grid search.

    Attributes:
        clf: the estimator instance built in the constructor.
        grid: the ``GridSearchCV`` object; it only exists after
            ``grid_search`` has been called.
    """

    def __init__(self, clf, params=None):
        """Build the estimator.

        Args:
            clf: callable (typically an estimator class) that is invoked to
                create the estimator instance.
            params: optional keyword arguments forwarded to ``clf``; when
                falsy, ``clf`` is called without arguments.
        """
        keyword_args = params if params else {}
        self.clf = clf(**keyword_args)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on ``x_train`` / ``y_train``."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        predictions = self.clf.predict(x)
        return predictions

    def grid_search(self, parameters, Kfold):
        """Create (but do not fit) a grid search around the wrapped estimator.

        Args:
            parameters: parameter grid handed to ``GridSearchCV`` as
                ``param_grid``.
            Kfold: value handed to ``GridSearchCV`` as ``cv``.
        """
        search = GridSearchCV(estimator=self.clf, param_grid=parameters,
                              cv=Kfold, verbose=1)
        self.grid = search

    def grid_fit(self, X, Y):
        """Fit the grid search created by ``grid_search`` on ``X`` / ``Y``."""
        self.grid.fit(X, Y)

    def grid_predict(self, X, Y):
        """Print a classification report of the fitted grid search on ``X``.

        ``Y`` holds the true labels the predictions are compared against.
        Nothing is returned; the report is only printed.
        """
        print("Classification Report :")
        report = classification_report(Y, self.grid.predict(X))
        print(report)

In [None]:
def multiClassClassifierCurves(estimator, title, X, y, X_test, y_test,
                           ylim=(0.5, 1.01), cv=5, n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10)):
    """Draw a 2x2 diagnostic figure for a classifier.

    Panels:
        top-left     ROC curves on the test data (skipped for NOPROBA titles)
        bottom-left  precision-recall curves on the test data (skipped for
                     NOPROBA titles)
        top-right    learning curve (training vs cross-validation score)
        bottom-right normalised confusion matrix on the test data

    Args:
        estimator: classifier used as-is to predict on ``X_test`` (so it is
            expected to be fitted already) and also passed to
            ``learning_curve``.
        title: figure title.  If it contains the substring ``'NOPROBA'`` the
            two probability-based panels are left empty and
            ``predict_proba`` is never called.
        X, y: data handed to ``learning_curve``.
        X_test, y_test: data used for the ROC, precision-recall and
            confusion-matrix panels.
        ylim: ``(low, high)`` y-axis limits of the learning-curve panel, or
            ``None`` to leave the limits untouched.
        cv: forwarded to ``learning_curve`` as ``cv``.
        n_jobs: forwarded to ``learning_curve`` as ``n_jobs``.
        train_sizes: forwarded to ``learning_curve`` as ``train_sizes``.

    Returns:
        The matplotlib figure holding the four panels.
    """
    fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(20, 11))
    fig.suptitle(title)
    if 'NOPROBA' not in title:
        # Both probability-based plots share one predict_proba call.
        y_proba = estimator.predict_proba(X_test)
        skplt.metrics.plot_roc(y_test, y_proba, plot_macro=False,
                               plot_micro=False, ax=ax[0, 0])
        ax[0, 0].grid()
        skplt.metrics.plot_precision_recall(y_test, y_proba,
                                            plot_micro=False, ax=ax[1, 0])
        ax[1, 0].grid()
    skplt.metrics.plot_confusion_matrix(y_test, estimator.predict(X_test), normalize=True, ax=ax[1, 1])

    curve_ax = ax[0, 1]
    if ylim is not None:
        curve_ax.set_ylim(*ylim)
    curve_ax.set_xlabel("Training examples")
    curve_ax.set_ylabel("Score")
    # n_jobs is now honoured; it used to be hard-coded to 1, which silently
    # ignored the caller's value.
    sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                                                      train_sizes=train_sizes, verbose=0)
    # Mean and spread across the CV folds for every training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    curve_ax.grid()
    # Shaded bands show +/- one standard deviation around each mean curve.
    curve_ax.fill_between(sizes, train_scores_mean - train_scores_std,
                          train_scores_mean + train_scores_std, alpha=0.1, color="r")
    curve_ax.fill_between(sizes, test_scores_mean - test_scores_std,
                          test_scores_mean + test_scores_std, alpha=0.1, color="g")
    curve_ax.plot(sizes, train_scores_mean, 'o-', color="r", label="Training score")
    curve_ax.plot(sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    curve_ax.set_title('Learning Rates')
    curve_ax.legend(loc="best")

    return fig

In [None]:
# Two-stage split of the data into 3 datasets:
#  - 20% held out as the validation dataset
#  - the remaining 80% -> 80% training dataset
#                      -> 20% testing dataset
y = df[targetLabel]
X = df.drop(columns=[targetLabel])

# Stage 1: carve off the validation hold-out; X / y keep the remaining 80%.
X, X_validation, y, y_validation = train_test_split(X, y, test_size=0.2, random_state=0)
# Stage 2: split the remainder into training and testing parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Logistic Regression: grid over regularisation strength, class weighting and
# multi-class strategy.
lr = multiClassClassifierFit(clf = LogisticRegression)
lr.grid_search(parameters = [{'C': np.logspace(-2, 2, 20),
                              'class_weight': ['balanced', None],
                              'solver': ['lbfgs'],
                              # NOTE(review): `multi_class` is deprecated in recent
                              # scikit-learn releases -- confirm against the installed version.
                              'multi_class': ['ovr', 'multinomial', 'auto'],
                              # listed once; the dict literal used to repeat this key
                              'random_state': [0],
                             }], Kfold = 5)
lr.grid_fit(X_train, y_train)
lr.grid_predict(X_test, y_test)
g = multiClassClassifierCurves(lr.grid.best_estimator_, "Logistic Regression", X_train, y_train, X_test, y_test)

In [None]:
# # Support Vector Machine Classifier (SVC)
# svc = multiClassClassifierFit(clf = LinearSVC)
# svc.grid_search(parameters = [{'C':np.logspace(-2,2,20),
#                               'random_state' : [0],
#                                'max_iter' : [4000]
#                               }], Kfold = 5)
# svc.grid_fit(X_train, y_train)
# svc.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(svc.grid.best_estimator_, "SVC NOPROBA", X_train, y_train, X_test, y_test)

In [None]:
# # k-Nearest Neighbours
# knn = multiClassClassifierFit(clf = KNeighborsClassifier)
# knn.grid_search(parameters = [{'n_neighbors': np.arange(1,30,2)}], Kfold = 5, scoreMetric='recall')
# knn.grid_fit(X_train, y_train)
# knn.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(knn.grid.best_estimator_, "KNN", X_train, y_train, X_test, y_test)

In [None]:
# Decision Tree: grid over the split criterion and the maximum depth
tr = multiClassClassifierFit(DecisionTreeClassifier)
tr.grid_search(
    parameters=[dict(criterion=['entropy', 'gini'],
                     max_depth=list(range(5, 26, 5)),
                     random_state=[0])],
    Kfold=5,
)
tr.grid_fit(X_train, y_train)
tr.grid_predict(X_test, y_test)
g = multiClassClassifierCurves(tr.grid.best_estimator_, "Decision tree",
                               X_train, y_train, X_test, y_test)

In [None]:
# Random Forest: grid over the split criterion and the number of trees
param_grid = dict(criterion=['entropy', 'gini'],
                  n_estimators=list(range(20, 101, 20)),
                  random_state=[0])
rf = multiClassClassifierFit(RandomForestClassifier)
rf.grid_search(Kfold=5, parameters=param_grid)
rf.grid_fit(X_train, y_train)
rf.grid_predict(X_test, y_test)
g = multiClassClassifierCurves(rf.grid.best_estimator_, "Random Forest",
                               X_train, y_train, X_test, y_test)

In [None]:
# Gradient Boosting: grid over the number of boosting stages (10, 20, ..., 100)
param_grid = dict(n_estimators=list(range(10, 101, 10)),
                  random_state=[0])
gb = multiClassClassifierFit(GradientBoostingClassifier)
gb.grid_search(Kfold=5, parameters=param_grid)
gb.grid_fit(X_train, y_train)
gb.grid_predict(X_test, y_test)
g = multiClassClassifierCurves(gb.grid.best_estimator_, "Gradient Boosting",
                               X_train, y_train, X_test, y_test)

In [None]:
# # AdaBoostClassifier
# abc1 = multiClassClassifierFit(clf = AdaBoostClassifier)
# param_grid = {}
# abc1.grid_search(parameters = param_grid, Kfold = 5)
# abc1.grid_fit(X_train, y_train)
# abc1.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(abc1.grid.best_estimator_, "AdaBoostClassifier", 
#                         X_train, y_train, X_test, y_test)

In [None]:
# # MLPClassifier
# nn = multiClassClassifierFit(clf = MLPClassifier)
# param_grid = {'hidden_layer_sizes' : [10, 100, 200, 1000, 2000, 3000, 4000],
#               'random_state' : [0],
#              'early_stopping' : [True]}
# nn.grid_search(parameters = param_grid, Kfold = 5)
# nn.grid_fit(X_train, y_train)
# nn.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(nn.grid.best_estimator_, "MLPClassifier - (Multi Layer Perceptron)", X_train, y_train, X_test, y_test)

In [None]:
# # GaussianProcessClassifier
# ''' SKLEARN help
# The advantages of Gaussian processes are:
#         The prediction interpolates the observations (at least for regular kernels).
#         The prediction is probabilistic (Gaussian) so that one can compute empirical confidence intervals and decide based on those if one should refit (online fitting, adaptive fitting) the prediction in some region of interest.
#         Versatile: different kernels can be specified. Common kernels are provided, but it is also possible to specify custom kernels.

# The disadvantages of Gaussian processes include:
#         They are not sparse, i.e., they use the whole samples/features information to perform the prediction.
#         They lose efficiency in high dimensional spaces – namely when the number of features exceeds a few dozens.
# '''
# gpc = multiClassClassifierFit(clf = GaussianProcessClassifier)
# param_grid = {'max_iter_predict' : [50, 100, 200],
#               'random_state' : [0]}
# gpc.grid_search(parameters = param_grid, Kfold = 5)
# gpc.grid_fit(X_train, y_train)
# gpc.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(gpc.grid.best_estimator_, "Gaussian Process Classifier", X_train, y_train, X_test, y_test)

In [None]:
# Gaussian Naive Bayes: nothing to tune, so the grid is empty
param_grid = dict()
NB = multiClassClassifierFit(GaussianNB)
NB.grid_search(Kfold=5, parameters=param_grid)
NB.grid_fit(X_train, y_train)
NB.grid_predict(X_test, y_test)
g = multiClassClassifierCurves(NB.grid.best_estimator_, "Gaussian Naives Bayes",
                               X_train, y_train, X_test, y_test)

In [None]:
# # QuadraticDiscriminantAnalysis
# '''
# A classifier with a quadratic decision boundary, generated by fitting class conditional densities to the data 
# and using Bayes’ rule.
# The model fits a Gaussian density to each class.
# '''
# QDA = multiClassClassifierFit(clf = QuadraticDiscriminantAnalysis)
# param_grid = {}
# QDA.grid_search(parameters = param_grid, Kfold = 5)
# QDA.grid_fit(X_train, y_train)
# QDA.grid_predict(X_test, y_test)
# g = multiClassClassifierCurves(QDA.grid.best_estimator_, "Quadratic Discriminant Analysis",X_train, y_train, X_test, y_test)

In [None]:
# One entry per trained model; 'Model' is the fitted wrapper whose grid search
# holds the best estimator.
classifier_results = [
    {'Model Type':'Linear','Classifier':'Logistic Regression','Label':'lr', 'Model':lr},
#     {'Model Type':'SVM','Classifier':'SVC','Label':'svc', 'Model':svc},
    {'Model Type':'Tree','Classifier':'Decision Tree','Label':'tr', 'Model':tr},
    {'Model Type':'Tree','Classifier':'Random Forest','Label':'rf', 'Model':rf},
    {'Model Type':'Tree','Classifier':'Gradient Boosting','Label':'gb', 'Model':gb},
#     {'Model Type':'Neural Network','Classifier':'Multi-layer Perceptron','Label':'nn', 'Model':nn},
    {'Model Type':'Probabilistic','Classifier':'Gaussian Naive Bayes','Label':'NB', 'Model':NB}
]

# Score every model on the validation hold-out.  Predictions are computed once
# per model and shared by the three metrics (they used to be recomputed for
# each metric column).
score_rows = []
for entry in classifier_results:
    validation_pred = entry['Model'].grid.best_estimator_.predict(X_validation)
    score_rows.append({
        'Model Type': entry['Model Type'],
        'Classifier': entry['Classifier'],
        'Label': entry['Label'],
        # Support-weighted recall is mathematically equal to accuracy, so
        # these two columns always agree.
        'Recall': recall_score(y_validation, validation_pred, average='weighted'),
        'Accuracy': accuracy_score(y_validation, validation_pred),
        'f1 Score': f1_score(y_validation, validation_pred, average='weighted'),
    })

# The fitted model objects are intentionally left out of the summary table.
df_res = pd.DataFrame(score_rows, columns=['Model Type', 'Classifier', 'Label',
                                           'Recall', 'Accuracy', 'f1 Score'])
df_res

# Ensemble model

In [None]:
# Choose the top N performers and combine them in a soft-voting ensemble.
# Each member is a fresh estimator built from the best parameters found by
# that model's grid search.
estimatorEnsemble = [
    (label, estimator_cls(**searched.grid.best_params_))
    for label, estimator_cls, searched in (
        ('gb', GradientBoostingClassifier, gb),
        ('rf', RandomForestClassifier, rf),
        ('tr', DecisionTreeClassifier, tr),
        ('NB', GaussianNB, NB),
    )
]
votingC = VotingClassifier(estimators=estimatorEnsemble, voting='soft').fit(X_train, y_train)
# Normalised confusion matrix of the ensemble on the test dataset
skplt.metrics.plot_confusion_matrix(y_test, votingC.predict(X_test), normalize=True)
plt.show()

In [None]:
# Show the results table with the ensemble added as an extra row.  df_res
# itself is left unchanged; the combined table is only displayed.
# Predict once and reuse the result for all three metrics.
ensemble_pred = votingC.predict(X_validation)
ensemble_row = {'Model Type': 'Ensemble',
                'Classifier': [est[0] for est in estimatorEnsemble],
                'Label': [est[0] for est in estimatorEnsemble],
                'Recall': recall_score(y_validation, ensemble_pred, average='weighted'),
                'Accuracy': accuracy_score(y_validation, ensemble_pred),
                'f1 Score': f1_score(y_validation, ensemble_pred, average='weighted'),
               }
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# is the replacement.
pd.concat([df_res, pd.DataFrame([ensemble_row])], ignore_index=True)

# Serialize the model for output

In [None]:
# # Serialise and Save the best model
# model = votingC
# filename = 'df.pk'

# with gzip.open('./models/'+filename, 'wb') as file:
#     dill.dump(model, file, recurse=True)

# # check that save and load are ok
# with gzip.open('./models/'+filename ,'rb') as f:
#     loaded_model = dill.load(f)
    
# #  check that loading and unloading the model have not altered
# QC = []
# for orig, loaded in zip( model.predict(X_validation), loaded_model.predict(X_validation)):
#     QC.append(orig==loaded)
# if [x for x in set(QC)][0] == True:
#     print('Model saved and loaded correctly')
# else:
#     raise ValueError('Model has NOT been saved and loaded correctly')