In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.options.display.max_columns = 999
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, \
                             AdaBoostClassifier, \
                             GradientBoostingClassifier, \
                             ExtraTreesClassifier, \
                             BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

from sklearn.metrics import roc_curve, auc, confusion_matrix

import re

import seaborn as sns
sns.set()

from sklearn.externals import joblib as jl

In [2]:
def predict_threshold(predict_proba, threshold):
    """Convert class-probability rows into hard 0/1 labels.

    Parameters
    ----------
    predict_proba : array-like, 2-D with at least two columns
        Only column 1 is read; in this file it is the output of a
        classifier's ``predict_proba``.
    threshold : float
        Cut-off applied to column 1.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per input row: 1 where column 1 is
        strictly greater than ``threshold``, 0 otherwise (a value equal to
        the threshold maps to 0).
    """
    # Vectorised comparison; also yields an integer dtype for zero rows.
    positive_column = np.asarray(predict_proba)[:, 1]
    return (positive_column > threshold).astype(int)

def process_persample(classifier, train_data, train_label, test_data, test_label, th):
    """Fit ``classifier`` on the training split and score it on one other split.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place on every call.
    train_data, train_label : features and labels used for fitting.
    test_data, test_label : features and labels used for scoring.
    th : float
        Threshold passed to ``predict_threshold`` to obtain hard labels.

    Returns
    -------
    tuple
        ``(score, confusion, roc)`` where ``score`` is the accuracy rounded
        to three decimals, ``confusion`` is the confusion matrix of the
        thresholded predictions, and ``roc`` is the list
        ``[fpr, tpr, thresholds, roc_auc]`` computed from column 1 of the
        predicted probabilities.
    """
    classifier.fit(train_data, train_label)
    probabilities = classifier.predict_proba(test_data)
    hard_labels = predict_threshold(probabilities, th)

    confusion = confusion_matrix(test_label, hard_labels, labels=None)
    accuracy = np.round(accuracy_score(test_label, hard_labels), 3)

    # The ROC curve uses the raw column-1 scores, not the thresholded labels.
    fpr, tpr, thresholds = roc_curve(test_label, probabilities[:, 1])
    roc_summary = [fpr, tpr, thresholds, auc(fpr, tpr)]

    return accuracy, confusion, roc_summary

def process_classifiers(classifiers, df_train, df_test, df_validate, y_train, y_test, y_validate, th):
    """Fit every classifier, plot its ROC curves and tabulate its accuracies.

    Each estimator in ``classifiers`` is handed to ``process_persample`` once
    per evaluation split ("test", then "validate"), always with the same
    training data. The two ROC curves of an estimator are drawn on one panel
    of a three-column subplot grid sized to the number of estimators.

    Parameters
    ----------
    classifiers : iterable of estimators
        Objects accepted by ``process_persample``.
    df_train, y_train : training features and labels.
    df_test, y_test : features and labels of the "test" split.
    df_validate, y_validate : features and labels of the "validate" split.
    th : float
        Threshold forwarded to ``process_persample``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier, indexed by the estimator's class name, with
        columns ``'test accuracy'`` and ``'validate_accuracy'``. Empty (same
        columns) when ``classifiers`` is empty.
    """
    columns = ['test accuracy', 'validate_accuracy']

    # Use the argument itself rather than a module-level list of the same kind.
    classifiers = list(classifiers)
    if not classifiers:
        # A zero-row subplot grid is invalid; there is nothing to fit or draw.
        return pd.DataFrame(columns=columns)

    # A tuple keeps the split order (and so the column order) explicit.
    splits = (("test", df_test, y_test), ("validate", df_validate, y_validate))
    names = [type(clf).__name__ for clf in classifiers]

    n_cols = 3
    n_rows = int(np.ceil(len(classifiers) / n_cols))
    accuracy_table = []

    with sns.axes_style("darkgrid"):
        fig, axes = plt.subplots(n_rows, n_cols, sharex='all', sharey='all',
                                 figsize=(12, 4 * n_rows))
        panels = axes.flatten()

        for ax, clf, name in zip(panels, classifiers, names):
            row = []

            for split_name, features, labels in splits:
                # NOTE: process_persample refits the estimator on every call.
                acc, _conf_m, roc_ = process_persample(clf, df_train, y_train,
                                                       features, labels, th)
                row.append(acc)

                # Prepend 0 to both rate arrays so the drawn curve starts at the origin.
                x_roc = np.insert(roc_[0], 0, 0, axis=0)
                y_roc = np.insert(roc_[1], 0, 0, axis=0)
                ax.plot(x_roc, y_roc, lw=1,
                        label='ROC %s (area = %0.4f)' % (split_name, roc_[3]))

            ax.set_title(name, size=13)
            ax.legend(loc=4, fontsize=11)
            accuracy_table.append(row)

            ax.set_xlim(-0.1, 1.1)
            ax.set_ylim(-0.1, 1.1)
            # axis("equal") equalises scaling by changing the axis limits, which
            # would undo the limits above and is rejected by recent matplotlib
            # when both axes are shared; resize the box instead.
            ax.set_aspect("equal", adjustable="box")

        # Hide grid cells left over when the count is not a multiple of n_cols.
        for spare in panels[len(classifiers):]:
            spare.set_visible(False)

    return pd.DataFrame(np.array(accuracy_table), columns=columns, index=names)


# Seed shared by every estimator that accepts ``random_state`` so that
# re-fitting the same estimator on the same data reproduces the same model.
RANDOM_STATE = 42

# Candidate models compared by process_classifiers; apart from the seed they
# all use the library defaults.
classifier = [LogisticRegression(random_state=RANDOM_STATE),
              DecisionTreeClassifier(random_state=RANDOM_STATE),
              RandomForestClassifier(random_state=RANDOM_STATE),
              AdaBoostClassifier(random_state=RANDOM_STATE),
              GradientBoostingClassifier(random_state=RANDOM_STATE),
              ExtraTreesClassifier(random_state=RANDOM_STATE),
              BaggingClassifier(random_state=RANDOM_STATE),
              KNeighborsClassifier(),  # has no random_state parameter
              GaussianNB()             # has no random_state parameter
             ]

In [None]:
# Compare all candidate models; 0.5 is the probability threshold (`th`).
# NOTE(review): X_train, X_test, X_validate, y_train, y_test and y_validate
# are not defined in any cell visible here -- they must be created before
# this cell can run on a fresh kernel.
process_classifiers(classifier, X_train, X_test, X_validate, y_train, y_test, y_validate, 0.5)