In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn import preprocessing 

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [10]:
def pr_auc_score(clf, x, y):
    '''
        This function computes area under the precision-recall curve. 
    '''
      
    precisions, recalls,_ = precision_recall_curve(y, clf.predict_proba(x)[:,1], pos_label=1)
    
    return auc(recalls, precisions)

In [11]:
def Confusion_matrix(clf, x, y):
    cm = metrics.confusion_matrix(y, clf.predict(x))
    return (cm[0][0], cm[0][1], cm[1][0], cm[1][1])

In [12]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        This function computes the cross-validation score of a given 
        classifier using a choice of sampling function to mitigate 
        the class imbalance, and stratified k-fold sampling.
        
        The first five arguments are the same as 
        sklearn.model_selection.cross_val_score.
        
        - clf.predict_proba(x) returns class label probabilities
        - clf.fit(x,y) trains the model
        
        - x = data
        
        - y = labels
        
        - cv = the number of folds in the cross validation
        
        - scoring(classifier, x, y) returns a float
        
        The last argument is a choice of random sampler: an object 
        similar to the sampler objects available from the python 
        package imbalanced-learn. In particular, this 
        object needs to have the method:
        
        sampler.fit_sample(x,y)
        
        See http://contrib.scikit-learn.org/imbalanced-learn/
        for more details and examples of other sampling objects 
        available.  
    
    '''
    
    cv_score = 0.
    train_score = 0.
    test_score = 0.
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    
    # stratified k-fold creates folds with the same ratio of positive 
    # and negative samples as the entire dataset.
    
    skf = StratifiedKFold(n_splits=cv, random_state=0, shuffle=False)
    
    for train_idx, test_idx in skf.split(x,y):
        
        xfold_train_sampled, yfold_train_sampled = sampler.fit_sample(x[train_idx],y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)
        
        TP_train, FP_train, FN_train, TN_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        TP_test, FP_test, FN_test, TN_test  = scoring(clf, x[test_idx], y[test_idx])
        
        print("Train FP: {0} Train FN: {1}; Test FP: {2} Test FN: {3}".format(FP_train, FN_train, FP_test, FN_test))
        
        TP += TP_test
        FP += FP_test
        FN += FN_test
        TN += TN_test
        

    ave_tp = TP/cv
    ave_fp = FP/cv
    ave_fn = FN/cv
    ave_tn = TN/cv
    
    
    sensitivity = ave_tp/(ave_tp + ave_fn)
    specificity = ave_tn/(ave_fp + ave_tn)
    
    g_mean = math.sqrt(sensitivity * specificity)
    mcc = (ave_tp * ave_tn - ave_fp * ave_fn)/math.sqrt((ave_tp + ave_fp) * (ave_tp + ave_fn) * (ave_tn + ave_fp) * (ave_tn + ave_fn))
    
    values = [sensitivity, specificity, g_mean, mcc]
    
        
    return values

In [13]:
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv")

x_train_o.replace('?', np.nan, inplace=True)
x_train_o = x_train_o.astype('float64')


x_test_o.replace('?', np.nan, inplace=True)
x_test_o = x_test_o.astype('float64')

data_all = x_train_o.merge(y_train_o, on='Var1', how = 'left')

data_nolabel = data_all[data_all.Var66.isnull()]
data_label = data_all[data_all.Var66.notnull()]

data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=['Var1', 'Var66'])
data_label_id = data_label['Var1']

data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [14]:
X = data_label_v_f.values
y = data_label['Var66'].values

In [15]:
x = preprocessing.MinMaxScaler().fit_transform(X)

In [16]:
clfs={
    'gnb': GaussianNB(),
    'svm': SVC(),
    'lr':  LogisticRegression(),
    'rfc': RandomForestClassifier(),
    'et': ExtraTreesClassifier(),
    'ada': AdaBoostClassifier(),
    'ml': MLPClassifier(),
    'bb': BalancedBaggingClassifier(base_estimator = RandomForestClassifier(criterion='gini'), 
                                          n_estimators = 15, bootstrap = True)
    
}
cv = 10
for clf_name in clfs:
    print("Classifier: {0}".format(clf_name))
    # Logistic regression score with Random Over-sampling
    print("Random over-sampling")
    values = imbalanced_cross_validation_score(clfs[clf_name], x, y, cv, Confusion_matrix, RandomOverSampler())
    print("sensitivity: %.2f | specificity: %.2f | g_mean: %.2f | mcc: %.2f"%(values[0], values[1], values[2], values[3]))

    # Logistic regression score with SMOTE
    print("SMOTE over-sampling")
    values = imbalanced_cross_validation_score(clfs[clf_name], x, y, cv, Confusion_matrix, SMOTE())
    print("sensitivity: %.2f | specificity: %.2f | g_mean: %.2f | mcc: %.2f"%(values[0], values[1], values[2], values[3]))

    # Logistic regression score with ADASYN
    print("ADASYN over-sampling")
    values = imbalanced_cross_validation_score(clfs[clf_name], x, y, cv, Confusion_matrix, ADASYN())
    print("sensitivity: %.2f | specificity: %.2f | g_mean: %.2f | mcc: %.2f"%(values[0], values[1], values[2], values[3]))

    # Logistic regression score with Random Under Sampling
    print("Random under-sampling")
    values = imbalanced_cross_validation_score(clfs[clf_name], x, y, cv, Confusion_matrix, RandomUnderSampler())
    print("sensitivity: %.2f | specificity: %.2f | g_mean: %.2f | mcc: %.2f"%(values[0], values[1], values[2], values[3]))

    print("-"*70)
    print()
    

Classifier: gnb
Random over-sampling
Train FP: 3880 Train FN: 27; Test FP: 432 Test FN: 1
Train FP: 3887 Train FN: 46; Test FP: 424 Test FN: 1
Train FP: 3846 Train FN: 89; Test FP: 425 Test FN: 2
Train FP: 3873 Train FN: 69; Test FP: 428 Test FN: 0
Train FP: 3863 Train FN: 105; Test FP: 435 Test FN: 2
Train FP: 3833 Train FN: 101; Test FP: 438 Test FN: 1
Train FP: 3822 Train FN: 115; Test FP: 422 Test FN: 2
Train FP: 3819 Train FN: 51; Test FP: 427 Test FN: 1
Train FP: 3866 Train FN: 54; Test FP: 427 Test FN: 4
Train FP: 3878 Train FN: 19; Test FP: 435 Test FN: 0
sensitivity: 0.97 | specificity: 0.04 | g_mean: 0.18 | mcc: 0.00
SMOTE over-sampling
Train FP: 3780 Train FN: 84; Test FP: 419 Test FN: 1
Train FP: 3748 Train FN: 96; Test FP: 410 Test FN: 1
Train FP: 3727 Train FN: 107; Test FP: 417 Test FN: 2
Train FP: 3745 Train FN: 105; Test FP: 422 Test FN: 2
Train FP: 3688 Train FN: 102; Test FP: 417 Test FN: 2
Train FP: 3721 Train FN: 97; Test FP: 426 Test FN: 2
Train FP: 3669 Train FN:

Train FP: 1 Train FN: 0; Test FP: 8 Test FN: 12
Train FP: 2 Train FN: 0; Test FP: 9 Test FN: 9
Train FP: 1 Train FN: 4; Test FP: 7 Test FN: 10
Train FP: 1 Train FN: 4; Test FP: 4 Test FN: 10
Train FP: 3 Train FN: 1; Test FP: 6 Test FN: 9
Train FP: 1 Train FN: 2; Test FP: 6 Test FN: 8
Train FP: 2 Train FN: 2; Test FP: 5 Test FN: 12
Train FP: 0 Train FN: 3; Test FP: 2 Test FN: 11
Train FP: 1 Train FN: 1; Test FP: 5 Test FN: 13
Train FP: 0 Train FN: 0; Test FP: 1 Test FN: 11
sensitivity: 0.98 | specificity: 0.55 | g_mean: 0.73 | mcc: 0.44
ADASYN over-sampling
Train FP: 0 Train FN: 0; Test FP: 6 Test FN: 13
Train FP: 0 Train FN: 0; Test FP: 7 Test FN: 11
Train FP: 2 Train FN: 3; Test FP: 9 Test FN: 10
Train FP: 1 Train FN: 1; Test FP: 4 Test FN: 12
Train FP: 0 Train FN: 3; Test FP: 3 Test FN: 9
Train FP: 1 Train FN: 0; Test FP: 4 Test FN: 8
Train FP: 3 Train FN: 0; Test FP: 4 Test FN: 14
Train FP: 0 Train FN: 0; Test FP: 2 Test FN: 10
Train FP: 2 Train FN: 1; Test FP: 2 Test FN: 11
Train F

Train FP: 1242 Train FN: 1049; Test FP: 149 Test FN: 7
Train FP: 1689 Train FN: 687; Test FP: 195 Test FN: 2
Train FP: 1290 Train FN: 947; Test FP: 132 Test FN: 9
Train FP: 1256 Train FN: 1081; Test FP: 152 Test FN: 5
sensitivity: 0.98 | specificity: 0.07 | g_mean: 0.26 | mcc: 0.13
Random under-sampling
Train FP: 31 Train FN: 69; Test FP: 125 Test FN: 10
Train FP: 50 Train FN: 49; Test FP: 199 Test FN: 5
Train FP: 35 Train FN: 75; Test FP: 103 Test FN: 5
Train FP: 54 Train FN: 50; Test FP: 162 Test FN: 5
Train FP: 41 Train FN: 58; Test FP: 142 Test FN: 8
Train FP: 51 Train FN: 47; Test FP: 160 Test FN: 6
Train FP: 45 Train FN: 52; Test FP: 165 Test FN: 8
Train FP: 50 Train FN: 56; Test FP: 142 Test FN: 8
Train FP: 46 Train FN: 58; Test FP: 134 Test FN: 8
Train FP: 40 Train FN: 58; Test FP: 150 Test FN: 10
sensitivity: 0.98 | specificity: 0.06 | g_mean: 0.25 | mcc: 0.10
----------------------------------------------------------------------

Classifier: bb
Random over-sampling
Train FP: 