In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM

from sklearn import preprocessing 

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        Area under the precision-recall curve of a fitted classifier.

        - clf = classifier exposing predict_proba(x); column 1 of its
          output is used as the score of the positive class

        - x = data

        - y = true labels, with 1 taken as the positive label

        Returns the area obtained by integrating precision over recall.
    '''

    # Score of the positive class only: second column of predict_proba.
    positive_scores = clf.predict_proba(x)[:, 1]

    precision, recall, _thresholds = precision_recall_curve(
        y, positive_scores, pos_label=1
    )

    # Recall is the x-axis of the curve, precision the y-axis.
    return auc(recall, precision)

In [3]:
def FP_FN_score(clf, x, y):
    '''
        Count the false positives and false negatives of a fitted
        classifier on (x, y).

        - clf = classifier exposing predict(x)

        - x = data

        - y = true labels

        The counts are read from the confusion matrix of y against
        clf.predict(x): entry [0][1] is returned as FP and entry [1][0]
        as FN, i.e. the second label in sorted order is treated as the
        positive class.

        Returns the pair (FP, FN).
    '''
    predictions = clf.predict(x)
    confusion = metrics.confusion_matrix(y, predictions)

    # Rows are true labels, columns are predicted labels.
    false_positives, false_negatives = confusion[0][1], confusion[1][0]
    return false_positives, false_negatives

In [4]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        Cross-validate a classifier on imbalanced data, re-sampling only
        the training part of each stratified fold.

        For every fold the training split is re-balanced with `sampler`,
        `clf` is fitted on the re-sampled data, and `scoring` is evaluated
        on both the re-sampled training split and the untouched test
        split. The per-fold counts are printed; only the test counts
        contribute to the returned averages.

        - clf = classifier; clf.fit(x, y) trains the model. The same
          object is re-fitted in place on every fold.

        - x = data, indexable with an integer index array
          (e.g. a numpy array)

        - y = labels, indexable the same way as x

        - cv = the number of folds in the cross validation

        - scoring(classifier, x, y) returns a pair (FP, FN), e.g.
          FP_FN_score

        - sampler = a re-sampling object similar to the ones available
          from the python package imbalanced-learn. It needs to have
          the method sampler.fit_resample(x, y); the legacy name
          sampler.fit_sample(x, y) is used as a fallback for old
          sampler objects.

        See https://imbalanced-learn.org/ for more details and examples
        of other sampling objects available.

        Returns the pair (mean test FP, mean test FN) over the cv folds.
    '''

    # imbalanced-learn renamed fit_sample to fit_resample and later
    # removed the old name; prefer the new one but keep working with
    # sampler objects that only provide the legacy method.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample

    FP = 0
    FN = 0

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # random_state is deliberately not passed: without shuffling it has
    # no effect, and recent scikit-learn releases raise a ValueError
    # when it is combined with shuffle=False.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    for train_idx, test_idx in skf.split(x, y):

        # Re-sample the training split only, so the test split keeps
        # the original class ratio.
        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        FP_train, FN_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        FP_test, FN_test = scoring(clf, x[test_idx], y[test_idx])

        print("Train FP: {0} Train FN: {1}; Test FP: {2} Test FN: {3}".format(FP_train,FN_train, FP_test, FN_test))

        FP += FP_test
        FN += FN_test

    return FP/cv, FN/cv

In [5]:
# Raw inputs: feature table, label table and the separate testing features.
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')
x_test_o = pd.read_csv("testing_data.csv")

# '?' placeholders become NaN so that every column can be cast to float.
x_train_o = x_train_o.replace('?', np.nan).astype('float64')
x_test_o = x_test_o.replace('?', np.nan).astype('float64')

# Attach the label table to the features by the key column 'Var1'; rows
# without a match keep NaN in 'Var66'.
data_all = x_train_o.merge(y_train_o, on='Var1', how='left')

# Split on whether 'Var66' is present.
data_nolabel = data_all[data_all['Var66'].isnull()]
data_label = data_all[data_all['Var66'].notnull()]

# Unlabelled rows: feature columns and ids.
data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])
data_nolabel_id = data_nolabel['Var1']

# Labelled rows: feature columns and ids.
data_label_v = data_label.drop(columns=['Var1', 'Var66'])
data_label_id = data_label['Var1']

# Fill the remaining gaps with the column means of the respective subset.
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [8]:
# Feature matrix and 'Var66' label vector of the labelled rows, as arrays.
X, y = data_label_v_f.values, data_label['Var66'].values

In [9]:
# Rescale every feature with a MinMaxScaler (default range).
# NOTE(review): the scaler is fitted on all labelled rows, i.e. before the
# cross-validation folds are formed.
x = preprocessing.MinMaxScaler().fit(X).transform(X)

In [10]:
# One seed for every stochastic sampler and classifier, so that repeated
# runs give the same numbers and every classifier is compared on the same
# re-sampled folds.
SEED = 0

# Classifiers to compare, keyed by the short name printed in the report.
clfs = {
    'gnb': GaussianNB(),
    'svm': SVC(),
    'lr':  LogisticRegression(random_state=SEED),
    'rfc': RandomForestClassifier(random_state=SEED),
    'et': ExtraTreesClassifier(random_state=SEED),
    'ada': AdaBoostClassifier(random_state=SEED),
    'ml': MLPClassifier(random_state=SEED)
}

# (heading printed before the run, sampler class) for each re-sampling
# strategy applied to the training folds.
samplers = [
    ("Random over-sampling", RandomOverSampler),
    ("SMOTE over-sampling", SMOTE),
    ("ADASYN over-sampling", ADASYN),
    ("Random under-sampling", RandomUnderSampler),
]

cv = 5
for clf_name in clfs:
    print("Classifier: {0}".format(clf_name))

    for sampler_label, sampler_cls in samplers:
        print(sampler_label)
        # A fresh sampler per run, as each run should start from the
        # same seeded state.
        FP, FN = imbalanced_cross_validation_score(
            clfs[clf_name], x, y, cv, FP_FN_score, sampler_cls(random_state=SEED))
        print("average FP: %.2f average FN: %.2f "%(FP, FN))

Classifier: gnb
Random over-sampling
Train FP: 3457 Train FN: 63; Test FP: 862 Test FN: 2
Train FP: 3366 Train FN: 77; Test FP: 846 Test FN: 4
Train FP: 3366 Train FN: 81; Test FP: 865 Test FN: 4
Train FP: 3332 Train FN: 86; Test FP: 831 Test FN: 3
Train FP: 3443 Train FN: 45; Test FP: 865 Test FN: 4
average FP: 853.80 average FN: 3.40 
SMOTE over-sampling
Train FP: 3350 Train FN: 85; Test FP: 830 Test FN: 2
Train FP: 3259 Train FN: 102; Test FP: 826 Test FN: 5
Train FP: 3248 Train FN: 121; Test FP: 841 Test FN: 6
Train FP: 3249 Train FN: 99; Test FP: 802 Test FN: 4
Train FP: 3318 Train FN: 68; Test FP: 824 Test FN: 5
average FP: 824.60 average FN: 4.40 
ADASYN over-sampling
Train FP: 3322 Train FN: 93; Test FP: 822 Test FN: 2
Train FP: 3269 Train FN: 92; Test FP: 827 Test FN: 5
Train FP: 3234 Train FN: 133; Test FP: 829 Test FN: 6
Train FP: 3252 Train FN: 97; Test FP: 803 Test FN: 4
Train FP: 3322 Train FN: 79; Test FP: 829 Test FN: 5
average FP: 822.00 average FN: 4.40 
Random under-

Train FP: 1318 Train FN: 801; Test FP: 364 Test FN: 7
Train FP: 1170 Train FN: 895; Test FP: 280 Test FN: 13
average FP: 343.00 average FN: 11.60 
ADASYN over-sampling
Train FP: 1305 Train FN: 666; Test FP: 323 Test FN: 13
Train FP: 972 Train FN: 830; Test FP: 272 Test FN: 16
Train FP: 1157 Train FN: 875; Test FP: 284 Test FN: 14
Train FP: 1118 Train FN: 887; Test FP: 299 Test FN: 11
Train FP: 1038 Train FN: 973; Test FP: 246 Test FN: 14
average FP: 284.80 average FN: 13.60 
Random under-sampling
Train FP: 61 Train FN: 41; Test FP: 407 Test FN: 12
Train FP: 48 Train FN: 45; Test FP: 328 Test FN: 8
Train FP: 37 Train FN: 47; Test FP: 293 Test FN: 13
Train FP: 46 Train FN: 55; Test FP: 305 Test FN: 17
Train FP: 25 Train FN: 58; Test FP: 282 Test FN: 18
average FP: 323.00 average FN: 13.60 
