In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        Return the area under the precision-recall curve of a classifier.

        - clf.predict_proba(x) must return class probabilities; the second
          column is used as the score of each sample

        - x = data

        - y = true labels, where the label 1 is taken as the positive class
    '''
    positive_scores = clf.predict_proba(x)[:, 1]
    precision_vals, recall_vals, _thresholds = precision_recall_curve(
        y, positive_scores, pos_label=1
    )

    # integrate precision over recall (recall is the x-axis of the curve)
    return auc(recall_vals, precision_vals)

In [11]:
def FP_FN_score(clf, x, y):
    '''
        Return the pair (FP, FN): the off-diagonal entries of the confusion
        matrix obtained by comparing clf.predict(x) with the true labels y.

        clf.predict is called directly, so clf has to be fitted already.
    '''
    confusion = metrics.confusion_matrix(y, clf.predict(x))

    # scikit-learn convention: rows are true labels, columns are predicted
    # labels, so [0, 1] counts false positives and [1, 0] false negatives
    false_positives = confusion[0, 1]
    false_negatives = confusion[1, 0]
    return false_positives, false_negatives

In [22]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        This function cross-validates a given classifier using a choice
        of sampling function to mitigate the class imbalance, and
        stratified k-fold sampling. Only the training part of each fold
        is resampled; the held-out part is scored as-is.

        The arguments mirror sklearn.model_selection.cross_val_score,
        except that the scorer returns a pair instead of a float.

        - clf.fit(x,y) trains the model; clf is re-fitted in place on
          every fold

        - x = data, indexable by an integer array (e.g. a numpy array)

        - y = labels, indexable by an integer array (e.g. a numpy array)

        - cv = the number of folds in the cross validation

        - scoring(classifier, x, y) returns a pair (FP, FN)

        The last argument is a choice of random sampler: an object
        similar to the sampler objects available from the python
        package imbalanced-learn. In particular, this
        object needs to have the method:

        sampler.fit_resample(x,y)

        (older imbalanced-learn releases call it sampler.fit_sample(x,y);
        that name is used as a fallback).

        See https://imbalanced-learn.org/
        for more details and examples of other sampling objects
        available.

        One line with the train/test FP and FN counts is printed per fold.

        Returns the pair (FP/cv, FN/cv): the test-fold FP and FN counts
        averaged over the cv folds.
    '''

    # imbalanced-learn renamed fit_sample to fit_resample and later removed
    # the old name; support samplers exposing either one.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample

    FP = 0
    FN = 0

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # random_state is deliberately not passed: without shuffling it has no
    # effect on the folds, and recent scikit-learn rejects the combination.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    for train_idx, test_idx in skf.split(x, y):

        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        FP_train, FN_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        FP_test, FN_test = scoring(clf, x[test_idx], y[test_idx])

        print("Train FP: {0} Train FN: {1}; Test FP: {2} Test FN: {3}".format(FP_train,FN_train, FP_test, FN_test))

        FP += FP_test
        FN += FN_test

    return FP/cv, FN/cv

In [19]:
def _read_numeric_csv(path):
    # read a CSV, turn the '?' placeholders into NaN and cast every column to float64
    return pd.read_csv(path).replace('?', np.nan).astype('float64')


x_train_o = _read_numeric_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = _read_numeric_csv("testing_data.csv")

# left join on Var1: rows without a match in revealed_businesses.csv get NaN in Var66
data_all = pd.merge(x_train_o, y_train_o, on='Var1', how='left')

has_label = data_all['Var66'].notnull()
data_nolabel = data_all[~has_label]
data_label = data_all[has_label]

# Var1 (join key) and Var66 are dropped from the value frames
id_and_label = ['Var1', 'Var66']

data_nolabel_v = data_nolabel.drop(columns=id_and_label)
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=id_and_label)
data_label_id = data_label['Var1']

# fill the remaining NaNs with the column means of the same subset
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [20]:
# numpy arrays for the labelled rows: mean-filled values and the Var66 column
x = data_label_v_f.values
y = data_label.Var66.values

In [26]:
# Fixed seed for everything stochastic below, so that re-running the cell
# reproduces the same FP/FN numbers.
RANDOM_STATE = 0

# classifiers to compare, keyed by the short name printed in the report
clfs={
    'gnb': GaussianNB(),
    'svm': SVC(),
    'lr':  LogisticRegression(),
    'rfc': RandomForestClassifier(random_state=RANDOM_STATE),
    'et': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'ada': AdaBoostClassifier(random_state=RANDOM_STATE),
    'ml': MLPClassifier(random_state=RANDOM_STATE)
}

# (heading printed in the report, sampler class) for each resampling strategy;
# a fresh sampler is built for every classifier/strategy combination
samplers = [
    ("Random over-sampling", RandomOverSampler),
    ("SMOTE over-sampling", SMOTE),
    ("ADASYN over-sampling", ADASYN),
    ("Random under-sampling", RandomUnderSampler),
]

cv = 5
for clf_name in clfs:
    print("Classifier: {0}".format(clf_name))
    for sampler_label, sampler_cls in samplers:
        # average test-fold FP/FN of this classifier under this strategy
        print(sampler_label)
        FP, FN = imbalanced_cross_validation_score(
            clfs[clf_name], x, y, cv, FP_FN_score,
            sampler_cls(random_state=RANDOM_STATE))
        print("average FP: %.2f average FN: %.2f "%(FP, FN))

Classifier: gnb
Random over-sampling
Train FP: 3507 Train FN: 117; Test FP: 867 Test FN: 1
Train FP: 3399 Train FN: 28; Test FP: 849 Test FN: 2
Train FP: 3406 Train FN: 84; Test FP: 873 Test FN: 3
Train FP: 3368 Train FN: 79; Test FP: 843 Test FN: 3
Train FP: 3461 Train FN: 25; Test FP: 875 Test FN: 4
average FP: 861.40 average FN: 2.60 
SMOTE over-sampling
Train FP: 3424 Train FN: 87; Test FP: 847 Test FN: 1
Train FP: 3327 Train FN: 72; Test FP: 833 Test FN: 2
Train FP: 3337 Train FN: 56; Test FP: 859 Test FN: 3
Train FP: 3305 Train FN: 74; Test FP: 822 Test FN: 3
Train FP: 3387 Train FN: 46; Test FP: 853 Test FN: 5
average FP: 842.80 average FN: 2.80 
ADASYN over-sampling
Train FP: 3432 Train FN: 71; Test FP: 850 Test FN: 1
Train FP: 3319 Train FN: 71; Test FP: 834 Test FN: 2
Train FP: 3342 Train FN: 64; Test FP: 858 Test FN: 3
Train FP: 3308 Train FN: 61; Test FP: 823 Test FN: 3
Train FP: 3393 Train FN: 43; Test FP: 853 Test FN: 4
average FP: 843.60 average FN: 2.60 
Random under-sa

Train FP: 690 Train FN: 662; Test FP: 193 Test FN: 19
Train FP: 716 Train FN: 418; Test FP: 194 Test FN: 16
Train FP: 1286 Train FN: 159; Test FP: 312 Test FN: 7
Train FP: 552 Train FN: 885; Test FP: 146 Test FN: 14
Train FP: 508 Train FN: 468; Test FP: 129 Test FN: 16
average FP: 194.80 average FN: 14.40 
Random under-sampling
Train FP: 34 Train FN: 33; Test FP: 324 Test FN: 13
Train FP: 16 Train FN: 43; Test FP: 207 Test FN: 21
Train FP: 58 Train FN: 47; Test FP: 446 Test FN: 15
Train FP: 17 Train FN: 49; Test FP: 237 Test FN: 16
Train FP: 78 Train FN: 50; Test FP: 524 Test FN: 17
average FP: 347.60 average FN: 16.40 
