In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM

from sklearn import preprocessing 

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        Return the area under the precision-recall curve of a fitted
        classifier evaluated on (x, y).

        - clf.predict_proba(x) must return per-class probabilities; the
          second column is used as the score of the positive class
        - x = data
        - y = true labels, with 1 treated as the positive label
    '''
    positive_scores = clf.predict_proba(x)[:, 1]
    curve = precision_recall_curve(y, positive_scores, pos_label=1)

    # precision_recall_curve returns (precision, recall, thresholds);
    # the thresholds are not needed for the area.
    precision_vals, recall_vals = curve[0], curve[1]

    # Recall goes on the x-axis, precision on the y-axis.
    return auc(recall_vals, precision_vals)

In [3]:
def FP_FN_score(clf, x, y):
    '''
        Return the pair (FP, FN) read off the confusion matrix of the
        predictions clf.predict(x) against the true labels y.

        - entry [0][1] is returned as FP, entry [1][0] as FN
        - NOTE(review): this reading assumes a binary problem in which
          both classes appear in the evaluated data and the negative
          class sorts first -- confirm against the label encoding.
    '''
    predicted = clf.predict(x)
    confusion = metrics.confusion_matrix(y, predicted)
    false_pos, false_neg = confusion[0][1], confusion[1][0]
    return false_pos, false_neg

In [4]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        Cross-validate a classifier with stratified k-fold splitting,
        re-balancing the training part of every fold with a sampler
        before fitting.

        - clf = classifier; clf.fit(x, y) trains the model and it must
          offer whatever prediction method `scoring` calls

        - x = data, y = labels; both are indexed with integer index
          arrays (x[train_idx]), so they should be numpy arrays

        - cv = the number of folds in the cross validation

        - scoring(classifier, x, y) returns a pair (FP, FN) of counts

        - sampler = a resampling object in the style of the
          imbalanced-learn package. It must provide
          sampler.fit_resample(x, y) or, for older releases,
          sampler.fit_sample(x, y), returning the resampled (x, y).

        For every fold one line with the train and test counts is
        printed. Only the training indices are resampled; the held-out
        fold is scored as-is.

        Returns (FP / cv, FN / cv): the false positive and false
        negative counts on the held-out folds, averaged over the folds.
    '''

    # Newer imbalanced-learn releases expose fit_resample and no longer
    # provide fit_sample; accept either so old and new samplers work.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # No random_state here: without shuffling the split is already
    # deterministic, and recent scikit-learn rejects a seed combined
    # with shuffle=False.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    FP = 0
    FN = 0

    for train_idx, test_idx in skf.split(x, y):

        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        FP_train, FN_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        FP_test, FN_test = scoring(clf, x[test_idx], y[test_idx])

        print("Train FP: {0} Train FN: {1}; Test FP: {2} Test FN: {3}".format(FP_train,FN_train, FP_test, FN_test))

        FP += FP_test
        FN += FN_test

    return FP/cv, FN/cv

In [5]:
# Load the feature tables; '?' marks a missing value, so it is turned into
# NaN before every column is cast to float64.
x_train_o = pd.read_csv('financial_data.csv').replace('?', np.nan).astype('float64')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv").replace('?', np.nan).astype('float64')

# Attach the known labels to the features by the Var1 key. The left join keeps
# every feature row; rows without a match end up with NaN in Var66.
data_all = x_train_o.merge(y_train_o, on='Var1', how = 'left')

# Split on whether the Var66 label is present.
has_label = data_all.Var66.notnull()
data_nolabel = data_all[~has_label]
data_label = data_all[has_label]

# Separate the Var1 identifier and the Var66 label from the feature columns.
id_and_label = ['Var1', 'Var66']

data_nolabel_v = data_nolabel.drop(columns=id_and_label)
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=id_and_label)
data_label_id = data_label['Var1']

# Fill remaining gaps with column means, computed separately per subset.
# NOTE(review): the labelled means are taken over all labelled rows, i.e.
# before any cross-validation split -- confirm this is acceptable.
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [6]:
# Feature matrix: the mean-imputed feature columns of the labelled rows.
X = data_label_v_f.values
# Target vector: the Var66 column of the same labelled rows.
y = data_label['Var66'].values

In [7]:
# Rescale the feature matrix with sklearn's Normalizer.
# NOTE(review): Normalizer scales each sample (row) to unit norm; it does not
# standardise features column-wise -- confirm that is the intended scaling.
x = preprocessing.Normalizer().fit_transform(X)

In [8]:
# Seed shared by every component that accepts one, so that a re-run of this
# cell reproduces the same numbers.
SEED = 0

# Classifiers to compare, keyed by the short name printed in the report.
clfs={
    'gnb': GaussianNB(),
    'svm': SVC(),
    'lr':  LogisticRegression(),
    'rfc': RandomForestClassifier(random_state=SEED),
    'et': ExtraTreesClassifier(random_state=SEED),
    'ada': AdaBoostClassifier(random_state=SEED),
    'ml': MLPClassifier(random_state=SEED)
}
cv = 5

# (report heading, sampler class) pairs, in the order they are evaluated.
# A fresh sampler is built for every run so no state is shared between runs.
samplers = [
    ("Random over-sampling", RandomOverSampler),
    ("SMOTE over-sampling", SMOTE),
    ("ADASYN over-sampling", ADASYN),
    ("Random under-sampling", RandomUnderSampler),
]

# Evaluate every classifier under every resampling strategy and print the
# false positive / false negative counts averaged over the folds.
for clf_name, clf in clfs.items():
    print("Classifier: {0}".format(clf_name))
    for sampler_label, sampler_cls in samplers:
        print(sampler_label)
        FP, FN = imbalanced_cross_validation_score(
            clf, x, y, cv, FP_FN_score, sampler_cls(random_state=SEED))
        print("average FP: %.2f average FN: %.2f "%(FP, FN))

Classifier: gnb
Random over-sampling
Train FP: 2889 Train FN: 389; Test FP: 709 Test FN: 6
Train FP: 2811 Train FN: 377; Test FP: 722 Test FN: 7
Train FP: 2889 Train FN: 389; Test FP: 746 Test FN: 5
Train FP: 3012 Train FN: 215; Test FP: 753 Test FN: 4
Train FP: 2913 Train FN: 368; Test FP: 710 Test FN: 2
average FP: 728.00 average FN: 4.80 
SMOTE over-sampling
Train FP: 2831 Train FN: 294; Test FP: 691 Test FN: 8
Train FP: 2788 Train FN: 273; Test FP: 712 Test FN: 7
Train FP: 2804 Train FN: 327; Test FP: 734 Test FN: 6
Train FP: 2918 Train FN: 311; Test FP: 725 Test FN: 4
Train FP: 2861 Train FN: 372; Test FP: 696 Test FN: 3
average FP: 711.60 average FN: 5.60 
ADASYN over-sampling
Train FP: 2827 Train FN: 315; Test FP: 694 Test FN: 8
Train FP: 2820 Train FN: 296; Test FP: 722 Test FN: 7
Train FP: 2828 Train FN: 347; Test FP: 737 Test FN: 6
Train FP: 2945 Train FN: 316; Test FP: 735 Test FN: 4
Train FP: 2892 Train FN: 373; Test FP: 702 Test FN: 3
average FP: 718.00 average FN: 5.60 
R

Train FP: 542 Train FN: 258; Test FP: 141 Test FN: 10
Train FP: 442 Train FN: 260; Test FP: 105 Test FN: 11
Train FP: 459 Train FN: 242; Test FP: 132 Test FN: 14
average FP: 130.40 average FN: 14.20 
ADASYN over-sampling
Train FP: 445 Train FN: 273; Test FP: 120 Test FN: 17
Train FP: 542 Train FN: 176; Test FP: 164 Test FN: 21
Train FP: 504 Train FN: 319; Test FP: 124 Test FN: 11
Train FP: 554 Train FN: 222; Test FP: 130 Test FN: 12
Train FP: 457 Train FN: 253; Test FP: 125 Test FN: 11
average FP: 132.60 average FN: 14.40 
Random under-sampling
Train FP: 32 Train FN: 43; Test FP: 305 Test FN: 9
Train FP: 34 Train FN: 31; Test FP: 322 Test FN: 12
Train FP: 40 Train FN: 31; Test FP: 305 Test FN: 7
Train FP: 39 Train FN: 29; Test FP: 263 Test FN: 9
Train FP: 29 Train FN: 37; Test FP: 238 Test FN: 11
average FP: 286.60 average FN: 9.60 
