In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn.metrics import matthews_corrcoef

from sklearn import preprocessing

#auto encoding
from pyod.models.pca import PCA
from pyod.models.ocsvm import OCSVM
from pyod.models.knn import KNN
from pyod.models.abod import ABOD
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.auto_encoder import AutoEncoder

from pyod.utils.utility import *
from sklearn.utils.validation import *
from sklearn.metrics.classification import *
from sklearn.metrics.ranking import *

# standard-library helpers; imported after the star imports above so that
# those cannot rebind these names
import os
from threading import Thread
import time

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
def pr_auc_score(clf, x, y):
    '''
        Area under the precision-recall curve for a probabilistic classifier.

        :param clf: estimator exposing ``predict_proba``; it is not fitted
            here, so it has to be fitted by the caller beforehand
        :param x: samples handed to ``clf.predict_proba``
        :param y: ground-truth labels; label ``1`` is the positive class
        :return: ``auc`` evaluated with recall on the x-axis and precision
            on the y-axis
    '''
    # Column 1 of the predict_proba output is used as the positive-class score.
    positive_scores = clf.predict_proba(x)[:, 1]

    # precision_recall_curve returns (precision, recall, thresholds);
    # the thresholds are not needed for the area.
    curve = precision_recall_curve(y, positive_scores, pos_label=1)
    precision_pts, recall_pts = curve[0], curve[1]

    return auc(recall_pts, precision_pts)

In [3]:
def Confusion_matrix(clf, x, y):
    """
    Confusion-matrix cells and Matthews correlation coefficient of ``clf``.

    :param clf: estimator exposing ``predict``; it is not fitted here
    :param x: samples to predict on
    :param y: ground-truth labels for ``x``

    :return: ``(cm[0][0], cm[0][1], cm[1][0], cm[1][1], mcc)`` where ``cm``
        is ``metrics.confusion_matrix(y, predictions)``. With two classes
        sorted as (negative, positive) this reads (TN, FP, FN, TP, MCC),
        which is how the caller in this notebook unpacks it.
    :rtype: tuple
    """
    # Predict once and reuse the result: the matrix and the MCC are then
    # guaranteed to describe the same predictions, at half the inference cost.
    y_pred = clf.predict(x)

    cm = metrics.confusion_matrix(y, y_pred)
    mcc = matthews_corrcoef(y, y_pred)

    return (cm[0][0], cm[0][1], cm[1][0], cm[1][1], mcc)

In [4]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    """
    Stratified k-fold evaluation where only the training fold is resampled.

    For every fold the training part is passed through ``sampler``, ``clf``
    is fitted on the resampled data, and ``scoring`` is evaluated on both the
    resampled training data (printed only) and the untouched test part
    (printed and accumulated).

    :param clf: estimator exposing ``fit``; it is refitted on every fold
    :param x: samples; indexed as ``x[train_idx]`` with integer index arrays
    :param y: labels; indexed the same way as ``x``
    :param cv: number of folds
    :type cv: int
    :param scoring: callable ``scoring(clf, x, y)`` returning the 5-tuple
        ``(TN, FP, FN, TP, MCC)``
    :param sampler: resampler exposing ``fit_resample`` (or the older
        ``fit_sample``) that returns ``(x_resampled, y_resampled)``

    :return: ``[sensitivity, specificity, g_mean, ave_mcc]`` computed from
        the test-fold counts averaged over the ``cv`` folds
    :rtype: list
    """
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    MCC = 0

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # No random_state is passed: it has no effect without shuffling, and
    # newer scikit-learn releases reject the combination with a ValueError.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    # imbalanced-learn renamed fit_sample to fit_resample and later removed
    # the old name; prefer the new one and fall back for old installations.
    resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample

    for train_idx, test_idx in skf.split(x, y):

        # Resample the training part only; the test part keeps its
        # original class distribution.
        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        TN_train, FP_train, FN_train, TP_train, mcc_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        TN_test, FP_test, FN_test, TP_test, mcc_test = scoring(clf, x[test_idx], y[test_idx])
        # tn, fp, fn, tp
        print("Train TP: {0} Train FP: {1} Train FN: {2} Train TN: {3}; Test TP: {4} Test FP: {5} Test FN: {6} Test TN: {7}".format(TP_train, FP_train, FN_train, TN_train, TP_test, FP_test, FN_test, TN_test))
        print("MCC train: {0} and MCC test: {1}".format(mcc_train, mcc_test))

        # Only the test-fold results contribute to the returned metrics.
        TP += TP_test
        FP += FP_test
        FN += FN_test
        TN += TN_test
        MCC += mcc_test

    ave_tp = TP/cv
    ave_fp = FP/cv
    ave_fn = FN/cv
    ave_tn = TN/cv
    ave_mcc = MCC/cv

    sensitivity = ave_tp/(ave_tp + ave_fn)
    specificity = ave_tn/(ave_fp + ave_tn)

    # Geometric mean of sensitivity and specificity.
    g_mean = math.sqrt(sensitivity * specificity)

    return [sensitivity, specificity, g_mean, ave_mcc]

In [5]:
def evaluation_print(clf_name, y, y_pred):
    """
    Score a detector's raw outlier scores against the ground truth.

    The scores are first converted to binary labels with ``get_label_n``;
    every metric below, including the ROC value, is computed from those
    binary labels rather than from the raw scores.

    :param clf_name: The name of the detector (not used in the computation)
    :type clf_name: str

    :param y: The ground truth
    :type y: list or array of shape (n_samples,)

    :param y_pred: The predicted outlier scores
    :type y_pred: list or array of shape (n_samples,)

    :return: ``(roc, prn, rec, f, fp)``, each rounded to 4 decimals, where
        ``f`` is ``2 * prn * rec / (prn + rec)`` and ``fp`` is
        ``P * rec * (1 - prn) / (prn * N)`` with ``P`` / ``N`` the number of
        entries of ``y`` equal to 1 / 0 (algebraically, false positives
        divided by negatives)
    :rtype: tuple
    """

    def _round4(value):
        # All reported metrics share the same 4-decimal rounding.
        return np.round(value, decimals=4)

    # turn raw prediction decision scores into binary labels
    binary_pred = get_label_n(y, y_pred)

    # enforce 1-d formats for the ground truth and the labels
    truth = column_or_1d(y)
    binary_pred = column_or_1d(binary_pred)

    # class sizes taken from the ground truth
    truth_values = truth.tolist()
    n_negative = truth_values.count(0)
    n_positive = truth_values.count(1)

    roc = _round4(roc_auc_score(truth, binary_pred))
    prn = _round4(precision_score(truth, binary_pred))
    rec = _round4(recall_score(truth, binary_pred))

    # The derived metrics are built from the already-rounded prn / rec.
    f_score = _round4((2 * prn * rec / (prn + rec)))
    false_positive = _round4((n_positive * rec * (1 - prn)) / (prn * n_negative))

    return roc, prn, rec, f_score, false_positive

In [6]:
# --- Load the raw tables -----------------------------------------------------
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv")

# '?' is replaced by NaN so that every column can be cast to float64.
x_train_o = x_train_o.replace('?', np.nan).astype('float64')
x_test_o = x_test_o.replace('?', np.nan).astype('float64')

# --- Attach labels -----------------------------------------------------------
# Left join on the 'Var1' key: every feature row is kept, and rows without a
# match in y_train_o end up with NaN in the columns that come from it.
# NOTE(review): 'Var66' is presumably the label column supplied by
# revealed_businesses.csv - confirm against the file.
data_all = x_train_o.merge(y_train_o, on='Var1', how='left')

# --- Split into labelled / unlabelled rows ------------------------------------
has_label = data_all['Var66'].notnull()
data_nolabel = data_all[~has_label]
data_label = data_all[has_label]

# Feature-only views (key and label removed) plus the matching keys.
id_and_label_cols = ['Var1', 'Var66']

data_nolabel_v = data_nolabel.drop(columns=id_and_label_cols)
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=id_and_label_cols)
data_label_id = data_label['Var1']

# --- Impute -------------------------------------------------------------------
# Missing values are filled with the column means of the same subset
# (labelled and unlabelled rows are imputed independently of each other).
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [7]:
# Targets and features of the labelled rows as plain arrays; both are taken
# from the same labelled subset, so the rows line up.
y = data_label['Var66'].values
X = data_label_v_f.values

In [8]:
# Normalizer with its default settings; per the scikit-learn documentation it
# rescales each sample (row) individually rather than each feature column.
row_normalizer = preprocessing.Normalizer()
x = row_normalizer.fit_transform(X)

In [9]:
#############################################
class myAD_Thread(Thread):
    """
    Runs one pyod outlier detector on a data set and reports its scores.

    The detector is selected by name (``option``) from ``self.clfs``. Calling
    :meth:`AD_algo` fits it on ``self.data``, compares the result with
    ``self.label``, prints the metrics and appends them to ``RESULTS_PATH``.

    :param option: key into ``self.clfs``: one of ``'PCA'``, ``'OCSVM'``,
        ``'KNN'``, ``'ABOD'``, ``'FB'``, ``'AE'``
    :type option: str

    :param data: samples to fit the detector on. The default is the
        notebook-level ``x`` as it exists when this class statement runs.
    :param label: ground truth for ``data``. The default is the
        notebook-level ``y`` as it exists when this class statement runs.
    """

    # File that AD_algo appends its results to.
    RESULTS_PATH = "./results/AD_pyod.txt"

    # the testing data and labels are from above
    def __init__(self, option, data=x, label=y):
        Thread.__init__(self)
        self.option = option
        self.data = data
        self.label = label
        # All detectors are instantiated up front with their default
        # settings; only the one named by `option` is used by AD_algo.
        self.clfs = {
            'PCA': PCA(),
            'OCSVM': OCSVM(),
            'KNN': KNN(),
            'ABOD': ABOD(),
            "FB": FeatureBagging(),
            'AE': AutoEncoder(verbose=0)
        }

    def run(self):
        """Thread entry point, so that ``start()`` executes :meth:`AD_algo`."""
        self.AD_algo()

    def AD_algo(self):
        """
        Fit the selected detector, print its metrics and append them to
        ``RESULTS_PATH``.

        :raises KeyError: if ``self.option`` is not a key of ``self.clfs``
        """
        print('testing with %s ...' % (self.option))
        clf_name = self.option
        clf = self.clfs[clf_name]
        # The detector is fitted on the samples only; labels are used
        # solely for scoring below.
        clf.fit(self.data)

        # get the prediction labels and outlier scores
        y_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_scores = clf.decision_scores_  # raw outlier scores

        # Score against the labels this instance was given, not against the
        # notebook-level `y`, so that a custom `label` argument is honoured.
        cm = metrics.confusion_matrix(self.label, y_pred)
        mcc = matthews_corrcoef(self.label, y_pred)

        # evaluate and print the results
        roc, prn, rec, f1, fp = evaluation_print(clf_name, self.label, y_scores)
        print('%s: Results for Algorithm %s are:' % (self.name, clf_name))
        print('Accuracy={}, precision={}, recall={}, f_score={}, false_positive={}, MCC={}, FP={}, FN={}'.format(roc, prn, rec, f1, fp, mcc, cm[0,1], cm[1,0]))

        # Create the output directory if needed, and let the context manager
        # close the file even when a write fails.
        os.makedirs(os.path.dirname(self.RESULTS_PATH), exist_ok=True)
        with open(self.RESULTS_PATH, "a") as f:
            f.write('--------------------------------------------\n')
            f.write('%s: Results for Algorithm %s are:\n' % (self.name, clf_name))
            f.write('Accuracy={}, precision={}, recall={}, f_score={}, false_positive={}\n'.format(roc, prn, rec, f1, fp))


In [10]:
if __name__ == "__main__":
    # Driver cell: writes a header to the results file, then runs every
    # detector in turn.
    print('Main Starting...')

    data_id = 'test'
    results_path = "./results/AD_pyod.txt"

    # Create the output directory if needed, and let the context manager
    # close the file even when a write fails.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    with open(results_path, "a") as f:
        f.write('--------------------------------------------\n')
        f.write('--------------------------------------------\n')
        f.write('Pyod Results for Data-set: %s\n' % data_id)
        f.write('--------------------------------------------\n')

    # One runner per detector, named 'Thread 1' .. 'Thread 6' in this order.
    algo_names = ["PCA", "OCSVM", "KNN", "ABOD", "FB", "AE"]
    detectors = []
    for idx, algo_name in enumerate(algo_names, 1):
        detector = myAD_Thread(algo_name)
        detector.name = 'Thread %d' % idx
        detectors.append(detector)

    # The numbered names are kept as aliases in case other cells refer to them.
    (myThreadOb1, myThreadOb2, myThreadOb3,
     myThreadOb4, myThreadOb5, myThreadOb6) = detectors

    # AD_algo() is called directly rather than via start(), so the detectors
    # run one after another on the calling thread.
    for detector in detectors:
        detector.AD_algo()

    print('Main Terminating...')


Main Starting...
testing with PCA ...
Thread 1: Results for Algorithm PCA are:
Accuracy=0.5002, precision=0.0353, recall=0.0353, f_score=0.0353, false_positive=0.0348, MCC=0.03351388104602396, FP=462, FN=144
testing with OCSVM ...
Thread 2: Results for Algorithm OCSVM are:
Accuracy=0.5338, precision=0.1, recall=0.1, f_score=0.1, false_positive=0.0325, MCC=0.07076594866883137, FP=452, FN=134
testing with KNN ...
Thread 3: Results for Algorithm KNN are:
Accuracy=0.5063, precision=0.0471, recall=0.0471, f_score=0.0471, false_positive=0.0344, MCC=0.03351388104602396, FP=462, FN=144
testing with ABOD ...
Thread 4: Results for Algorithm ABOD are:
Accuracy=0.4972, precision=0.0294, recall=0.0294, f_score=0.0294, false_positive=0.035, MCC=0.029788674283743218, FP=463, FN=145
testing with FB ...
Thread 5: Results for Algorithm FB are:
Accuracy=0.5307, precision=0.0941, recall=0.0941, f_score=0.0941, false_positive=0.0327, MCC=0.09684239600479658, FP=445, FN=127
testing with AE ...
Instructions 