In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule, AllKNN, InstanceHardnessThreshold

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.decomposition import PCA

from xgboost import XGBClassifier

from sklearn import preprocessing 

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        Area under the precision-recall curve of a fitted classifier.

        - clf: fitted classifier; clf.predict_proba(x) is called and its
          second column (index 1) is used as the score of the positive class
        - x = data to score
        - y = true labels; label 1 is treated as the positive class

        Returns the area obtained by integrating precision over recall.
    '''
    # Column 1 of predict_proba is taken as the positive-class probability.
    positive_scores = clf.predict_proba(x)[:, 1]

    # The third return value (the thresholds) is not needed for the area.
    precision_values, recall_values, _thresholds = precision_recall_curve(
        y, positive_scores, pos_label=1
    )

    # Recall is the x-axis of the curve, precision the y-axis.
    return auc(recall_values, precision_values)

In [3]:
def Confusion_matrix(clf, x, y):
    '''
        Return the four cells of the top-left 2x2 corner of the confusion
        matrix of clf on (x, y), in row-major order.

        - clf: fitted classifier; clf.predict(x) supplies the predictions
        - x = data to predict on
        - y = true labels

        sklearn lays the matrix out with true labels on the rows and
        predicted labels on the columns, labels in sorted order. For binary
        0/1 labels with 1 as the positive class the returned tuple is
        therefore (TN, FP, FN, TP) -- note that TN comes first, not TP.
    '''
    predicted = clf.predict(x)
    cm = metrics.confusion_matrix(y, predicted)

    first_row, second_row = cm[0], cm[1]
    return (first_row[0], first_row[1], second_row[0], second_row[1])

In [4]:
def _ratio_or_nan(numerator, denominator):
    '''Return numerator / denominator, or NaN when the denominator is zero.'''
    return numerator / denominator if denominator else float('nan')


def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        Cross-validate a classifier with stratified k-fold splits while
        resampling only the training part of each fold to mitigate the
        class imbalance.

        For every fold the training rows are passed through the sampler,
        clf is refit on the resampled rows, and scoring is evaluated on
        both the resampled training rows and the untouched held-out rows.
        The held-out confusion counts are pooled over all folds and turned
        into summary metrics.

        - clf: classifier; clf.fit(x, y) trains the model, and scoring
          decides which prediction methods are called on it

        - x = data; indexed as x[train_idx], so it must support
          integer-array indexing (e.g. a numpy array)

        - y = labels; indexed the same way as x

        - cv = the number of folds in the cross validation

        - scoring(classifier, x, y) returns the four confusion counts in
          the order (TN, FP, FN, TP). This is the row-major order of
          sklearn's confusion matrix for 0/1 labels, and is what the
          Confusion_matrix helper in this file returns.

        - sampler = a sampler object similar to those in the python package
          imbalanced-learn. It needs the method sampler.fit_resample(x, y);
          the legacy name sampler.fit_sample(x, y) is used as a fallback
          for older sampler objects.

        See https://imbalanced-learn.org/ for more details and examples of
        other sampling objects available.

        Returns the list [sensitivity, specificity, g_mean, mcc] computed
        from the pooled held-out counts. A metric whose denominator is zero
        (e.g. no positive samples at all) is reported as NaN.
    '''

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # No random_state is passed: without shuffling the split is already
    # deterministic, and recent scikit-learn versions raise a ValueError
    # when random_state is combined with shuffle=False.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    # imbalanced-learn renamed fit_sample to fit_resample and later removed
    # the old name; prefer the new one but keep supporting old samplers.
    resample = getattr(sampler, 'fit_resample', None)
    if resample is None:
        resample = sampler.fit_sample

    # Held-out confusion counts pooled over all folds.
    TN = 0
    FP = 0
    FN = 0
    TP = 0

    for train_idx, test_idx in skf.split(x, y):

        # Only the training part of the fold is resampled; the held-out
        # part keeps its original class ratio.
        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        # The scorer yields (TN, FP, FN, TP); only the error counts of the
        # training part are reported.
        _, FP_train, FN_train, _ = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        TN_test, FP_test, FN_test, TP_test = scoring(clf, x[test_idx], y[test_idx])

        print("Train FP: {0} Train FN: {1}; Test FP: {2} Test FN: {3}".format(FP_train, FN_train, FP_test, FN_test))

        TN += TN_test
        FP += FP_test
        FN += FN_test
        TP += TP_test

    # Work in floats so the four-factor product in the MCC denominator
    # cannot overflow a fixed-width integer type. Every metric below is a
    # ratio of counts, so using pooled totals gives the same value as using
    # per-fold averages (totals divided by cv).
    tp, fp, fn, tn = float(TP), float(FP), float(FN), float(TN)

    sensitivity = _ratio_or_nan(tp, tp + fn)
    specificity = _ratio_or_nan(tn, tn + fp)

    g_mean = math.sqrt(sensitivity * specificity)
    mcc = _ratio_or_nan(
        tp * tn - fp * fn,
        math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
    )

    values = [sensitivity, specificity, g_mean, mcc]

    return values

In [5]:
# Read the three CSV inputs. In the two feature tables a literal '?' marks a
# missing value: it is turned into NaN and every column is then cast to float.
x_train_o = (
    pd.read_csv('financial_data.csv')
    .replace('?', np.nan)
    .astype('float64')
)
y_train_o = pd.read_csv('revealed_businesses.csv')
x_test_o = (
    pd.read_csv("testing_data.csv")
    .replace('?', np.nan)
    .astype('float64')
)

# Left-join on Var1 so every row of financial_data.csv is kept; rows whose
# Var66 is null after the join are treated as unlabeled.
data_all = x_train_o.merge(y_train_o, how='left', on='Var1')

data_label = data_all.loc[data_all['Var66'].notnull()]
data_nolabel = data_all.loc[data_all['Var66'].isnull()]

# Split each subset into its Var1 identifier column and its feature columns
# (everything except Var1 and Var66).
data_label_id = data_label['Var1']
data_label_v = data_label.drop(columns=['Var1', 'Var66'])

data_nolabel_id = data_nolabel['Var1']
data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])

x_test_v = x_test_o.drop(columns=['Var1'])

# Disabled experiment: impute both subsets with the column means of all
# rows of data_all (labeled and unlabeled) and fit a MinMaxScaler on them.

In [6]:
# feature_selected = ['Var28', 'Var22', 'Var25', 'Var7', 'Var27', 'Var17', 'Var35', 'Var30',
#        'Var6', 'Var63']

In [7]:
# Stack the labeled training features on top of the test features (row-wise)
# so both parts go through the same preprocessing.
frames_to_stack = (data_label_v, x_test_v)
x_all = pd.concat(frames_to_stack, axis=0)

# x_all_s = x_all[feature_selected]

In [8]:
# Mean-impute every column. The means are taken over x_all, i.e. over the
# labeled training rows and the test rows together. The result is a plain
# numpy array, so column names and the index are dropped from here on.
x_all_f = x_all.fillna(value=x_all.mean()).values
# Disabled: x_all_f_scale = preprocessing.MinMaxScaler().fit_transform(x_all_f)

In [9]:
# x_all_f holds the labeled training rows first and the test rows after them.
# Derive the boundary from the data instead of hard-coding the row count, so
# the split stays aligned with the labels if the input files change.
n_labeled = len(data_label_v)

x = x_all_f[:n_labeled, :]
y = data_label['Var66'].values
assert x.shape[0] == y.shape[0], "feature rows and labels are misaligned"

# Everything after the labeled rows is the (imputed) test set.
x_test_scale = x_all_f[n_labeled:, :]

In [10]:
# Balance the training set by random over-sampling with a fixed seed.
# fit_resample is the current imbalanced-learn name of this method; the old
# alias fit_sample no longer exists in recent releases.
x_train, y_train = RandomOverSampler(random_state=94).fit_resample(x, y)

In [14]:
# Gradient-boosted trees trained on the over-sampled training set.
clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=1,
    gamma=0,
    scale_pos_weight=0.5,
)
clf.fit(x_train, y_train)

# Hard class predictions for the imputed test rows.
y_pred_dt = clf.predict(x_test_scale)

In [15]:
# Pair each test-set identifier (Var1) with its predicted label. Both pieces
# are in the row order of testing_data.csv; concat aligns them on the index.
x_test_business_id = x_test_o['Var1']
df_y = pd.Series(y_pred_dt, name="Is_Bankrupted").to_frame()
upload = pd.concat([x_test_business_id, df_y], axis=1)

# Show how many test rows were assigned to each predicted class.
df_y["Is_Bankrupted"].value_counts()

0.0    1418
1.0      82
Name: Is_Bankrupted, dtype: int64

In [16]:
# Write the submission file: both columns are cast to 32-bit integers and
# renamed before being saved without the index.
upload = upload.astype('int32')
upload.columns = pd.Index(['Business_ID', 'Is_Bankrupted'])
upload.to_csv('3_31_1.csv', index=False)