In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule, AllKNN, InstanceHardnessThreshold

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.decomposition import PCA

from xgboost import XGBClassifier

from sklearn import preprocessing 

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        Area under the precision-recall curve of ``clf`` evaluated on ``(x, y)``.

        The curve is built from column 1 of ``clf.predict_proba(x)``, with
        label 1 treated as the positive class.
    '''
    positive_scores = clf.predict_proba(x)[:, 1]
    curve = precision_recall_curve(y, positive_scores, pos_label=1)
    precision_values, recall_values = curve[0], curve[1]

    # recall is the x-axis of the curve, precision the y-axis
    return auc(recall_values, precision_values)

In [3]:
def Confusion_matrix(clf, x, y):
    '''
        Score a fitted classifier with its confusion-matrix cells and the
        Matthews correlation coefficient (MCC).

        Parameters
        ----------
        clf : fitted estimator exposing ``predict``.
        x : samples passed to ``clf.predict``.
        y : true labels for ``x``.

        Returns
        -------
        tuple
            ``(cm[0][0], cm[0][1], cm[1][0], cm[1][1], mcc)``. The caller in
            this notebook unpacks it as ``(TN, FP, FN, TP, mcc)``.
    '''
    # Predict once and reuse the result for both metrics.
    y_pred = clf.predict(x)

    cm = metrics.confusion_matrix(y, y_pred)
    # ``matthews_corrcoef`` is never imported by name in this notebook, so the
    # bare call raised NameError; go through the imported ``metrics`` module.
    mcc = metrics.matthews_corrcoef(y, y_pred)

    return (cm[0][0], cm[0][1], cm[1][0], cm[1][1], mcc)

In [4]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    '''
        Stratified k-fold evaluation in which only the training part of each
        fold is resampled; the held-out part is scored untouched.

        Parameters
        ----------
        clf : estimator with ``fit``; it is refitted on every fold.
        x, y : samples and labels. Both are indexed with the integer index
            arrays produced by the splitter, so pass NumPy arrays.
        cv : number of folds.
        scoring : callable ``scoring(clf, x, y)`` returning
            ``(TN, FP, FN, TP, mcc)`` (e.g. ``Confusion_matrix``).
        sampler : sampler object exposing ``fit_resample`` (or the older
            ``fit_sample``) that returns the resampled ``(x, y)``.

        Returns
        -------
        list
            ``[sensitivity, specificity, g_mean, ave_mcc]`` computed from the
            test-fold counts averaged over the ``cv`` folds.
    '''

    TP = 0
    FP = 0
    FN = 0
    TN = 0
    MCC = 0

    # stratified k-fold creates folds with the same ratio of positive
    # and negative samples as the entire dataset.
    # random_state is intentionally not passed: with shuffle=False it has no
    # effect, and scikit-learn >= 0.24 raises ValueError for that combination.
    skf = StratifiedKFold(n_splits=cv, shuffle=False)

    # imbalanced-learn renamed fit_sample to fit_resample and later removed the
    # old name; prefer the new one and fall back only for old installations.
    if hasattr(sampler, 'fit_resample'):
        resample = sampler.fit_resample
    else:
        resample = sampler.fit_sample

    for train_idx, test_idx in skf.split(x, y):

        # Resample the training fold only, so the test fold keeps its original
        # class balance.
        xfold_train_sampled, yfold_train_sampled = resample(x[train_idx], y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)

        TN_train, FP_train, FN_train, TP_train, mcc_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        TN_test, FP_test, FN_test, TP_test, mcc_test = scoring(clf, x[test_idx], y[test_idx])
        # tn, fp, fn, tp
        print("Train TP: {0} Train FP: {1} Train FN: {2} Train TN: {3}; Test TP: {4} Test FP: {5} Test FN: {6} Test TN: {7}".format(TP_train, FP_train, FN_train, TN_train, TP_test, FP_test, FN_test, TN_test))
        print("MCC train: {0} and MCC test: {1}".format(mcc_train, mcc_test))

        # Only the held-out results are accumulated.
        TP += TP_test
        FP += FP_test
        FN += FN_test
        TN += TN_test
        MCC += mcc_test

    ave_tp = TP/cv
    ave_fp = FP/cv
    ave_fn = FN/cv
    ave_tn = TN/cv
    ave_mcc = MCC/cv

    sensitivity = ave_tp/(ave_tp + ave_fn)
    specificity = ave_tn/(ave_fp + ave_tn)

    # geometric mean of sensitivity and specificity
    g_mean = math.sqrt(sensitivity * specificity)

    values = [sensitivity, specificity, g_mean, ave_mcc]

    return values

In [5]:
# Load the feature tables; '?' is replaced by NaN so every column can be cast
# to float64. The label table is read as-is.
x_train_o = pd.read_csv('financial_data.csv').replace('?', np.nan).astype('float64')
y_train_o = pd.read_csv('revealed_businesses.csv')
x_test_o = pd.read_csv("testing_data.csv").replace('?', np.nan).astype('float64')

# Left-join the labels onto the training features by 'Var1'; rows without a
# match end up with NaN in 'Var66'.
data_all = pd.merge(x_train_o, y_train_o, how='left', on='Var1')

# Split on whether 'Var66' is present.
is_labelled = data_all['Var66'].notnull()
data_nolabel = data_all[~is_labelled]
data_label = data_all[is_labelled]

# Feature-only views (key and label columns removed) plus the matching keys.
id_and_label = ['Var1', 'Var66']

data_nolabel_v = data_nolabel.drop(columns=id_and_label)
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=id_and_label)
data_label_id = data_label['Var1']

x_test_v = x_test_o.drop(columns=['Var1'])

# data_all_v = data_all.drop(columns=['Var1', 'Var66'])
# data_all_v_mean = data_all_v.mean()
# data_all_v_f = data_all_v.fillna(data_all_v_mean)
# minmax_scaler = preprocessing.MinMaxScaler().fit(data_all_v)

# data_nolabel_v_f = data_nolabel_v.fillna(data_all_v_mean)
# data_label_v_f = data_label_v.fillna(data_all_v_mean)

In [6]:
# Candidate feature subset; the line that would apply it to x_all is currently
# commented out, so all columns are used.
feature_selected = [
    'Var23', 'Var26', 'Var28', 'Var30', 'Var47',
    'Var19', 'Var35', 'Var57', 'Var10', 'Var12',
    'Var45', 'Var33', 'Var16',
]

In [7]:
# Stack the labelled training rows on top of the test rows so both go through
# the same imputation below.
frames_to_stack = [data_label_v, x_test_v]
x_all = pd.concat(frames_to_stack, axis=0)

# Column subsetting is disabled: x_all_s is the full frame.
x_all_s = x_all  # [feature_selected]

In [15]:
# Mean-impute missing values column by column, then drop to a plain array.
column_means = x_all_s.mean()
x_all_f = x_all_s.fillna(column_means).values

# Scaling is disabled: the "scaled" name is just an alias of the imputed array.
x_all_f_scale = x_all_f  # preprocessing.RobustScaler().fit_transform(x_all_f)

In [9]:
# x_all_f stacks the labelled training rows first and the test rows after them
# (see the pd.concat that builds x_all), so split at the number of labelled
# rows instead of a hard-coded 4879 that silently breaks if the data changes.
n_labelled = len(data_label_v)

x = x_all_f[:n_labelled, :]
y = data_label['Var66'].values
assert x.shape[0] == y.shape[0], "features and labels must have the same number of rows"

# Despite the name, these rows are not scaled (x_all_f is the unscaled array).
x_test_scale = x_all_f[n_labelled:, :]

In [10]:
# Randomly oversample the training data before fitting. fit_resample is the
# current imbalanced-learn sampler API; the old fit_sample name was deprecated
# and then removed, so it fails on recent versions.
x_train, y_train = RandomOverSampler(random_state=42).fit_resample(x, y)

In [11]:
# Hyper-parameters of the final XGBoost model.
xgb_params = dict(
    learning_rate=0.2,
    scale_pos_weight=0.5,
    max_depth=5,
    subsample=0.7,
    n_estimators=500,
    base_score=0.5,
    gamma=0,
    colsample_bytree=0.9,
)

# Fit on the oversampled training set, then predict the test rows.
clf = XGBClassifier(**xgb_params)
clf.fit(x_train, y_train)
y_pred_dt = clf.predict(x_test_scale)

In [12]:
# Pair each test row's 'Var1' key with its predicted label, side by side.
x_test_business_id = x_test_o['Var1']
df_y = pd.DataFrame({"Is_Bankrupted": y_pred_dt})
upload = pd.concat([x_test_business_id, df_y], axis=1)

# Show how many test rows fall in each predicted class.
df_y["Is_Bankrupted"].value_counts()

0.0    1417
1.0      83
Name: Is_Bankrupted, dtype: int64

In [13]:
# Check that the class counts are unchanged after concatenation with the keys.
upload.loc[:, 'Is_Bankrupted'].value_counts()

0.0    1417
1.0      83
Name: Is_Bankrupted, dtype: int64

In [14]:
# upload = upload.astype('int32')
# upload.columns=['Business_ID', 'Is_Bankrupted']
# upload.to_csv('4_7_roc_78.csv', index=False)