In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
# functions needed for pr_auc_score()
from sklearn.metrics import auc, precision_recall_curve
from sklearn import metrics

# functions needed for imbalanced_cross_validation_score()
from sklearn.model_selection import StratifiedKFold

# sampler objects
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, NeighbourhoodCleaningRule, AllKNN, InstanceHardnessThreshold

# Classification models to compare
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier

from sklearn.decomposition import PCA

from sklearn import preprocessing 

from sklearn.metrics import matthews_corrcoef

from lightgbm import LGBMClassifier

from catboost import CatBoostClassifier

from xgboost import XGBClassifier

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def pr_auc_score(clf, x, y):
    '''
        This function computes area under the precision-recall curve. 
    '''
      
    precisions, recalls,_ = precision_recall_curve(y, clf.predict_proba(x)[:,1], pos_label=1)
    
    return auc(recalls, precisions)

In [3]:
def Confusion_matrix(clf, x, y):
    cm = metrics.confusion_matrix(y, clf.predict(x))
    mcc = matthews_corrcoef(y, clf.predict(x))
    
    return (cm[0][0], cm[0][1], cm[1][0], cm[1][1], mcc)

In [4]:
def imbalanced_cross_validation_score(clf, x, y, cv, scoring, sampler):
    
    cv_score = 0.
    train_score = 0.
    test_score = 0.
    TP = 0
    FP = 0
    FN = 0
    TN = 0
    MCC = 0
    
    # stratified k-fold creates folds with the same ratio of positive 
    # and negative samples as the entire dataset.
    
    skf = StratifiedKFold(n_splits=cv, random_state=0, shuffle=False)
    
    for train_idx, test_idx in skf.split(x,y):
        
        xfold_train_sampled, yfold_train_sampled = sampler.fit_sample(x[train_idx],y[train_idx])
        clf.fit(xfold_train_sampled, yfold_train_sampled)
        
        TN_train, FP_train, FN_train, TP_train, mcc_train = scoring(clf, xfold_train_sampled, yfold_train_sampled)
        TN_test, FP_test, FN_test, TP_test, mcc_test  = scoring(clf, x[test_idx], y[test_idx])
        # tn, fp, fn, tp
        print("Train TP: {0} Train FP: {1} Train FN: {2} Train TN: {3}; Test TP: {4} Test FP: {5} Test FN: {6} Test TN: {7}".format(TP_train, FP_train, FN_train, TN_train, TP_test, FP_test, FN_test, TN_test))
        print("MCC train: {0} and MCC test: {1}".format(mcc_train, mcc_test))
        
        TP += TP_test
        FP += FP_test
        FN += FN_test
        TN += TN_test
        MCC += mcc_test

    ave_tp = TP/cv
    ave_fp = FP/cv
    ave_fn = FN/cv
    ave_tn = TN/cv
    ave_mcc = MCC/cv
    
    sensitivity = ave_tp/(ave_tp + ave_fn)
    specificity = ave_tn/(ave_fp + ave_tn)
    
    g_mean = math.sqrt(sensitivity * specificity)
    
    values = [sensitivity, specificity, g_mean, ave_mcc]
    
        
    return values

In [5]:
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv")

x_train_o.replace('?', np.nan, inplace=True)
x_train_o = x_train_o.astype('float64')


x_test_o.replace('?', np.nan, inplace=True)
x_test_o = x_test_o.astype('float64')

data_all = x_train_o.merge(y_train_o, on='Var1', how = 'left')

data_nolabel = data_all[data_all.Var66.isnull()]
data_label = data_all[data_all.Var66.notnull()]

data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=['Var1', 'Var66'])
data_label_id = data_label['Var1']

x_test_v = x_test_o.drop(columns=['Var1'])

# data_all_v = data_all.drop(columns=['Var1', 'Var66'])
# data_all_v_mean = data_all_v.mean()
# data_all_v_f = data_all_v.fillna(data_all_v_mean)
# minmax_scaler = preprocessing.MinMaxScaler().fit(data_all_v)

# data_nolabel_v_f = data_nolabel_v.fillna(data_all_v_mean)
# data_label_v_f = data_label_v.fillna(data_all_v_mean)

In [6]:
# feature_selected = ['Var28', 'Var22', 'Var25', 'Var7', 'Var27', 'Var17', 'Var35', 'Var30',
#        'Var6', 'Var63']

In [7]:
x_all = pd.concat([data_label_v, x_test_v], axis=0)

# x_all_s = x_all[feature_selected]

In [8]:
x_all_f = x_all.fillna(x_all.mean()).values
x_all_f_scale = preprocessing.Normalizer().fit_transform(x_all_f)

In [9]:
x = x_all_f_scale[:4879,:]
y = data_label['Var66'].values

x_test_scale = x_all_f_scale[4879:,:]

In [43]:
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.01, min_samples=8, metric='cosine', algorithm='auto')
dbs.fit(x)
pred_y = dbs.labels_

In [44]:
df_pred = pd.DataFrame(pred_y, columns=["Var66"])
df_pred['Var66'].value_counts()

 0    3753
-1     942
 1      59
 2      43
 3      30
 5      18
 4      11
 7       8
 6       8
 8       7
Name: Var66, dtype: int64

In [71]:
x_cluster_train = pd.concat([pd.DataFrame(x), df_pred], axis=1)
x_cluster_train.columns = data_label.drop(columns=['Var1']).columns

In [86]:
centroid = []
for i in list(range(1,9)):
    df_temp = x_cluster_train[x_cluster_train.Var66 == i]
    mean_point = list(df_temp.drop(columns=['Var66']).mean())
    centroid.append(mean_point)

In [88]:
def eucliden(X, c):
    c = c.reshape(1, X.shape[1]) # 1 x p
    distances = np.sqrt(np.sum((X - c) ** 2, axis=1))
    return distances

In [97]:
main_cluster_0 = x_cluster_train[x_cluster_train.Var66 == 0].drop(columns=['Var66']).values

In [98]:
main_cluster_1 = x_cluster_train[x_cluster_train.Var66 == -1].drop(columns=['Var66']).values

In [87]:
np.array(centroid).shape

(8, 64)

In [95]:
result = []
for test_x in centroid:
    distances = eucliden(main_cluster, np.array(test_x))
    indexs = np.argsort(distances)[0:3]
    k_dist = distances[indexs].sum()
    result.append(k_dist)

In [96]:
result

[2.5980734070060256,
 1.1068394016098777,
 3.3078698967537314,
 0.6817611043957903,
 2.29201161722656,
 4.029493241451158,
 0.8190006607979017,
 0.5710380439597493]

In [99]:
result_1 = []
for test_x in centroid:
    distances = eucliden(main_cluster_1, np.array(test_x))
    indexs = np.argsort(distances)[0:3]
    k_dist = distances[indexs].sum()
    result_1.append(k_dist)

In [100]:
result_1

[0.6009796117175947,
 0.81217752096967,
 0.6541795033472869,
 0.5890299791387115,
 0.5714436487795931,
 0.8675219228224091,
 0.5264035082575742,
 0.571097395496261]

In [111]:
x_cluster_train[x_cluster_train.Var66 == 3]['Var66'] = 100.0
x_cluster_train[x_cluster_train.Var66 == 6]['Var66'] = 100.0

In [112]:
df_pred_1 = x_cluster_train[x_cluster_train.Var66 == 100]

In [113]:
x_cluster_train[x_cluster_train.Var66 == 3]['Var66'] = 100

In [114]:
x_cluster_train[x_cluster_train.Var66 == 100]

Unnamed: 0,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,...,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66


In [10]:
x_train, y_train = SMOTE(kind='svm', sampling_strategy=0.7,random_state=42).fit_sample(x,y)

In [47]:
clf = XGBClassifier(random_state=92,learning_rate=0.2,scale_pos_weight=0.5, max_depth=5, subsample=0.7, n_estimators=500, gamma=0, colsample_bytree=0.9)
clf.fit(x_train, y_train)
y_pred_dt = clf.predict(x_test_scale)

In [48]:
x_test_business_id = x_test_o['Var1']
df_y = pd.DataFrame(y_pred_dt, columns=["Is_Bankrupted"])
upload = pd.concat([x_test_business_id, df_y], axis=1)
df_y["Is_Bankrupted"].value_counts()

0.0    1423
1.0      77
Name: Is_Bankrupted, dtype: int64

In [23]:
upload = upload.astype('int32')
upload.columns=['Business_ID', 'Is_Bankrupted']
upload.to_csv('4_15_73.csv', index=False)