In [1]:
# data manipulation
import pandas as pd
import numpy as np

# model training
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# machine learning models/techniques third parties 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model as sklm

# for final validation to plot ROC curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

from tensorflow import set_random_seed
import keras
from keras.models import Model
from keras.layers import Input, Conv2D, Dense, Activation, MaxPool2D, Dropout, Flatten
from keras import optimizers, losses
from keras.models import model_from_json


# to time experiments
import time


# to calculate kurtosis and skew
from scipy.stats import kurtosis
from scipy.stats import skew

# for plots
import seaborn as sns 
import matplotlib as pl
import matplotlib.pyplot as plt

import dataexp as dtex # our code
import dataio as dtio # our code 
import preprocessing as pre # our code

# to display pandas and other elements in html
from IPython.display import display
from IPython.core.display import HTML
from IPython.display import Image

# for ordered dictionaries 
from collections import OrderedDict

# for feature extraction
import feature_extraction as fe # our code

# machine learning models/techniques
import model_validation as me # our code
#from decision_tree import DecisionTree # our code
#from random_forest import RandomForest # our code
from logistic_regression import LogisticRegression # our code





# ignore warnings
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv")

In [3]:
x_train_o.replace('?', np.nan, inplace=True)
x_train_o = x_train_o.astype('float64')


x_test_o.replace('?', np.nan, inplace=True)
x_test_o = x_test_o.astype('float64')

In [4]:
data_train_all = x_train_o.merge(y_train_o, on='Var1', how = 'left')


In [7]:
data_nolabel = data_train_all[data_train_all.Var66.isnull()]
data_label = data_train_all[data_train_all.Var66.notnull()]

In [10]:
data_nolabel.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66
1,15092.0,0.049699,0.065808,0.7268,12.944,233.11,0.0,0.063192,14.19601,0.89618,...,0.059565,0.053189,0.93169,0.0,5.0492,11.152,24.784,14.727,4.2204,
3,14171.0,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021,
4,12900.0,0.020041,0.34652,0.33593,2.7613,39.05,0.0,0.020031,1.88591,1.2975,...,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412,
5,16499.0,0.14,0.41993,0.29042,1.7416,7.1865,0.20062,0.17739,1.31961,1.1028,...,0.09323,0.25262,0.90677,0.051068,8.1696,5.1309,70.315,5.1909,6.3939,
8,16122.0,0.09456,0.097575,0.42289,5.7822,82.387,0.0,0.1088,9.24851,1.2703,...,0.076875,0.10477,0.91846,0.0,7.9041,4.503,25.41,14.365,2.5994,


In [12]:
data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])
data_nolabel_id = data_nolabel['Var1']

In [55]:
data_nolabel.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66
1,15092.0,0.049699,0.065808,0.7268,12.944,233.11,0.0,0.063192,14.19601,0.89618,...,0.059565,0.053189,0.93169,0.0,5.0492,11.152,24.784,14.727,4.2204,
3,14171.0,0.001417,0.70811,-0.052312,0.88978,-31.198,0.26952,0.001407,0.41222,1.9654,...,-0.000535,0.00482,0.9993,0.74548,17.1011,7.9482,88.147,4.1408,3.4021,
4,12900.0,0.020041,0.34652,0.33593,2.7613,39.05,0.0,0.020031,1.88591,1.2975,...,0.21241,0.030652,0.80158,0.000862,9.767,6.757,53.651,6.8032,2.7412,
5,16499.0,0.14,0.41993,0.29042,1.7416,7.1865,0.20062,0.17739,1.31961,1.1028,...,0.09323,0.25262,0.90677,0.051068,8.1696,5.1309,70.315,5.1909,6.3939,
8,16122.0,0.09456,0.097575,0.42289,5.7822,82.387,0.0,0.1088,9.24851,1.2703,...,0.076875,0.10477,0.91846,0.0,7.9041,4.503,25.41,14.365,2.5994,


In [14]:
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())

In [94]:
feature_columns = ['Var5', 'Var21', 'Var22', 'Var28', 'Var42', 'Var47', 'Var9', 'Var18', 'Var33', 'Var53', 'Var59', 'Var61', 'Var7', 'Var41', 'Var44', 'Var45', 'Var48', 'Var63', 'Var13', 'Var16', 'Var17', 'Var27', 'Var35', 'Var37', 'Var10', 'Var31', 'Var46', 'Var54', 'Var55']
feature_columns_all = ['Var1','Var5', 'Var21', 'Var22', 'Var28', 'Var42', 'Var47', 'Var9', 'Var18', 'Var33', 'Var53', 'Var59', 'Var61', 'Var7', 'Var41', 'Var44', 'Var45', 'Var48', 'Var63', 'Var13', 'Var16', 'Var17', 'Var27', 'Var35', 'Var37', 'Var10', 'Var31', 'Var46', 'Var54', 'Var55', 'Var66']


In [17]:
data_nolabel_selected = data_nolabel_v_f[feature_columns]

In [18]:
data_nolabel_selected.head()

Unnamed: 0,Var5,Var21,Var22,Var28,Var42,Var47,Var9,Var18,Var33,Var53,...,Var16,Var17,Var27,Var35,Var37,Var10,Var31,Var46,Var54,Var55
1,12.944,72.291,0.59206,27.479,0.046838,10.027,14.19601,15.196,26.353,0.072201,...,338.17,1.0793,0.87414,12.807,0.89618,0.89618,-0.5081,0.27995,4.3994,4.3994
3,0.88978,21.344,0.79908,1.0323,0.35138,0.64765,0.41222,1.4122,88.1,0.24137,...,6803.7,0.053647,0.053647,2.777,1.9654,1.9654,0.3297,0.012242,0.50528,0.88196
4,2.7613,37.371,0.79278,12.926,0.10342,2.0648,1.88591,2.8859,68.121,0.18663,...,1046.1,0.34892,0.34892,2.9491,1.2975,1.2975,0.19106,0.15078,1.3806,1.3817
5,1.7416,44.678,1.1127,0.9835,0.066891,1.1062,1.31961,2.3813,77.544,0.21245,...,756.96,0.48219,0.39311,0.43174,2.0536,1.1028,0.19096,0.56255,1.7429,1.8319
8,5.7822,46.179,0.88756,4.2187,0.023078,3.9648,9.24851,10.249,27.526,0.075413,...,248.82,1.4669,1.3209,12.018,1.2703,1.2703,0.024202,0.58832,1.8467,1.8467


In [19]:
data_nolabel_selected_stand = pd.DataFrame(preprocessing.StandardScaler().fit_transform(data_nolabel_selected),columns = data_nolabel_selected.columns)

In [21]:
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets.samples_generator import make_blobs

bandwidth = estimate_bandwidth(data_nolabel_selected_stand, quantile=0.2)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(data_nolabel_selected_stand)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

In [73]:
data_nolabel_id_noidx = data_nolabel_id.reset_index()['Var1']

In [74]:
data_nolabel_id_noidx.head()

0    15092.0
1    14171.0
2    12900.0
3    16499.0
4    16122.0
Name: Var1, dtype: float64

In [82]:
result = pd.DataFrame(labels, columns=["Var66"])
data_nolabel_pred = pd.concat([data_nolabel_id_noidx, data_nolabel_selected_stand, result], axis=1)

In [89]:
data_nolabel_pred.loc[data_nolabel_pred['Var66'] > 0, 'Var66'] = 1

In [99]:
data_label_fill = data_label.fillna(data_label.mean())[feature_columns_all]

Unnamed: 0,Var1,Var5,Var21,Var22,Var28,Var42,Var47,Var9,Var18,Var33,...,Var17,Var27,Var35,Var37,Var10,Var31,Var46,Var54,Var55,Var66
0,18399.0,3.6357,38.335,1.1051,0.17331,0.071334,2.5314,4.56831,6.6614,35.1,...,0.48737,0.44017,0.18022,1.6178,1.0112,0.027224,0.14443,1.5099,1.5099,0.0
2,19821.0,1.4043,65.049,3.7256,-337.82,-0.052357,0.98259,1.54531,2.5453,131.53,...,-0.63933,-0.63933,2.775,0.92963,0.92963,0.071085,-2.1507,1.3543,1.3543,0.0
6,17769.0,0.34907,22.961,7.61638,1.5809,0.31326,0.25256,0.38306,1.383,216.97,...,0.097876,0.097876,1.6822,1.1092,1.1092,0.64624,0.05861,0.37046,0.37046,0.0
7,19309.0,5.5883,13.956,1.4024,0.090212,0.0316,4.4914,10.19801,11.869,12.811,...,1.124,1.0752,0.25703,2.4459,1.0069,-0.054839,0.26603,1.6236,1.6236,0.0
9,20728.0,1.3733,47.458,1.014,-0.18901,0.19216,0.80964,1.28881,2.3985,83.443,...,0.36618,0.3323,-0.07318,1.6909,0.99111,0.25861,0.17642,1.0896,1.1867,0.0


In [101]:
data_tran_all  = pd.concat([data_label_fill, data_nolabel_pred], axis=0)

In [103]:
data_tran_all.head()

Unnamed: 0,Var1,Var5,Var21,Var22,Var28,Var42,Var47,Var9,Var18,Var33,...,Var17,Var27,Var35,Var37,Var10,Var31,Var46,Var54,Var55,Var66
0,18399.0,3.6357,38.335,1.1051,0.17331,0.071334,2.5314,4.56831,6.6614,35.1,...,0.48737,0.44017,0.18022,1.6178,1.0112,0.027224,0.14443,1.5099,1.5099,0.0
2,19821.0,1.4043,65.049,3.7256,-337.82,-0.052357,0.98259,1.54531,2.5453,131.53,...,-0.63933,-0.63933,2.775,0.92963,0.92963,0.071085,-2.1507,1.3543,1.3543,0.0
6,17769.0,0.34907,22.961,7.61638,1.5809,0.31326,0.25256,0.38306,1.383,216.97,...,0.097876,0.097876,1.6822,1.1092,1.1092,0.64624,0.05861,0.37046,0.37046,0.0
7,19309.0,5.5883,13.956,1.4024,0.090212,0.0316,4.4914,10.19801,11.869,12.811,...,1.124,1.0752,0.25703,2.4459,1.0069,-0.054839,0.26603,1.6236,1.6236,0.0
9,20728.0,1.3733,47.458,1.014,-0.18901,0.19216,0.80964,1.28881,2.3985,83.443,...,0.36618,0.3323,-0.07318,1.6909,0.99111,0.25861,0.17642,1.0896,1.1867,0.0


In [104]:
os_dfs_dict = OrderedDict()

os_dfs_dict['mean'] = pre.oversample_smote(data_tran_all, columns = data_tran_all.columns, verbose=True)

original dataset (labels): {0.0: 8606, 1.0: 397}
total: 9003
resampled dataset (labels): {0.0: 8606, 1.0: 8606}
total: 17212



In [109]:
x_test_business_id = x_test_o['Var1']
x_test = x_test_o.drop(columns=['Var1'])
x_test_f = x_test.fillna(x_test.mean())

In [111]:
# initialize decision tree classifier (sklearn)
decision_tree_classifier_sklearn = DecisionTreeClassifier(criterion="gini")

# initialize random forest classifier (sklearn)
random_forest_classifier_sklearn = RandomForestClassifier(criterion="gini", 
                                                          n_estimators = 10,
                                                          max_features = "sqrt", 
                                                          random_state = 94)

# initialize logistic regression classifier (sklearn)
log_reg_classifier_sklearn = sklm.LogisticRegression(C = 0.1, 
                                                     max_iter = 1000,
                                                     penalty="l2", 
                                                     solver="sag", 
                                                     random_state = 94)


# creating ordered dictionary for all different models (sklearn)
models_dict_sklearn = OrderedDict()
models_dict_sklearn["Decision Tree Sklearn"] = (decision_tree_classifier_sklearn, False)
models_dict_sklearn["Random Forest Sklearn"] = (random_forest_classifier_sklearn, False)
models_dict_sklearn["Logistic Regression Sklearn"] = (log_reg_classifier_sklearn, True)

# initialize dictionary to hold results for each experiment (sklearn)
experiments_dict_sklearn = OrderedDict()

In [114]:
best_period = os_dfs_dict['mean'][0]
X = best_period.iloc[:,1:-1].values
y = best_period["Var66"].values

random_forest_classifier_sklearn_roc_all = RandomForestClassifier(criterion="gini", 
                                                          n_estimators = 10,
                                                          max_features = "sqrt", 
                                                          random_state = 94)

random_forest_classifier_sklearn_roc_all.fit(X, y)
y_pred_dt = random_forest_classifier_sklearn_roc_all.predict(x_test_f[feature_columns])


In [124]:
df_y = pd.DataFrame(y_pred_dt, columns=["Is_Bankrupted"])
upload = pd.concat([x_test_business_id, df_y], axis=1)
df_y["Is_Bankrupted"].value_counts()

0    1468
1      32
Name: Is_Bankrupted, dtype: int64

In [127]:
upload = upload.astype('int32')
upload.columns=['Business_ID', 'Is_Bankrupted']
upload.to_csv('3_17.csv', index=False)