In [1]:
# data manipulation
import pandas as pd
import numpy as np

# model training
from sklearn.model_selection import GridSearchCV

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.svm import SVC # SVM
from sklearn.ensemble import BaggingClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# machine learning models/techniques third parties 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.linear_model as sklm

# for final validation to plot ROC curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

%matplotlib inline

from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

from tensorflow import set_random_seed
import keras
from keras.models import Model
from keras.layers import Input, Conv2D, Dense, Activation, MaxPool2D, Dropout, Flatten
from keras import optimizers, losses
from keras.models import model_from_json


# to time experiments
import time


# to calculate kurtosis and skew
from scipy.stats import kurtosis
from scipy.stats import skew

# for plots
import seaborn as sns 
import matplotlib as pl
import matplotlib.pyplot as plt

import dataexp as dtex # our code
import dataio as dtio # our code 
import preprocessing as pre # our code

# to display pandas and other elements in html
from IPython.display import display
from IPython.core.display import HTML
from IPython.display import Image

# for ordered dictionaries 
from collections import OrderedDict

# for feature extraction
import feature_extraction as fe # our code

# machine learning models/techniques
import model_validation as me # our code
#from decision_tree import DecisionTree # our code
#from random_forest import RandomForest # our code
from logistic_regression import LogisticRegression # our code





# ignore warnings
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [112]:
x_train_o = pd.read_csv('financial_data.csv')
y_train_o = pd.read_csv('revealed_businesses.csv')

x_test_o = pd.read_csv("testing_data.csv")

In [113]:
x_train_o.replace('?', np.nan, inplace=True)
x_train_o = x_train_o.astype('float64')


x_test_o.replace('?', np.nan, inplace=True)
x_test_o = x_test_o.astype('float64')

In [114]:
data_train_all = x_train_o.merge(y_train_o, on='Var1', how = 'left')


In [178]:
data_nolabel = data_train_all[data_train_all.Var66.isnull()]
data_label = data_train_all[data_train_all.Var66.notnull()]

In [179]:
data_nolabel_v = data_nolabel.drop(columns=['Var1', 'Var66'])
data_nolabel_id = data_nolabel['Var1']

data_label_v = data_label.drop(columns=['Var1', 'Var66'])
data_label_id = data_label['Var1']

In [180]:
data_nolabel_v_f = data_nolabel_v.fillna(data_nolabel_v.mean())
data_label_v_f = data_label_v.fillna(data_label_v.mean())

In [181]:
data_nolabel_v_f = pd.DataFrame(preprocessing.StandardScaler().fit_transform(data_nolabel_v_f),columns = data_nolabel_v_f.columns)
data_label_v_f = pd.DataFrame(preprocessing.StandardScaler().fit_transform(data_label_v_f),columns = data_label_v_f.columns)

In [182]:
X_pred = data_nolabel_v_f.iloc[:,:].values
X_pred.shape

(4124, 64)

In [183]:
X = data_label_v_f.iloc[:,:].values
y = data_label['Var66'].values
X.shape

(4879, 64)

In [184]:
def eucliden(X, c):
    c = c.reshape(1, X.shape[1]) # 1 x p
    distances = np.sqrt(np.sum((X - c) ** 2, axis=1))
    return distances

In [185]:
result = []
for test_x in X_pred:
    distances = eucliden(X, test_x)
    indexs = np.argsort(distances)[0:3]
    label = 1 if y[indexs].sum()>1 else 0
    result.append(label)

In [186]:
y_pred = pd.DataFrame(result, columns=["Var66"])
y_pred['Var66'].value_counts()

0    4059
1      65
Name: Var66, dtype: int64

In [154]:
#data_nolabel_selected_stand = pd.DataFrame(preprocessing.StandardScaler().fit_transform(data_nolabel_selected),columns = data_nolabel_selected.columns)
data_nolabel_selected_stand = data_nolabel_selected.reset_index().drop(columns=['index'])

In [155]:
data_nolabel_id_noidx = data_nolabel_id.reset_index()['Var1']
data_nolabel_v = data_nolabel_v.reset_index().drop(columns=['index'])

In [156]:

data_nolabel_pred = pd.concat([data_nolabel_id_noidx, data_nolabel_v_f, y_pred], axis=1)

In [157]:
data_nolabel_pred['Var66'].value_counts()

0    3659
1     465
Name: Var66, dtype: int64

In [158]:
data_label_fill = data_label.fillna(data_label.mean())

In [159]:
data_tran_all  = pd.concat([data_label_fill, data_nolabel_pred], axis=0)

In [160]:
data_tran_all.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66
0,18399.0,0.023954,0.15012,0.39567,3.6357,54.043,0.028822,0.031029,4.56831,1.0112,...,0.011041,0.034914,0.98896,0.0,9.5214,5.8248,34.713,10.515,3.4752,0.0
2,19821.0,-0.35631,0.39288,0.15884,1.4043,-2.619,-0.085597,-0.35632,1.54531,0.92963,...,-0.17277,-0.58691,1.3833,0.0,5.6112,15.779,154.26,2.3662,2.0738,0.0
6,17769.0,0.0041,0.72304,-0.47065,0.34907,-169.23,0.0,0.00409,0.38306,1.1092,...,-0.096542,0.014766,0.99667,0.0,15.8971,6.2881,237.92,1.5341,1.4837,0.0
7,19309.0,0.024596,0.084252,0.38657,5.5883,44.886,0.17429,0.028695,10.19801,1.0069,...,0.006825,0.028616,0.99317,0.0,26.1531,15.247,12.723,28.687,4.5674,0.0
9,20728.0,0.03671,0.41693,0.13777,1.3733,-18.123,0.037957,0.050825,1.28881,0.99111,...,-0.008967,0.068297,1.009,0.089127,7.6912,5.5223,84.191,4.3354,3.2441,0.0


In [161]:
data_nolabel_pred.isnull().sum().sum()

0

In [162]:
os_dfs_dict = OrderedDict()

os_dfs_dict['mean'] = pre.oversample_smote(data_tran_all, columns = data_tran_all.columns, verbose=True)

original dataset (labels): {0.0: 8368, 1.0: 635}
total: 9003
resampled dataset (labels): {0.0: 8368, 1.0: 8368}
total: 16736



In [163]:
x_test_business_id = x_test_o['Var1']
x_test = x_test_o.drop(columns=['Var1'])
x_test_f = x_test.fillna(x_test.mean())

In [164]:
best_period = os_dfs_dict['mean'][0]
X = best_period.iloc[:,1:-1].values
y = best_period["Var66"].values

random_forest_classifier_sklearn_roc_all = RandomForestClassifier(criterion="gini", 
                                                          n_estimators = 10,
                                                          max_features = "sqrt", 
                                                          random_state = 94)

random_forest_classifier_sklearn_roc_all.fit(X, y)
y_pred_dt = random_forest_classifier_sklearn_roc_all.predict(x_test_f)


In [165]:
df_y = pd.DataFrame(y_pred_dt, columns=["Is_Bankrupted"])
upload = pd.concat([x_test_business_id, df_y], axis=1)
df_y["Is_Bankrupted"].value_counts()

0    1463
1      37
Name: Is_Bankrupted, dtype: int64

In [166]:
upload = upload.astype('int32')
upload.columns=['Business_ID', 'Is_Bankrupted']
upload.to_csv('3_22_2.csv', index=False)