In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
import statsmodels.api as sma

from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

In [None]:
# Read the input CSV (path is relative to the notebook's working directory)
# and preview the first rows.
data = pd.read_csv("final_non_time_data.csv")
print(data.head())

In [None]:
# List all available column names; the feature subset is picked from these by name.
print(data.columns)

In [None]:
# Columns kept for modelling. "LoanStatusCat" (last entry) is the target that
# is later split off into Y; everything else is used as a feature.
selected_columns = [
    "BorrZip", "CDC_Name", "CDC_Zip", "LoanSum", "TermInMonths", "BusinessType",
    "subpgmdesc", "NaicsCode", "unemp_comp", "biz_net_income", "adj_gross_income",
    "net_cap_gain", "total_tax_liab", "2018-2010", "2010-2000", "2000-1990",
    "P/E_t0", "unemp_t0", "interest_rate_t0", "GDPchange_t0", "LoanStatusCat",
]

# Take an explicit copy: the following cells assign into data_selected column
# by column, and doing that on a bare selection of `data` raises pandas'
# SettingWithCopyWarning (and is not guaranteed to write to the object we keep).
data_selected = data[selected_columns].copy()
print(data_selected.head())

In [None]:
# Report every column that still contains missing values, in column order.
na_counts = data_selected.isna().sum()
for col, n_missing in na_counts[na_counts > 0].items():
    print("column ", col, " has ", n_missing, " NA values")

In [None]:
# Identifier-like / categorical columns: fill gaps with an explicit sentinel.
for column, sentinel in (("CDC_Zip", -1), ("BusinessType", "Unknown"), ("NaicsCode", -1)):
    data_selected[column] = data_selected[column].fillna(sentinel)

# Numeric period columns: fill gaps with the column mean.
for column in ("2018-2010", "2010-2000", "2000-1990"):
    column_mean = data_selected[column].mean()
    data_selected[column] = data_selected[column].fillna(column_mean)


In [None]:
# Sanity check after imputation: list any column that still has NA values,
# or confirm that none are left.
na_counts = data_selected.isna().sum()
still_missing = na_counts[na_counts > 0]
count = len(still_missing)
for col, n_missing in still_missing.items():
    print("column ", col, " has ", n_missing, " NA values")
if count == 0:
    print("All good!")

In [None]:
# Preview the imputed frame and confirm which columns it holds.
print(data_selected.head())
print(data_selected.columns)

In [None]:
# One LabelEncoder per categorical / identifier column. The fitted encoders
# keep their original names (le1, le2, le3, le5, le6, le7) so the integer
# codes can be mapped back to the raw values later if needed.
le1, le2, le3, le5, le6, le7 = (LabelEncoder() for _ in range(6))

encoder_for_column = (
    (le1, "CDC_Name"),
    (le2, "BusinessType"),
    (le3, "subpgmdesc"),
    (le5, "BorrZip"),
    (le6, "CDC_Zip"),
    (le7, "NaicsCode"),
)
for encoder, column in encoder_for_column:
    # Replace the raw values with the integer codes learnt from that column.
    data_selected[column] = encoder.fit_transform(data_selected[column])

In [None]:


# Per-column StandardScaler experiment removed: it was disabled by wrapping it
# in a bare string literal (which Jupyter also echoes back as the cell's
# output) and it is superseded by the single StandardScaler that is fitted
# over all feature columns once X has been built.

In [None]:
# Split the frame into the feature matrix X and the target vector Y.
#
# The feature list is derived from the DataFrame's own column order rather
# than from a set: iterating a set of strings depends on per-process hash
# randomisation, so the column order of X (and everything indexed by it, such
# as coefficient tables and feature-importance listings) could change between
# kernel restarts.
cols = data_selected.columns.drop("LoanStatusCat")  # raises KeyError if the target is absent
col_list = list(cols)
X = data_selected[col_list]
Y = data_selected["LoanStatusCat"]

In [None]:
# Standardise every feature column to zero mean / unit variance.
#
# The scaled values are written back to X: X is what the train/test split and
# every model consume, so storing the result only in `data_selected` meant the
# scaling never reached any model. The original index is preserved so X stays
# row-aligned with Y.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; consider fitting it on the
# training split only.
cols = X.columns
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=cols, index=X.index)

# Kept so that the preview of `data_selected` still shows the scaled features.
data_selected = X

In [None]:
# Preview the frame produced by the scaling step.
print(data_selected.head())

In [None]:
# 70 % train, then the remaining 30 % is halved into validation and test
# (15 % each). Both splits are seeded so the partitions are reproducible
# across runs, and both are stratified so each part keeps the class balance
# of the target.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, random_state=0
)
x_val, x_test, y_val, y_test = train_test_split(
    x_test, y_test, test_size=0.5, stratify=y_test, random_state=0
)

# Remember the feature names now, while x_train is still a DataFrame.
cols = x_train.columns

In [None]:
# Oversample the training split only (validation/test stay untouched).
# random_state is fixed because SMOTE generates synthetic samples randomly;
# without a seed every run trains on a different training set.
# NOTE(review): `ratio` and `fit_sample` are the legacy imbalanced-learn
# spellings; newer releases use `sampling_strategy` and `fit_resample` —
# confirm against the installed version before upgrading.
sm = SMOTE(ratio = 1.0, random_state = 0)
x_train, y_train = sm.fit_sample(x_train, y_train)

In [None]:
# Fit a logistic regression with default settings on the training data and
# keep hard class predictions for the training and test sets.
logisticRegr = LogisticRegression()
logisticRegr.fit(x_train, y_train)
train_preds = logisticRegr.predict(x_train)
test_preds = logisticRegr.predict(x_test)

In [None]:
from scipy import stats


def _chisqprob(chisq, df):
    """Upper-tail probability of the chi-squared distribution with `df` degrees of freedom."""
    return stats.chi2.sf(chisq, df)


# Install the helper under the name `stats.chisqprob`.
# NOTE(review): presumably a compatibility shim for a statsmodels release
# that still calls scipy.stats.chisqprob — confirm and drop once unnecessary.
stats.chisqprob = _chisqprob

In [None]:
# Fit a statsmodels Logit on the same training data and keep its summary
# object so the coefficient table can be extracted from it.
# NOTE(review): x_train is passed as-is, with no add_constant call, so no
# explicit intercept column is added here — confirm that is intended.
# The name `model` is rebound to the Keras network further down.
model = sma.Logit(y_train, x_train)
result = model.fit()
results_summary = result.summary()

In [None]:
# Convert the second table of the summary into a DataFrame by rendering it to
# HTML and parsing that HTML back with pandas (first column becomes the index).
results_as_html = results_summary.tables[1].as_html()
table = pd.read_html(results_as_html, header=0, index_col=0)[0]

In [None]:
# Relabel the table rows with the feature names and show the first 20 rows.
# Assumes the table's row order matches X.columns — TODO confirm.
table.index = list(X.columns)
print(table.head(20))

In [None]:
# Accuracy = number of exact label matches divided by the number of samples.
n_train, n_test = len(y_train), len(y_test)
train_acc = np.sum(train_preds == y_train) / n_train
test_acc = np.sum(test_preds == y_test) / n_test

for label, score in (("Training Accuracy is: ", train_acc),
                     ("Testing Accuracy is: ", test_acc)):
    print(label, score)

In [None]:
# Confusion matrix of the current test predictions (true labels passed first).
cm = confusion_matrix(y_test,test_preds)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap, put the current test
# accuracy in the title and save the figure as a PNG in the working directory.
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Test Accuracy Score: {0}'.format(test_acc)
plt.title(all_sample_title, size = 15);
plt.savefig("Confusion_matrix_lr_final.png")

In [None]:
# ROC curves for the logistic regression.
#
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. Feeding it hard 0/1 predictions yields a curve with a single
# operating point and an AUC that only reflects the default 0.5 cut-off, so
# the predicted probability of the positive class is used instead.
# Column 1 of predict_proba corresponds to logisticRegr.classes_[1]; this
# assumes LoanStatusCat is coded 0/1 with 1 as the positive class — TODO confirm.
test_scores = logisticRegr.predict_proba(x_test)[:, 1]
train_scores = logisticRegr.predict_proba(x_train)[:, 1]

roc = roc_curve(y_test, test_scores)          # (fpr, tpr, thresholds)
roc_train = roc_curve(y_train, train_scores)  # (fpr, tpr, thresholds)

sns.set_style("darkgrid")
plt.plot(roc[0],roc[1],label = "test")
plt.plot(roc_train[0],roc_train[1],label = "train")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
#plt.savefig("lr_roc_final.png")
plt.show()

In [None]:
# Area under each ROC curve: roc[0] holds the false-positive rates and roc[1]
# the true-positive rates returned by roc_curve.
auc_score = auc(roc[0],roc[1])
print("Logistic Regression Test AUC Score is: ", auc_score)

auc_train_score = auc(roc_train[0],roc_train[1])
print("Logistic Regression Train AUC Score is: ", auc_train_score)

In [None]:
# Rank features by their (signed) logistic-regression coefficient, from the
# largest to the smallest value.
feat_import = logisticRegr.coef_[0]
order = np.argsort(feat_import)[::-1]   # indices in descending coefficient order
feat_import = feat_import[order]
col_names = cols[order]

for name, weight in zip(col_names, feat_import):
    print("Feature ", name, "has an importance of: ", weight)

In [None]:
# Random forest: 1000 trees, depth capped at 10, trained on all CPU cores.
# random_state is fixed because bootstrapping and feature sub-sampling are
# random; without a seed the accuracy, feature importances and ROC curve
# differ on every run.
rf = RandomForestClassifier(n_estimators = 1000, n_jobs = -1, max_depth = 10, random_state = 0)
rf.fit(x_train,y_train)

# Hard class predictions used for the accuracy and confusion matrix.
train_preds = rf.predict(x_train)
test_preds = rf.predict(x_test)

In [None]:
# Accuracy = number of exact label matches divided by the number of samples.
n_train, n_test = len(y_train), len(y_test)
train_acc = np.sum(train_preds == y_train) / n_train
test_acc = np.sum(test_preds == y_test) / n_test

for label, score in (("Training Accuracy is: ", train_acc),
                     ("Testing Accuracy is: ", test_acc)):
    print(label, score)

In [None]:
# Rank features by the forest's feature_importances_, highest first.
feat_import = rf.feature_importances_
order = np.argsort(feat_import)[::-1]   # indices in descending importance order
feat_import = feat_import[order]
col_names = cols[order]

for name, weight in zip(col_names, feat_import):
    print("Feature ", name, "has an importance of: ", weight)

In [None]:
# Confusion matrix of the current test predictions (true labels passed first).
cm = confusion_matrix(y_test,test_preds)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap with the current test
# accuracy in the title. Saving to disk is currently disabled.
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Test Accuracy Score: {0}'.format(test_acc)
plt.title(all_sample_title, size = 15);
#plt.savefig("Confusion_matrix_rf_final.png")

In [None]:
# ROC curves for the random forest.
#
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. Feeding it hard 0/1 predictions yields a curve with a single
# operating point, so the predicted probability of the positive class is used.
# Column 1 of predict_proba corresponds to rf.classes_[1]; this assumes
# LoanStatusCat is coded 0/1 with 1 as the positive class — TODO confirm.
test_scores = rf.predict_proba(x_test)[:, 1]
train_scores = rf.predict_proba(x_train)[:, 1]

roc = roc_curve(y_test, test_scores)          # (fpr, tpr, thresholds)
roc_train = roc_curve(y_train, train_scores)  # (fpr, tpr, thresholds)

sns.set_style("darkgrid")
plt.plot(roc[0],roc[1],label = "test")
plt.plot(roc_train[0],roc_train[1],label = "train")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("rf_roc_final.png")
plt.show()

In [None]:
# Area under each ROC curve: roc[0] holds the false-positive rates and roc[1]
# the true-positive rates returned by roc_curve.
auc_score = auc(roc[0],roc[1])
print("Random Forest Test AUC Score is: ", auc_score)

auc_train_score = auc(roc_train[0],roc_train[1])
print("Random Forest Train AUC Score is: ", auc_train_score)

In [None]:
### Neural Net

In [None]:
# No second split here. x_val / y_val and x_test / y_test already come from
# the train/validation/test split made before the first model was trained.
# Splitting x_test again at this point halved the held-out test set (and
# halved it once more on every re-run of this cell), discarded the existing
# validation set, and meant the network was scored on different rows than
# the logistic regression and the random forest.

In [None]:
# Fully connected binary classifier: 32 -> 64 -> 128 -> 64 -> 32 -> 1 units,
# with batch normalisation after every hidden Dense layer and one Dropout
# after the widest layer. The output is a single sigmoid unit.
model = Sequential()
dims = x_train.shape[1]  # number of input features

# The input shape is declared on the FIRST layer. A Sequential model takes its
# input specification from its first layer; declaring `input_dim` on the
# second layer (as before) left the model unbuilt until fit() was called, so
# e.g. model.summary() could not be used beforehand.
model.add(BatchNormalization(input_shape=(dims,)))

# NOTE(review): this first Dense layer has no activation (it is linear),
# unlike the other hidden layers — confirm that is intended.
model.add(Dense(32))
model.add(BatchNormalization())

model.add(Dense(64,activation = "relu"))
model.add(BatchNormalization())

model.add(Dense(128,activation = "relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(64,activation = "relu"))
model.add(BatchNormalization())

model.add(Dense(32,activation = "relu"))
model.add(BatchNormalization())

model.add(Dense(1,activation = "sigmoid"))

In [None]:
# Training configuration: 40 epochs, mini-batches of 256, Adam at lr 0.001,
# binary cross-entropy loss with accuracy reported as a metric.
num_epochs = 40
b_size = 256
# Early-stopping callback on validation loss. It is created here but NOT
# passed to the active fit() call below, so training always runs all epochs.
es = EarlyStopping(monitor='val_loss', mode='min')
optim = Adam(lr = 0.001)
model.compile(optimizer = optim, metrics = ["accuracy"],loss = "binary_crossentropy")
#print(model.summary())

# Disabled variant that used the early-stopping callback:
#model.fit(x_train,y_train,batch_size = b_size, epochs = num_epochs,callbacks = [es],validation_data = (x_val,y_val))
# Active call: same arguments minus the callback; (x_val, y_val) is used for
# per-epoch validation.
model.fit(x_train,y_train,batch_size = b_size, epochs = num_epochs,validation_data = (x_val,y_val))

In [None]:
# Turn the network's sigmoid outputs into hard 0/1 labels: values above 0.5
# become 1, everything else 0, flattened to a 1-D array.
train_preds = (model.predict(x_train) > 0.5).astype(int).flatten()
test_preds = (model.predict(x_test) > 0.5).astype(int).flatten()

In [None]:
# Accuracy = number of exact label matches divided by the number of samples.
n_train, n_test = len(y_train), len(y_test)
train_acc = np.sum(train_preds == y_train) / n_train
test_acc = np.sum(test_preds == y_test) / n_test

for label, score in (("Training Accuracy is: ", train_acc),
                     ("Testing Accuracy is: ", test_acc)):
    print(label, score)

In [None]:
# Confusion matrix of the current test predictions (true labels passed first).
cm = confusion_matrix(y_test,test_preds)
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
# Use the accuracy that was just computed for this model. The title previously
# carried a hard-coded number from an earlier run, which no longer matched the
# matrix shown beneath it once the model was retrained.
all_sample_title = 'Test Accuracy Score: {0}'.format(test_acc)
plt.title(all_sample_title, size = 15);
#plt.savefig("Confusion_matrix_nn_final.png")

In [None]:
# ROC curves for the neural network.
#
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. The thresholded 0/1 labels give a curve with a single operating
# point, so the raw sigmoid outputs of the network are used instead.
test_scores = model.predict(x_test).ravel()
train_scores = model.predict(x_train).ravel()

roc = roc_curve(y_test, test_scores)          # (fpr, tpr, thresholds)
roc_train = roc_curve(y_train, train_scores)  # (fpr, tpr, thresholds)

sns.set_style("darkgrid")
plt.plot(roc[0],roc[1],label = "test")
plt.plot(roc_train[0],roc_train[1],label = "train")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("nn_roc_final.png")
plt.show()

In [None]:
# Area under each ROC curve: roc[0] holds the false-positive rates and roc[1]
# the true-positive rates returned by roc_curve.
auc_score = auc(roc[0],roc[1])
print("Neural Network Test AUC Score is: ", auc_score)

auc_train_score = auc(roc_train[0],roc_train[1])
print("Neural Network Train AUC Score is: ", auc_train_score)