# Importing Ml_credit_application table from PostgreSQL database


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
get_ipython().run_line_magic('load_ext', 'sql')

In [3]:
# Create connection with PostgreSQL databse
get_ipython().run_line_magic('sql', 'postgresql://postgres:{creds.password}@{creds.path}:5432/postgres')

In [4]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['application_record', 'visual_creditapp']

In [6]:
application = Base.classes.application_record
session = Session(engine)
results = []
results = session.query(application)

# Machine Learning

In [7]:
# Initial imports.
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Feature selection

In [9]:
credit_application_df = pd.read_csv("../resources/datasets/ML_credit_application.csv")
credit_application_df.drop(['ID'], axis=1, inplace=True)
# Create our features
X = credit_application_df.drop(columns="STATUS_y")

# Create our target
y = pd.DataFrame(credit_application_df["STATUS_y"])



## Split the Data into Training and Testing


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)


Counter({'STATUS_y': 1})

In [11]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 13)
(9115, 13)
(27342, 1)
(9115, 1)


## Scaling Dataset

In [12]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Class Imbalance
### The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.


## First Oversampling technique: Random Oversampling

In [13]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression Model

In [14]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [15]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3533,3546
Actual 1,977,1059


Accuracy Score : 0.5037849698299506
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.50      0.61      7079
           1       0.23      0.52      0.32      2036

    accuracy                           0.50      9115
   macro avg       0.51      0.51      0.46      9115
weighted avg       0.66      0.50      0.54      9115



## Decision Tree Model

In [16]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = tree_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5303,1776
Actual 1,745,1291


Accuracy Score : 0.7234229292375206
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.75      0.81      7079
           1       0.42      0.63      0.51      2036

    accuracy                           0.72      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.72      0.74      9115



## Random Forest Model

In [17]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5390,1689
Actual 1,768,1268


Accuracy Score : 0.730444322545255
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.76      0.81      7079
           1       0.43      0.62      0.51      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.78      0.73      0.75      9115



In [18]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.2784539061841769, 'AGE'),
 (0.19195045754034373, 'EMPLOYMENT_PERIOD'),
 (0.1749618333203321, 'AMT_INCOME_TOTAL'),
 (0.0937260421740874, 'OCCUPATION_TYPE'),
 (0.04130590183355127, 'NAME_FAMILY_STATUS'),
 (0.03548501905223345, 'NAME_INCOME_TYPE'),
 (0.032966680947901936, 'CNT_FAM_MEMBERS'),
 (0.032425619251565445, 'NAME_EDUCATION_TYPE'),
 (0.026495885765129103, 'FLAG_OWN_REALTY'),
 (0.025344350648683775, 'CODE_GENDER'),
 (0.024757170591110225, 'CNT_CHILDREN'),
 (0.021430408121898753, 'NAME_HOUSING_TYPE'),
 (0.02069672456898586, 'FLAG_OWN_CAR')]

## Gradient Boosted Tree Model

In [19]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))


Learning rate:  0.05
Accuracy score (training): 0.518
Accuracy score (validation): 0.502
Learning rate:  0.1
Accuracy score (training): 0.535
Accuracy score (validation): 0.516
Learning rate:  0.25
Accuracy score (training): 0.574
Accuracy score (validation): 0.545
Learning rate:  0.5
Accuracy score (training): 0.566
Accuracy score (validation): 0.544
Learning rate:  0.75
Accuracy score (training): 0.592
Accuracy score (validation): 0.566
Learning rate:  1
Accuracy score (training): 0.585
Accuracy score (validation): 0.563


In [20]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4067,3012
Actual 1,970,1066


Accuracy Score : 0.5631376851343939
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.57      0.67      7079
           1       0.26      0.52      0.35      2036

    accuracy                           0.56      9115
   macro avg       0.53      0.55      0.51      9115
weighted avg       0.69      0.56      0.60      9115



## Second Oversampling Technique: SMOTE

In [21]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression

In [22]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [23]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3570,3509
Actual 1,1006,1030


Accuracy Score : 0.5046626439934174
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.50      0.61      7079
           1       0.23      0.51      0.31      2036

    accuracy                           0.50      9115
   macro avg       0.50      0.51      0.46      9115
weighted avg       0.66      0.50      0.55      9115



## Decision Tree

In [24]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5944,1135
Actual 1,1004,1032


Accuracy Score : 0.7653318705430608
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.51      0.49      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.67      0.67      9115
weighted avg       0.77      0.77      0.77      9115



## Random Forest Model

In [25]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5938,1141
Actual 1,968,1068


Accuracy Score : 0.7686231486560614
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.52      0.50      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.68      0.68      9115
weighted avg       0.78      0.77      0.77      9115



[(0.24973084078758476, 'AGE'),
 (0.1970309298165691, 'AMT_INCOME_TOTAL'),
 (0.1665797359783071, 'EMPLOYMENT_PERIOD'),
 (0.11439489148770804, 'OCCUPATION_TYPE'),
 (0.04794893935725789, 'NAME_FAMILY_STATUS'),
 (0.03736903461900153, 'NAME_INCOME_TYPE'),
 (0.03467286739080155, 'NAME_EDUCATION_TYPE'),
 (0.03392849712402746, 'CNT_FAM_MEMBERS'),
 (0.026249373517129287, 'FLAG_OWN_REALTY'),
 (0.024344276968012994, 'CNT_CHILDREN'),
 (0.023380428426889793, 'CODE_GENDER'),
 (0.022533284380848628, 'FLAG_OWN_CAR'),
 (0.02183690014586199, 'NAME_HOUSING_TYPE')]

In [26]:
# # Import pickle library
import pickle

dictionary = {"model": rf_model, "scaler": X_scaler}

# # save the model to disk
filename = 'randomForest_SMOTE_new.sav'
pickle.dump(dictionary, open(filename, 'wb'))


## Gradient Boost Tree

In [27]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.557
Accuracy score (validation): 0.533
Learning rate:  0.1
Accuracy score (training): 0.583
Accuracy score (validation): 0.559
Learning rate:  0.25
Accuracy score (training): 0.600
Accuracy score (validation): 0.573
Learning rate:  0.5
Accuracy score (training): 0.636
Accuracy score (validation): 0.613
Learning rate:  0.75
Accuracy score (training): 0.639
Accuracy score (validation): 0.618
Learning rate:  1
Accuracy score (training): 0.651
Accuracy score (validation): 0.628


In [28]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4871,2208
Actual 1,1278,758


Accuracy Score : 0.6175534832693362
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.69      0.74      7079
           1       0.26      0.37      0.30      2036

    accuracy                           0.62      9115
   macro avg       0.52      0.53      0.52      9115
weighted avg       0.67      0.62      0.64      9115



## First Undersampling Technique: ClusterCentroids

In [29]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression Model

In [30]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled, y_resampled) 


LogisticRegression(random_state=1)

In [31]:
# Making predictions using the testing data.
y_pred = model_under.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4019,3060
Actual Low Risk,1161,875


Accuracy Score : 0.5369171695008228
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      7079
           1       0.22      0.43      0.29      2036

    accuracy                           0.54      9115
   macro avg       0.50      0.50      0.47      9115
weighted avg       0.65      0.54      0.57      9115



## Decision Tree Model

In [32]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4866,2213
Actual 1,703,1333


Accuracy Score : 0.6800877674163467
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.69      0.77      7079
           1       0.38      0.65      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



## Random Forest Model

In [33]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4813,2266
Actual 1,689,1347


Accuracy Score : 0.675809105869446
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.77      7079
           1       0.37      0.66      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



[(0.25434520254723714, 'AGE'),
 (0.20316198111660222, 'EMPLOYMENT_PERIOD'),
 (0.18807378812315492, 'AMT_INCOME_TOTAL'),
 (0.09020690273157055, 'OCCUPATION_TYPE'),
 (0.043126401205716, 'NAME_FAMILY_STATUS'),
 (0.03543764033229364, 'NAME_EDUCATION_TYPE'),
 (0.03329217842199752, 'CNT_FAM_MEMBERS'),
 (0.030529455955449893, 'NAME_INCOME_TYPE'),
 (0.026130045742184724, 'FLAG_OWN_CAR'),
 (0.02557679979098576, 'NAME_HOUSING_TYPE'),
 (0.025036656559007803, 'CNT_CHILDREN'),
 (0.02289553700677564, 'FLAG_OWN_REALTY'),
 (0.022187410467024077, 'CODE_GENDER')]

## First Combination (Over and Under) Sampling Technique: SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled)

## Logistic Regression Model

In [34]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled, y_resampled)

y_pred = model_comb.predict(X_test_scaled)

acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))


Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4019,3060
Actual Low Risk,1161,875


Accuracy Score : 0.5369171695008228
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      7079
           1       0.22      0.43      0.29      2036

    accuracy                           0.54      9115
   macro avg       0.50      0.50      0.47      9115
weighted avg       0.65      0.54      0.57      9115

