# Importing Ml_credit_application table from PostgreSQL database


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
get_ipython().run_line_magic('load_ext', 'sql')

In [3]:
# Create connection with PostgreSQL databse
get_ipython().run_line_magic('sql', 'postgresql://postgres:{creds.password}@{creds.path}:5432/postgres')

In [4]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['ml_application_record', 'application_record', 'visual_creditapp']

In [7]:
credit_application_df = pd.read_sql('SELECT * FROM ml_application_record', engine)

In [9]:
credit_application_df.dtypes

id                       int64
code_gender              int64
flag_own_car             int64
flag_own_realty          int64
cnt_children             int64
amt_income_total       float64
name_income_type         int64
name_education_type      int64
name_family_status       int64
name_housing_type        int64
occupation_type          int64
cnt_fam_members        float64
age                    float64
employment_period      float64
status_y                 int64
code                     int64
dtype: object

# Machine Learning

In [10]:
# Initial imports.
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Feature selection

In [11]:
credit_application_df.drop(['id'], axis=1, inplace=True)
credit_application_df.drop(['code'], axis=1, inplace=True)

# Create our features
X = credit_application_df.drop(columns="status_y")

# Create our target
y = pd.DataFrame(credit_application_df["status_y"])



## Split the Data into Training and Testing


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)


Counter({'status_y': 1})

In [13]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 13)
(9115, 13)
(27342, 1)
(9115, 1)


## Scaling Dataset

In [14]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Class Imbalance
### The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.


## First Oversampling technique: Random Oversampling

In [15]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)


Counter({'status_y': 1})

## Logistic Regression Model

In [16]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [17]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3539,3540
Actual 1,986,1050


Accuracy Score : 0.5034558420186506
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.50      0.61      7079
           1       0.23      0.52      0.32      2036

    accuracy                           0.50      9115
   macro avg       0.51      0.51      0.46      9115
weighted avg       0.66      0.50      0.54      9115



## Decision Tree Model

In [18]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = tree_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5322,1757
Actual 1,740,1296


Accuracy Score : 0.726055951727921
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.75      0.81      7079
           1       0.42      0.64      0.51      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.78      0.73      0.74      9115



## Random Forest Model

In [19]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5435,1644
Actual 1,776,1260


Accuracy Score : 0.734503565551289
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.77      0.82      7079
           1       0.43      0.62      0.51      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.78      0.73      0.75      9115



In [20]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.27677096198765694, 'age'),
 (0.19195818771289055, 'employment_period'),
 (0.1740651816678719, 'amt_income_total'),
 (0.09328477681797231, 'occupation_type'),
 (0.03983330559178436, 'name_family_status'),
 (0.03536868453388802, 'name_income_type'),
 (0.03353757403712021, 'name_education_type'),
 (0.03260091704070759, 'cnt_fam_members'),
 (0.02699299670156437, 'flag_own_realty'),
 (0.025764170919673854, 'cnt_children'),
 (0.025089384859850968, 'code_gender'),
 (0.02241207968948978, 'flag_own_car'),
 (0.022321778439529198, 'name_housing_type')]

## Gradient Boosted Tree Model

In [21]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))


Learning rate:  0.05
Accuracy score (training): 0.540
Accuracy score (validation): 0.522
Learning rate:  0.1
Accuracy score (training): 0.542
Accuracy score (validation): 0.517
Learning rate:  0.25
Accuracy score (training): 0.566
Accuracy score (validation): 0.541
Learning rate:  0.5
Accuracy score (training): 0.569
Accuracy score (validation): 0.546
Learning rate:  0.75
Accuracy score (training): 0.594
Accuracy score (validation): 0.573
Learning rate:  1
Accuracy score (training): 0.574
Accuracy score (validation): 0.547


In [22]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3871,3208
Actual 1,920,1116


Accuracy Score : 0.5471201316511245
Classification Report
              precision    recall  f1-score   support

           0       0.81      0.55      0.65      7079
           1       0.26      0.55      0.35      2036

    accuracy                           0.55      9115
   macro avg       0.53      0.55      0.50      9115
weighted avg       0.69      0.55      0.58      9115



## Second Oversampling Technique: SMOTE

In [23]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)


Counter({'status_y': 1})

## Logistic Regression

In [24]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [25]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3570,3509
Actual 1,1006,1030


Accuracy Score : 0.5046626439934174
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.50      0.61      7079
           1       0.23      0.51      0.31      2036

    accuracy                           0.50      9115
   macro avg       0.50      0.51      0.46      9115
weighted avg       0.66      0.50      0.55      9115



## Decision Tree

In [26]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5931,1148
Actual 1,1001,1035


Accuracy Score : 0.7642347778387274
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.47      0.51      0.49      2036

    accuracy                           0.76      9115
   macro avg       0.66      0.67      0.67      9115
weighted avg       0.77      0.76      0.77      9115



## Random Forest Model

In [27]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5978,1101
Actual 1,978,1058


Accuracy Score : 0.771914426769062
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.49      0.52      0.50      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.68      0.68      9115
weighted avg       0.78      0.77      0.77      9115



[(0.2487366677990461, 'age'),
 (0.19699836083022954, 'amt_income_total'),
 (0.16750291288671937, 'employment_period'),
 (0.11391730489384921, 'occupation_type'),
 (0.047870084139161356, 'name_family_status'),
 (0.038016389182143395, 'name_income_type'),
 (0.03467169802828837, 'name_education_type'),
 (0.0340429085304141, 'cnt_fam_members'),
 (0.025576771208851965, 'flag_own_realty'),
 (0.024677012611349573, 'cnt_children'),
 (0.023625472758878396, 'code_gender'),
 (0.022503027748837952, 'flag_own_car'),
 (0.02186138938223044, 'name_housing_type')]

In [28]:
# # Import pickle library
import pickle

dictionary = {"model": rf_model, "scaler": X_scaler}

# # save the model to disk
filename = 'randomForest_SMOTE_new.sav'
pickle.dump(dictionary, open(filename, 'wb'))


## Gradient Boost Tree

In [29]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.557
Accuracy score (validation): 0.533
Learning rate:  0.1
Accuracy score (training): 0.583
Accuracy score (validation): 0.559
Learning rate:  0.25
Accuracy score (training): 0.600
Accuracy score (validation): 0.573
Learning rate:  0.5
Accuracy score (training): 0.636
Accuracy score (validation): 0.613
Learning rate:  0.75
Accuracy score (training): 0.639
Accuracy score (validation): 0.618
Learning rate:  1
Accuracy score (training): 0.651
Accuracy score (validation): 0.628


In [30]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4871,2208
Actual 1,1278,758


Accuracy Score : 0.6175534832693362
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.69      0.74      7079
           1       0.26      0.37      0.30      2036

    accuracy                           0.62      9115
   macro avg       0.52      0.53      0.52      9115
weighted avg       0.67      0.62      0.64      9115



## First Undersampling Technique: ClusterCentroids

In [31]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'status_y': 1})

## Logistic Regression Model

In [32]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled, y_resampled) 


LogisticRegression(random_state=1)

In [33]:
# Making predictions using the testing data.
y_pred = model_under.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4019,3060
Actual Low Risk,1161,875


Accuracy Score : 0.5369171695008228
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      7079
           1       0.22      0.43      0.29      2036

    accuracy                           0.54      9115
   macro avg       0.50      0.50      0.47      9115
weighted avg       0.65      0.54      0.57      9115



## Decision Tree Model

In [34]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4864,2215
Actual 1,708,1328


Accuracy Score : 0.6793198025233133
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.69      0.77      7079
           1       0.37      0.65      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



## Random Forest Model

In [35]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4817,2262
Actual 1,694,1342


Accuracy Score : 0.6756993965990126
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.77      7079
           1       0.37      0.66      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



[(0.2541399431441034, 'age'),
 (0.2005063691917014, 'employment_period'),
 (0.18756906878342727, 'amt_income_total'),
 (0.09208600339671669, 'occupation_type'),
 (0.043914556552202055, 'name_family_status'),
 (0.03501560531825361, 'cnt_fam_members'),
 (0.03472373143406642, 'name_education_type'),
 (0.03074141813172945, 'name_income_type'),
 (0.026626754792405075, 'cnt_children'),
 (0.025855210141755562, 'name_housing_type'),
 (0.02478419554301806, 'flag_own_car'),
 (0.022183146043609134, 'flag_own_realty'),
 (0.021853997527011983, 'code_gender')]

## First Combination (Over and Under) Sampling Technique: SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled)

## Logistic Regression Model

In [36]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled, y_resampled)

y_pred = model_comb.predict(X_test_scaled)

acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))


Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4019,3060
Actual Low Risk,1161,875


Accuracy Score : 0.5369171695008228
              precision    recall  f1-score   support

           0       0.78      0.57      0.66      7079
           1       0.22      0.43      0.29      2036

    accuracy                           0.54      9115
   macro avg       0.50      0.50      0.47      9115
weighted avg       0.65      0.54      0.57      9115

