## Import Dependencies

In [134]:
# Import our dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter

In [135]:
import warnings
warnings.filterwarnings('ignore')

## Import Datasets

In [136]:
# Import dataset - credit_record.csv
credit_record_df = pd.read_csv('../Resources/credit_record.csv')
# Import dataset - application_record.csv
application_record_df = pd.read_csv('../Resources/application_record.csv')

## Datasets Demographics

In [137]:
# Dataframes display

credit_record_df.head(5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [138]:
# Dataframes display

application_record_df.head(5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [139]:
# Get the info of the Dataframe - credit_record_df
credit_record_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [140]:
# Get the info of the DataFrame - application_record_df
application_record_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

### Get the unique counts of each variable from the two DataFrames

In [141]:
credit_record_df.nunique()


ID                45985
MONTHS_BALANCE       61
STATUS                8
dtype: int64

In [142]:
application_record_df.nunique()

ID                     438510
CODE_GENDER                 2
FLAG_OWN_CAR                2
FLAG_OWN_REALTY             2
CNT_CHILDREN               12
AMT_INCOME_TOTAL          866
NAME_INCOME_TYPE            5
NAME_EDUCATION_TYPE         5
NAME_FAMILY_STATUS          5
NAME_HOUSING_TYPE           6
DAYS_BIRTH              16379
DAYS_EMPLOYED            9406
FLAG_MOBIL                  1
FLAG_WORK_PHONE             2
FLAG_PHONE                  2
FLAG_EMAIL                  2
OCCUPATION_TYPE            18
CNT_FAM_MEMBERS            13
dtype: int64

# Cleaning "credit_record_df"

In [143]:
# Count how many entries for each STATUS
credit_record_df['STATUS'].value_counts()

C    442031
0    383120
X    209230
1     11090
5      1693
2       868
3       320
4       223
Name: STATUS, dtype: int64

In [144]:
#Calculate the age of each MONTH_BALANCE & 
credit_record_df['Months_from_Today']= credit_record_df['MONTHS_BALANCE']* -1

In [145]:
# Sort by ID and Months_from_Today
credit_record_df = credit_record_df.sort_values(['ID','Months_from_Today'], ascending=True)
credit_record_df

Unnamed: 0,ID,MONTHS_BALANCE,STATUS,Months_from_Today
0,5001711,0,X,0
1,5001711,-1,0,1
2,5001711,-2,0,2
3,5001711,-3,0,3
4,5001712,0,C,0
...,...,...,...,...
1048570,5150487,-25,C,25
1048571,5150487,-26,C,26
1048572,5150487,-27,C,27
1048573,5150487,-28,C,28


### From above, oversampling will be an issue as over 99% of the data shows "bad" applicants

# Lucas' Idea of "Good" and "Bad" applicants
## First, we replace X and C to be 0 and 0-5 to be 1. The main idea here is count the number of bad months each participant has and if a participant has 3 or more "Bad" months (three 1's) then the participant is "Bad" and is then rejected for a credit card.



In [146]:
credit_record_df["STATUS"].replace({"X":0, "C":0, "0":1, "1":1, "2":1, "3":1, "4":1, "5":1} , inplace= True )

In [147]:
credit_record_df

Unnamed: 0,ID,MONTHS_BALANCE,STATUS,Months_from_Today
0,5001711,0,0,0
1,5001711,-1,1,1
2,5001711,-2,1,2
3,5001711,-3,1,3
4,5001712,0,0,0
...,...,...,...,...
1048570,5150487,-25,0,25
1048571,5150487,-26,0,26
1048572,5150487,-27,0,27
1048573,5150487,-28,0,28


In [148]:
lucas_df = credit_record_df
bad_months = lucas_df.groupby(["ID"])["STATUS"].sum()
bad_months

ID
5001711     3
5001712    10
5001713     0
5001714     0
5001715     0
           ..
5150482    12
5150483     0
5150484    12
5150485     2
5150487     0
Name: STATUS, Length: 45985, dtype: int64

In [149]:
status = bad_months.apply(lambda x: 0 if x >= 3 else 1)

In [150]:
new_credit_record = credit_record_df.merge(status, how="left", on="ID")
new_credit_record

Unnamed: 0,ID,MONTHS_BALANCE,STATUS_x,Months_from_Today,STATUS_y
0,5001711,0,0,0,0
1,5001711,-1,1,1,0
2,5001711,-2,1,2,0
3,5001711,-3,1,3,0
4,5001712,0,0,0,0
...,...,...,...,...,...
1048570,5150487,-25,0,25,1
1048571,5150487,-26,0,26,1
1048572,5150487,-27,0,27,1
1048573,5150487,-28,0,28,1


In [151]:
# Group by ID 
new_credit_grouped = new_credit_record.drop(["STATUS_x"], axis= 1)
new_credit_grouped

Unnamed: 0,ID,MONTHS_BALANCE,Months_from_Today,STATUS_y
0,5001711,0,0,0
1,5001711,-1,1,0
2,5001711,-2,2,0
3,5001711,-3,3,0
4,5001712,0,0,0
...,...,...,...,...
1048570,5150487,-25,25,1
1048571,5150487,-26,26,1
1048572,5150487,-27,27,1
1048573,5150487,-28,28,1


In [152]:
new_credit = new_credit_grouped.groupby("ID", as_index=False)
new_credit = new_credit.first()
new_credit

Unnamed: 0,ID,MONTHS_BALANCE,Months_from_Today,STATUS_y
0,5001711,0,0,0
1,5001712,0,0,0
2,5001713,0,0,1
3,5001714,0,0,1
4,5001715,0,0,1
...,...,...,...,...
45980,5150482,-11,11,0
45981,5150483,0,0,1
45982,5150484,0,0,0
45983,5150485,0,0,1


# Cleaning "application_record_df"

## Removing duplicate ID's

In [153]:
# Find number of duplicates in 'ID' column
len(application_record_df['ID']) - len(application_record_df['ID'].unique())

47

In [154]:
# Drop the duplicate entries from the 'ID' Column

application_record_df = application_record_df.drop_duplicates('ID', keep='last')

In [155]:
application_record_df.shape

(438510, 18)

## Finding the Null columns

In [156]:
new_credit.isnull().sum()

ID                   0
MONTHS_BALANCE       0
Months_from_Today    0
STATUS_y             0
dtype: int64

In [157]:
application_record_df.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134187
CNT_FAM_MEMBERS             0
dtype: int64

In [158]:
# Replace nulls with "No Occupation Type"
application_record_df["OCCUPATION_TYPE"].fillna("No Occupation Type", inplace = True)
application_record_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,No Occupation Type,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,No Occupation Type,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [159]:
application_record_df.isnull().sum()

ID                     0
CODE_GENDER            0
FLAG_OWN_CAR           0
FLAG_OWN_REALTY        0
CNT_CHILDREN           0
AMT_INCOME_TOTAL       0
NAME_INCOME_TYPE       0
NAME_EDUCATION_TYPE    0
NAME_FAMILY_STATUS     0
NAME_HOUSING_TYPE      0
DAYS_BIRTH             0
DAYS_EMPLOYED          0
FLAG_MOBIL             0
FLAG_WORK_PHONE        0
FLAG_PHONE             0
FLAG_EMAIL             0
OCCUPATION_TYPE        0
CNT_FAM_MEMBERS        0
dtype: int64

## Convert string columns to numbers

In [160]:
#Convert string columns to numbers using get_dummies
application_record_df = pd.get_dummies(application_record_df, columns=["CODE_GENDER","FLAG_OWN_CAR","FLAG_OWN_REALTY",
                                  "NAME_INCOME_TYPE","NAME_EDUCATION_TYPE","NAME_FAMILY_STATUS",
                                  "NAME_HOUSING_TYPE","OCCUPATION_TYPE"], drop_first = True)

In [161]:
application_record_df.head()

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,...,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_No Occupation Type,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff
0,5008804,0,427500.0,-12005,-4542,1,1,0,0,2.0,...,0,0,0,1,0,0,0,0,0,0
1,5008805,0,427500.0,-12005,-4542,1,1,0,0,2.0,...,0,0,0,1,0,0,0,0,0,0
2,5008806,0,112500.0,-21474,-1134,1,0,0,0,2.0,...,0,0,0,0,0,0,0,0,1,0
3,5008808,0,270000.0,-19110,-3051,1,0,1,1,1.0,...,0,0,0,0,0,0,1,0,0,0
4,5008809,0,270000.0,-19110,-3051,1,0,1,1,1.0,...,0,0,0,0,0,0,1,0,0,0


## Convert DAYS_BIRTH & DAYS_EMPLOYED into years

In [162]:
application_record_df['AGE'] = round(application_record_df['DAYS_BIRTH'] / -365, 2)

In [163]:
application_record_df['EMPLOYMENT_PERIOD'] = round(application_record_df['DAYS_EMPLOYED'] / -365, 2)

In [164]:
application_record_df.head(5)

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,...,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_No Occupation Type,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,AGE,EMPLOYMENT_PERIOD
0,5008804,0,427500.0,-12005,-4542,1,1,0,0,2.0,...,0,1,0,0,0,0,0,0,32.89,12.44
1,5008805,0,427500.0,-12005,-4542,1,1,0,0,2.0,...,0,1,0,0,0,0,0,0,32.89,12.44
2,5008806,0,112500.0,-21474,-1134,1,0,0,0,2.0,...,0,0,0,0,0,0,1,0,58.83,3.11
3,5008808,0,270000.0,-19110,-3051,1,0,1,1,1.0,...,0,0,0,0,1,0,0,0,52.36,8.36
4,5008809,0,270000.0,-19110,-3051,1,0,1,1,1.0,...,0,0,0,0,1,0,0,0,52.36,8.36


## Create bins for continous variable categories column 'AMT_INCOME_TOTAL'

In [165]:
application_record_df.AMT_INCOME_TOTAL.min()

26100.0

In [166]:
application_record_df.AMT_INCOME_TOTAL.max()

6750000.0

In [167]:
# Creating bins for income amount

# bins = [0,25000,50000,75000,100000,125000,150000,175000,200000,225000,250000,275000,300000,325000,350000,375000,400000,425000,450000,475000,500000,10000000000]
# slot = ['0-25000', '25000-50000','50000-75000','75000,100000','100000-125000', '125000-150000', '150000-175000','175000-200000',
#        '200000-225000','225000-250000','250000-275000','275000-300000','300000-325000','325000-350000','350000-375000',
#        '375000-400000','400000-425000','425000-450000','450000-475000','475000-500000','500000 and above']

# application_record_df['AMT_INCOME_RANGE']=pd.cut(application_record_df['AMT_INCOME_TOTAL'],bins,labels=slot)

# Lucas
# The scale for Annual Income is much larger than all the other values in the dataset. We format by dividing by 100000 to rescale those data points
application_record_df['AMT_INCOME_TOTAL'] = application_record_df['AMT_INCOME_TOTAL'] / 100000

In [168]:
# application_record_df['AGE'] = application_record_df['AGE'] / 10

## Remove unnecessary columns from application_record

In [169]:
# Remove the columns that are not needed from the application_record_df

Columns_to_remove = ['DAYS_BIRTH', 'DAYS_EMPLOYED','FLAG_MOBIL','FLAG_WORK_PHONE','FLAG_PHONE','FLAG_EMAIL']

application_record_df.drop(labels=Columns_to_remove,axis=1,inplace=True)

In [170]:
application_record_df.head(5)

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,CODE_GENDER_M,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,...,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_No Occupation Type,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,AGE,EMPLOYMENT_PERIOD
0,5008804,0,4.275,2.0,1,1,1,0,0,0,...,0,1,0,0,0,0,0,0,32.89,12.44
1,5008805,0,4.275,2.0,1,1,1,0,0,0,...,0,1,0,0,0,0,0,0,32.89,12.44
2,5008806,0,1.125,2.0,1,1,1,0,0,0,...,0,0,0,0,0,0,1,0,58.83,3.11
3,5008808,0,2.7,1.0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,52.36,8.36
4,5008809,0,2.7,1.0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,52.36,8.36


### Join the credit_record_grouped_df to the application_record_df

In [171]:
credit_application_df = application_record_df.join(new_credit.set_index('ID'), on='ID', how='inner')
credit_application_df.drop(['MONTHS_BALANCE'], axis=1, inplace=True)
credit_application_df.drop(['Months_from_Today'], axis=1, inplace=True)
credit_application_df.drop(['ID'], axis=1, inplace=True)
credit_application_df

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,CODE_GENDER_M,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_Y,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Working,...,OCCUPATION_TYPE_No Occupation Type,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,AGE,EMPLOYMENT_PERIOD,STATUS_y
0,0,4.275,2.0,1,1,1,0,0,0,1,...,1,0,0,0,0,0,0,32.89,12.44,1
1,0,4.275,2.0,1,1,1,0,0,0,1,...,1,0,0,0,0,0,0,32.89,12.44,1
2,0,1.125,2.0,1,1,1,0,0,0,1,...,0,0,0,0,0,1,0,58.83,3.11,0
3,0,2.700,1.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,52.36,8.36,1
4,0,2.700,1.0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,52.36,8.36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
434808,0,3.150,2.0,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,47.53,6.63,0
434809,0,1.575,2.0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,33.94,3.63,0
434810,0,1.575,2.0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,33.94,3.63,0
434811,0,2.835,2.0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,49.20,1.79,0


In [172]:
credit_application_df.nunique()

CNT_CHILDREN                                            9
AMT_INCOME_TOTAL                                      265
CNT_FAM_MEMBERS                                        10
CODE_GENDER_M                                           2
FLAG_OWN_CAR_Y                                          2
FLAG_OWN_REALTY_Y                                       2
NAME_INCOME_TYPE_Pensioner                              2
NAME_INCOME_TYPE_State servant                          2
NAME_INCOME_TYPE_Student                                2
NAME_INCOME_TYPE_Working                                2
NAME_EDUCATION_TYPE_Higher education                    2
NAME_EDUCATION_TYPE_Incomplete higher                   2
NAME_EDUCATION_TYPE_Lower secondary                     2
NAME_EDUCATION_TYPE_Secondary / secondary special       2
NAME_FAMILY_STATUS_Married                              2
NAME_FAMILY_STATUS_Separated                            2
NAME_FAMILY_STATUS_Single / not married                 2
NAME_FAMILY_ST

In [173]:
# Save Dataframe to CSV
# credit_application_df.to_csv('../resources/ML_credit_application.csv',index=False, header=True)

# Importing Ml_credit_application table from PostgreSQL database

In [174]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds

In [175]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [176]:
# Create connection with PostgreSQL databse
%sql postgresql://postgres:{creds.password}@{creds.path}:5432/postgres

In [177]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [178]:
# We can view all of the classes that automap found
Base.classes.keys()

['application_record', 'visual_creditapp']

In [179]:
import pandas as pd
application = Base.classes.application_record
session = Session(engine)
results = []
results = session.query(application)

# Machine Learning

In [180]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Split the Data into Training and Testing

In [181]:
# Create our features
X = credit_application_df.drop(columns="STATUS_y")

# Create our target
y = pd.DataFrame(credit_application_df["STATUS_y"])

In [182]:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)

Counter({'STATUS_y': 1})

In [183]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 43)
(9115, 43)
(27342, 1)
(9115, 1)


## Scale Data

In [184]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Class Imbalance
## The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.

# Random Oversampling

In [185]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression

In [186]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [187]:
# Calculated the balanced accuracy score

predictions = log_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

In [188]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [189]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3766,3313
Actual 1,1027,1009


Accuracy Score : 0.523861766319254
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.53      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.48      9115
weighted avg       0.66      0.52      0.56      9115



## SVM

In [190]:
# Instantiate a linear SVM model

# svc_model = SVC(probability=False)
# # Fit the data
# svc_model.fit(X_resampled, y_resampled)
# predictions = svc_model.predict(X_test_scaled)
# acc_score = accuracy_score(y_test, predictions)

In [191]:
# # Calculating the confusion matrix
# cm = confusion_matrix(y_test, predictions)

# # Create a DataFrame from the confusion matrix.
# cm_df = pd.DataFrame(
#     cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [192]:
# Displaying results
# print("Confusion Matrix")
# display(cm_df)
# print(f"Accuracy Score : {acc_score}")
# print("Classification Report")
# print(classification_report(y_test, predictions))


## Decision Tree

In [193]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

DecisionTreeClassifier()

In [194]:
predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

In [195]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [196]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5384,1695
Actual 1,777,1259


Accuracy Score : 0.7287986834887548
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      7079
           1       0.43      0.62      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.74      9115



# Random Forest

In [197]:

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 

In [198]:
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)


In [199]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [200]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5439,1640
Actual 1,798,1238


In [201]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [202]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5439,1640
Actual 1,798,1238


Accuracy Score : 0.7325287986834887
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.77      0.82      7079
           1       0.43      0.61      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.75      9115



In [203]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2656957513563365, 'AGE'),
 (0.18151085986558402, 'AMT_INCOME_TOTAL'),
 (0.17686047974760863, 'EMPLOYMENT_PERIOD'),
 (0.03192221333109384, 'CNT_FAM_MEMBERS'),
 (0.029363875851829452, 'FLAG_OWN_REALTY_Y'),
 (0.02612608694549809, 'CNT_CHILDREN'),
 (0.025614281182067187, 'NAME_INCOME_TYPE_Working'),
 (0.024783889861828754, 'CODE_GENDER_M'),
 (0.022787156954810178, 'FLAG_OWN_CAR_Y'),
 (0.014769174295555968, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014175886372357253, 'NAME_FAMILY_STATUS_Married'),
 (0.014175373151383456, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.014146630414030912, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.013112739799932435, 'OCCUPATION_TYPE_Laborers'),
 (0.012333160442368417, 'OCCUPATION_TYPE_Core staff'),
 (0.010972230531336584, 'NAME_INCOME_TYPE_State servant'),
 (0.010351804346324565, 'OCCUPATION_TYPE_Sales staff'),
 (0.010037073869115742, 'OCCUPATION_TYPE_Managers'),
 (0.009583096355057665, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.00861

## Gradient Boosted Tree

In [204]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.521
Accuracy score (validation): 0.506
Learning rate:  0.1
Accuracy score (training): 0.550
Accuracy score (validation): 0.540
Learning rate:  0.25
Accuracy score (training): 0.561
Accuracy score (validation): 0.542
Learning rate:  0.5
Accuracy score (training): 0.586
Accuracy score (validation): 0.559
Learning rate:  0.75
Accuracy score (training): 0.576
Accuracy score (validation): 0.566
Learning rate:  1
Accuracy score (training): 0.586
Accuracy score (validation): 0.572


In [205]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

In [206]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

In [207]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4245,2834
Actual 1,1064,972


Accuracy Score : 0.5723532638507954
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.60      0.69      7079
           1       0.26      0.48      0.33      2036

    accuracy                           0.57      9115
   macro avg       0.53      0.54      0.51      9115
weighted avg       0.68      0.57      0.61      9115



# SMOTE Oversampling

In [208]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression

In [209]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [210]:
# Calculated the balanced accuracy score
predictions = log_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

In [211]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [212]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3683,3396
Actual 1,1009,1027


Accuracy Score : 0.5167306637410861
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.52      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.52      0.56      9115



## Decision Tree

In [213]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5939,1140
Actual 1,998,1038


Accuracy Score : 0.7654415798134943
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.51      0.49      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.67      0.67      9115
weighted avg       0.77      0.77      0.77      9115



## Random Forest

In [214]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5974,1105
Actual 1,979,1057


Accuracy Score : 0.7713658804168952
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.49      0.52      0.50      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.68      0.68      9115
weighted avg       0.78      0.77      0.77      9115



[(0.24246529421774418, 'AGE'),
 (0.19207288319482124, 'AMT_INCOME_TOTAL'),
 (0.15810148435078292, 'EMPLOYMENT_PERIOD'),
 (0.03635705794359359, 'CNT_FAM_MEMBERS'),
 (0.03331739108325271, 'CNT_CHILDREN'),
 (0.030434214544559256, 'FLAG_OWN_REALTY_Y'),
 (0.030291229772631482, 'NAME_INCOME_TYPE_Working'),
 (0.03012085312293548, 'FLAG_OWN_CAR_Y'),
 (0.02607957181381715, 'CODE_GENDER_M'),
 (0.020273693337330173, 'NAME_FAMILY_STATUS_Married'),
 (0.016186492256501032, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.01517398089745551, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.014492122917668373, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.013538343700326293, 'OCCUPATION_TYPE_Laborers'),
 (0.011680999927447692, 'OCCUPATION_TYPE_Core staff'),
 (0.01137639489198916, 'OCCUPATION_TYPE_Sales staff'),
 (0.009964725906360357, 'OCCUPATION_TYPE_Managers'),
 (0.009959893403336025, 'NAME_INCOME_TYPE_State servant'),
 (0.00988136493460684, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.0096065476994

## Gradient Boost Tree

In [215]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.576
Accuracy score (validation): 0.564
Learning rate:  0.1
Accuracy score (training): 0.572
Accuracy score (validation): 0.552
Learning rate:  0.25
Accuracy score (training): 0.598
Accuracy score (validation): 0.578
Learning rate:  0.5
Accuracy score (training): 0.605
Accuracy score (validation): 0.581
Learning rate:  0.75
Accuracy score (training): 0.612
Accuracy score (validation): 0.584
Learning rate:  1
Accuracy score (training): 0.619
Accuracy score (validation): 0.598


In [216]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4490,2589
Actual 1,1199,837


Accuracy Score : 0.5844212835984641
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.63      0.70      7079
           1       0.24      0.41      0.31      2036

    accuracy                           0.58      9115
   macro avg       0.52      0.52      0.50      9115
weighted avg       0.67      0.58      0.61      9115



## Undersampling
We will test an undersampling algorithms to determine which algorithm results in the best performance compared to the other algorithms above.

In [217]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled_under, y_resampled_under = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled_under)

Counter({'STATUS_y': 1})

In [218]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled_under, y_resampled_under)

LogisticRegression(random_state=1)

In [219]:
# Calculated the balanced accuracy score
y_pred_under = model_under.predict(X_test_scaled)

acc_score =balanced_accuracy_score(y_test, y_pred_under)

In [220]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_under)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

cm_df

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


In [221]:
# Print the imbalanced classification report

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred_under))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5090696187372874
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115



## Combination (Over and Under) Sampling
We will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

In [222]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled_comb, y_resampled_comb = smote_enn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled_comb)

Counter({'STATUS_y': 1})

In [223]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled_comb, y_resampled_comb)

LogisticRegression(random_state=1)

In [224]:
# Calculated the balanced accuracy score
y_pred_comb = model_comb.predict(X_test_scaled)

acc_score = balanced_accuracy_score(y_test, y_pred_comb)

In [225]:
# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred_comb)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

cm_df

Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4531,2548
Actual Low Risk,1247,789


In [226]:
# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred_comb))

Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,4531,2548
Actual Low Risk,1247,789


Accuracy Score : 0.5137933568142414
              precision    recall  f1-score   support

           0       0.78      0.64      0.70      7079
           1       0.24      0.39      0.29      2036

    accuracy                           0.58      9115
   macro avg       0.51      0.51      0.50      9115
weighted avg       0.66      0.58      0.61      9115

