# Importing Ml_credit_application table from PostgreSQL database


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
get_ipython().run_line_magic('load_ext', 'sql')

In [3]:
# Create connection with PostgreSQL databse
get_ipython().run_line_magic('sql', 'postgresql://postgres:{creds.password}@{creds.path}:5432/postgres')

In [4]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['application_record', 'visual_creditapp']

In [6]:
application = Base.classes.application_record
session = Session(engine)
results = []
results = session.query(application)

# Machine Learning

In [7]:
# Initial imports.
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Feature selection

In [8]:
credit_application_df = pd.read_csv("../resources/ML_credit_application.csv")
credit_application_df.drop(['ID'], axis=1, inplace=True)
# Create our features
X = credit_application_df.drop(columns="STATUS_y")

# Create our target
y = pd.DataFrame(credit_application_df["STATUS_y"])



## Split the Data into Training and Testing


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)


Counter({'STATUS_y': 1})

In [10]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 43)
(9115, 43)
(27342, 1)
(9115, 1)


## Scaling Dataset

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



# Class Imbalance
### The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.


## First Oversampling technique: Random Oversampling

In [12]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression Model

In [13]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3646,3433
Actual 1,997,1039


Accuracy Score : 0.5139879319802523
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.52      0.62      7079
           1       0.23      0.51      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115



## Decision Tree Model

In [15]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = tree_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5367,1712
Actual 1,773,1263


Accuracy Score : 0.7273724629731212
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      7079
           1       0.42      0.62      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.74      9115



## Random Forest Model

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5409,1670
Actual 1,791,1245


Accuracy Score : 0.7300054854635216
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      7079
           1       0.43      0.61      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.75      9115



In [17]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.26536849625352144, 'AGE'),
 (0.17991047785850717, 'AMT_INCOME_TOTAL'),
 (0.1751945652921347, 'EMPLOYMENT_PERIOD'),
 (0.03318845393614025, 'CNT_FAM_MEMBERS'),
 (0.02646657331136577, 'CNT_CHILDREN'),
 (0.025685869826296854, 'CODE_GENDER'),
 (0.025484682282033398, 'FLAG_OWN_REALTY'),
 (0.02548190505725296, 'FLAG_OWN_CAR'),
 (0.02435696580575567, 'NAME_INCOME_TYPE_Working'),
 (0.01519834005281654, 'NAME_FAMILY_STATUS_Married'),
 (0.014935888298271395, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014212911486672147, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.013740661185494902, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.013285132346579638, 'OCCUPATION_TYPE_Laborers'),
 (0.012399745252799294, 'OCCUPATION_TYPE_Core staff'),
 (0.011454392728635252, 'OCCUPATION_TYPE_Sales staff'),
 (0.011002983681447219, 'NAME_INCOME_TYPE_State servant'),
 (0.010091122437041793, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.0098680852660067, 'OCCUPATION_TYPE_Managers'),
 (0.0091652370485040

## Gradient Boosted Tree Model

In [18]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))


Learning rate:  0.05
Accuracy score (training): 0.535
Accuracy score (validation): 0.522
Learning rate:  0.1
Accuracy score (training): 0.543
Accuracy score (validation): 0.530
Learning rate:  0.25
Accuracy score (training): 0.549
Accuracy score (validation): 0.529
Learning rate:  0.5
Accuracy score (training): 0.563
Accuracy score (validation): 0.548
Learning rate:  0.75
Accuracy score (training): 0.570
Accuracy score (validation): 0.545
Learning rate:  1
Accuracy score (training): 0.590
Accuracy score (validation): 0.563


In [19]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4199,2880
Actual 1,1104,932


Accuracy Score : 0.5629182665935272
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.59      0.68      7079
           1       0.24      0.46      0.32      2036

    accuracy                           0.56      9115
   macro avg       0.52      0.53      0.50      9115
weighted avg       0.67      0.56      0.60      9115



## Second Oversampling Technique: SMOTE

In [34]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression

In [21]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3683,3396
Actual 1,1009,1027


Accuracy Score : 0.5167306637410861
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.52      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.52      0.56      9115



## Decision Tree

In [23]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5939,1140
Actual 1,1000,1036


Accuracy Score : 0.7652221612726275
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.51      0.49      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.67      0.67      9115
weighted avg       0.77      0.77      0.77      9115



## Random Forest Model

In [35]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5970,1109
Actual 1,971,1065


Accuracy Score : 0.7718047174986287
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.49      0.52      0.51      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.68      0.68      9115
weighted avg       0.78      0.77      0.77      9115



[(0.2425498853456231, 'AGE'),
 (0.1932249939813588, 'AMT_INCOME_TOTAL'),
 (0.1574650305737228, 'EMPLOYMENT_PERIOD'),
 (0.03722173262807689, 'CNT_FAM_MEMBERS'),
 (0.03290412306201037, 'CNT_CHILDREN'),
 (0.030423212085961507, 'FLAG_OWN_REALTY'),
 (0.030073557041525575, 'FLAG_OWN_CAR'),
 (0.02982689822699816, 'NAME_INCOME_TYPE_Working'),
 (0.02560596573055158, 'CODE_GENDER'),
 (0.02002930238285172, 'NAME_FAMILY_STATUS_Married'),
 (0.015729040806886925, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014551517149669757, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.01441895710340275, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.01393876515622981, 'OCCUPATION_TYPE_Laborers'),
 (0.012024758723144707, 'OCCUPATION_TYPE_Core staff'),
 (0.011595371647595222, 'OCCUPATION_TYPE_Sales staff'),
 (0.010463459146527432, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.00983668498373623, 'OCCUPATION_TYPE_Managers'),
 (0.009722902429226136, 'NAME_INCOME_TYPE_State servant'),
 (0.009681581459613712, 'NA

In [36]:
# # Import pickle library
import pickle

# # save the model to disk
filename = 'randomForest_SMOTE.sav'
pickle.dump(rf_model, open(filename, 'wb'))


## Gradient Boost Tree

In [26]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.583
Accuracy score (validation): 0.569
Learning rate:  0.1
Accuracy score (training): 0.580
Accuracy score (validation): 0.562
Learning rate:  0.25
Accuracy score (training): 0.598
Accuracy score (validation): 0.579
Learning rate:  0.5
Accuracy score (training): 0.623
Accuracy score (validation): 0.593
Learning rate:  0.75
Accuracy score (training): 0.593
Accuracy score (validation): 0.564
Learning rate:  1
Accuracy score (training): 0.600
Accuracy score (validation): 0.575


In [27]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4222,2857
Actual 1,1116,920


Accuracy Score : 0.564125068568294
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.60      0.68      7079
           1       0.24      0.45      0.32      2036

    accuracy                           0.56      9115
   macro avg       0.52      0.52      0.50      9115
weighted avg       0.67      0.56      0.60      9115



## First Undersampling Technique: ClusterCentroids

In [28]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression Model

In [29]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [30]:
# Making predictions using the testing data.
y_pred = model_under.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115



## Decision Tree Model

In [31]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4797,2282
Actual 1,703,1333


Accuracy Score : 0.6725178277564454
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.76      7079
           1       0.37      0.65      0.47      2036

    accuracy                           0.67      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.67      0.70      9115



## Random Forest Model

In [32]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4790,2289
Actual 1,677,1359


Accuracy Score : 0.6746023038946791
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.68      0.76      7079
           1       0.37      0.67      0.48      2036

    accuracy                           0.67      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.67      0.70      9115



[(0.23869387409216927, 'AGE'),
 (0.18465264305748755, 'AMT_INCOME_TOTAL'),
 (0.18421181531939765, 'EMPLOYMENT_PERIOD'),
 (0.03356935940448145, 'CNT_FAM_MEMBERS'),
 (0.02939904831703595, 'FLAG_OWN_CAR'),
 (0.02652824349809737, 'CNT_CHILDREN'),
 (0.024905375492767452, 'FLAG_OWN_REALTY'),
 (0.023649954462595935, 'CODE_GENDER'),
 (0.021506700196176076, 'NAME_INCOME_TYPE_Working'),
 (0.01811413255131567, 'NAME_FAMILY_STATUS_Married'),
 (0.016010229090567183, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014979109425014073, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.012941261336811428, 'OCCUPATION_TYPE_Laborers'),
 (0.012925223568166243, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.012869429150198486, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.012062601950392774, 'OCCUPATION_TYPE_Core staff'),
 (0.010864643079285276, 'NAME_INCOME_TYPE_State servant'),
 (0.010683675647227192, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.01008730270506158, 'OCCUPATION_TYPE_Sales staff'),
 (0.00

## First Combination (Over and Under) Sampling Technique: SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled)

## Logistic Regression Model

In [33]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled, y_resampled)

y_pred = model_comb.predict(X_test_scaled)

acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))


Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115

