# Importing Ml_credit_application table from PostgreSQL database


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
get_ipython().run_line_magic('load_ext', 'sql')

In [3]:
# Create connection with PostgreSQL databse
get_ipython().run_line_magic('sql', 'postgresql://postgres:{creds.password}@{creds.path}:5432/postgres')

In [4]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['application_record', 'visual_creditapp']

In [6]:
application = Base.classes.application_record
session = Session(engine)
results = []
results = session.query(application)

# Machine Learning

In [7]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Feature selection

In [8]:
credit_application_df = pd.read_csv("../Resources/ML_credit_application.csv")
credit_application_df.drop(['ID'], axis=1, inplace=True)
# Create our features
X = credit_application_df.drop(columns="STATUS_y")

# Create our target
y = pd.DataFrame(credit_application_df["STATUS_y"])



## Split the Data into Training and Testing


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)


Counter({'STATUS_y': 1})

In [10]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 43)
(9115, 43)
(27342, 1)
(9115, 1)


## Scaling Dataset

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



# Class Imbalance
### The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.


## First Oversampling technique: Random Oversampling

In [12]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression Model

In [13]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3738,3341
Actual 1,1027,1009


Accuracy Score : 0.5207899067471201
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.53      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.52      0.56      9115



## Decision Tree Model

In [15]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = tree_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5341,1738
Actual 1,762,1274


Accuracy Score : 0.7257268239166209
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.75      0.81      7079
           1       0.42      0.63      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.74      9115



## Random Forest Model

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5432,1647
Actual 1,779,1257


Accuracy Score : 0.733845309928689
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.77      0.82      7079
           1       0.43      0.62      0.51      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.78      0.73      0.75      9115



In [17]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.2681890504197553, 'AGE'),
 (0.17599279893930972, 'AMT_INCOME_TOTAL'),
 (0.17357753385526395, 'EMPLOYMENT_PERIOD'),
 (0.03454358712646125, 'CNT_FAM_MEMBERS'),
 (0.028587901436233965, 'FLAG_OWN_REALTY_Y'),
 (0.026361357550779507, 'CNT_CHILDREN'),
 (0.025862919693237105, 'NAME_INCOME_TYPE_Working'),
 (0.024350678032607886, 'CODE_GENDER_M'),
 (0.024087242525229403, 'FLAG_OWN_CAR_Y'),
 (0.017949164671487904, 'NAME_FAMILY_STATUS_Married'),
 (0.015107910384100236, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014077084263026535, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.012980739621535415, 'OCCUPATION_TYPE_Laborers'),
 (0.012773983018377672, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.011532698327587416, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.01103658468536231, 'OCCUPATION_TYPE_Core staff'),
 (0.010858124890119496, 'NAME_INCOME_TYPE_State servant'),
 (0.010606696168106017, 'OCCUPATION_TYPE_Sales staff'),
 (0.010283849882443519, 'OCCUPATION_TYPE_Managers'),
 (0.00864

## Gradient Boosted Tree Model

In [18]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))


Learning rate:  0.05
Accuracy score (training): 0.518
Accuracy score (validation): 0.503
Learning rate:  0.1
Accuracy score (training): 0.514
Accuracy score (validation): 0.497
Learning rate:  0.25
Accuracy score (training): 0.555
Accuracy score (validation): 0.533
Learning rate:  0.5
Accuracy score (training): 0.561
Accuracy score (validation): 0.545
Learning rate:  0.75
Accuracy score (training): 0.563
Accuracy score (validation): 0.538
Learning rate:  1
Accuracy score (training): 0.586
Accuracy score (validation): 0.574


In [19]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4223,2856
Actual 1,1031,1005


Accuracy Score : 0.5735600658255623
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.60      0.68      7079
           1       0.26      0.49      0.34      2036

    accuracy                           0.57      9115
   macro avg       0.53      0.55      0.51      9115
weighted avg       0.68      0.57      0.61      9115



## Second Oversampling Technique: SMOTE

In [20]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression

In [21]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3683,3396
Actual 1,1009,1027


Accuracy Score : 0.5167306637410861
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.52      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.52      0.56      9115



## Decision Tree

In [23]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5926,1153
Actual 1,993,1043


Accuracy Score : 0.7645639056500274
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.47      0.51      0.49      2036

    accuracy                           0.76      9115
   macro avg       0.67      0.67      0.67      9115
weighted avg       0.77      0.76      0.77      9115



## Random Forest Model

In [24]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5955,1124
Actual 1,993,1043


Accuracy Score : 0.7677454744925947
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.51      0.50      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.68      0.67      9115
weighted avg       0.77      0.77      0.77      9115



[(0.24219088477478723, 'AGE'),
 (0.19338576396285256, 'AMT_INCOME_TOTAL'),
 (0.15736418758071824, 'EMPLOYMENT_PERIOD'),
 (0.03693719928194003, 'CNT_FAM_MEMBERS'),
 (0.03382458116430659, 'CNT_CHILDREN'),
 (0.030559567691105852, 'FLAG_OWN_CAR_Y'),
 (0.030032050463229095, 'NAME_INCOME_TYPE_Working'),
 (0.02983963631161779, 'FLAG_OWN_REALTY_Y'),
 (0.025892321265394837, 'CODE_GENDER_M'),
 (0.019908233743085715, 'NAME_FAMILY_STATUS_Married'),
 (0.016233578190704947, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.01537166221610715, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.01447188088208396, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.01384371209419692, 'OCCUPATION_TYPE_Laborers'),
 (0.011820852695779278, 'OCCUPATION_TYPE_Core staff'),
 (0.011597647702022975, 'OCCUPATION_TYPE_Sales staff'),
 (0.009878604487510746, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.009702481145350559, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.009660454346530128, 'NAME_INCOME_TYPE_State servant'),
 

## Gradient Boost Tree

In [25]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.576
Accuracy score (validation): 0.564
Learning rate:  0.1
Accuracy score (training): 0.572
Accuracy score (validation): 0.552
Learning rate:  0.25
Accuracy score (training): 0.598
Accuracy score (validation): 0.578
Learning rate:  0.5
Accuracy score (training): 0.605
Accuracy score (validation): 0.581
Learning rate:  0.75
Accuracy score (training): 0.612
Accuracy score (validation): 0.584
Learning rate:  1
Accuracy score (training): 0.619
Accuracy score (validation): 0.598


In [26]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4490,2589
Actual 1,1199,837


Accuracy Score : 0.5844212835984641
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.63      0.70      7079
           1       0.24      0.41      0.31      2036

    accuracy                           0.58      9115
   macro avg       0.52      0.52      0.50      9115
weighted avg       0.67      0.58      0.61      9115



## First Undersampling Technique: ClusterCentroids

In [27]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression Model

In [28]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [29]:
# Making predictions using the testing data.
y_pred = model_under.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115



## Decision Tree Model

In [30]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4820,2259
Actual 1,697,1339


Accuracy Score : 0.6756993965990126
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.77      7079
           1       0.37      0.66      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



## Random Forest Model

In [31]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4758,2321
Actual 1,687,1349


Accuracy Score : 0.6699945145364783
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.67      0.76      7079
           1       0.37      0.66      0.47      2036

    accuracy                           0.67      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.67      0.70      9115



[(0.2364064390693618, 'AGE'),
 (0.18478222199475283, 'EMPLOYMENT_PERIOD'),
 (0.1839592181677551, 'AMT_INCOME_TOTAL'),
 (0.03320140876505904, 'CNT_FAM_MEMBERS'),
 (0.029313327286417085, 'FLAG_OWN_CAR_Y'),
 (0.026248827579551035, 'CNT_CHILDREN'),
 (0.024540703967090474, 'FLAG_OWN_REALTY_Y'),
 (0.022351270424790154, 'NAME_INCOME_TYPE_Working'),
 (0.022328390685635954, 'CODE_GENDER_M'),
 (0.017528878736495574, 'NAME_FAMILY_STATUS_Married'),
 (0.016069310127594882, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.015062434331058971, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.013686416193626857, 'OCCUPATION_TYPE_Laborers'),
 (0.01333406170082822, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.012935030814204409, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.01205925466425303, 'OCCUPATION_TYPE_Core staff'),
 (0.011441143508265212, 'NAME_INCOME_TYPE_State servant'),
 (0.010757496461571951, 'OCCUPATION_TYPE_Sales staff'),
 (0.010669022625321592, 'NAME_FAMILY_STATUS_Single / not married'),


## First Combination (Over and Under) Sampling Technique: SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled)

## Logistic Regression Model

In [32]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled, y_resampled)

y_pred = model_comb.predict(X_test_scaled)

acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))


Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115

