# Importing Ml_credit_application table from PostgreSQL database


In [1]:
# Python SQL toolkit and Object Relational Mapper
import pandas as pd
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import config as creds
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

In [2]:
get_ipython().run_line_magic('load_ext', 'sql')

In [3]:
# Create connection with PostgreSQL databse
get_ipython().run_line_magic('sql', 'postgresql://postgres:{creds.password}@{creds.path}:5432/postgres')

In [4]:
# reflect an existing database into a new model
engine = create_engine(f"postgresql://postgres:{creds.password}@{creds.path}:5432/postgres")
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [5]:
# We can view all of the classes that automap found
Base.classes.keys()

['application_record', 'visual_creditapp']

In [6]:
application = Base.classes.application_record
session = Session(engine)
results = []
results = session.query(application)

# Machine Learning

In [7]:
# Initial imports.
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Feature selection

In [8]:
credit_application_df = pd.read_csv("../Resources/datasets/ML_credit_application.csv")
credit_application_df.drop(['ID'], axis=1, inplace=True)
# Create our features
X = credit_application_df.drop(columns="STATUS_y")

# Create our target
y = pd.DataFrame(credit_application_df["STATUS_y"])



## Split the Data into Training and Testing


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

Counter(y_train)


Counter({'STATUS_y': 1})

In [10]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(27342, 43)
(9115, 43)
(27342, 1)
(9115, 1)


## Scaling Dataset

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)



# Class Imbalance
### The existing classes in the dataset is not equally represented. This is referred to as Class Imbalance and can cause the machine learning models to be biased toward the majority class. In this case, the machine learning models will be better at predicting not approved applicants. Hence, to counter this problem, we will be using Oversampling, Undersampling and Combination sampling techniques.


## First Oversampling technique: Random Oversampling

In [12]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression Model

In [13]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [14]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3774,3305
Actual 1,1002,1034


Accuracy Score : 0.5274821722435545
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.53      0.64      7079
           1       0.24      0.51      0.32      2036

    accuracy                           0.53      9115
   macro avg       0.51      0.52      0.48      9115
weighted avg       0.67      0.53      0.57      9115



## Decision Tree Model

In [15]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = tree_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5357,1722
Actual 1,778,1258


Accuracy Score : 0.7257268239166209
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.76      0.81      7079
           1       0.42      0.62      0.50      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.74      9115



## Random Forest Model

In [16]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5411,1668
Actual 1,781,1255


Accuracy Score : 0.7313219967087219
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.76      0.82      7079
           1       0.43      0.62      0.51      2036

    accuracy                           0.73      9115
   macro avg       0.65      0.69      0.66      9115
weighted avg       0.77      0.73      0.75      9115



In [17]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


[(0.2644402337732209, 'AGE'),
 (0.17930321617725453, 'AMT_INCOME_TOTAL'),
 (0.17608245489915794, 'EMPLOYMENT_PERIOD'),
 (0.03318701149819786, 'CNT_FAM_MEMBERS'),
 (0.027623446674924713, 'FLAG_OWN_REALTY_Y'),
 (0.026687582649528835, 'CNT_CHILDREN'),
 (0.02510243027376391, 'NAME_INCOME_TYPE_Working'),
 (0.024362424284928016, 'CODE_GENDER_M'),
 (0.024265001584654375, 'FLAG_OWN_CAR_Y'),
 (0.014657715931111582, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.014546113877179755, 'NAME_FAMILY_STATUS_Married'),
 (0.014060493322981139, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.013638445096537688, 'OCCUPATION_TYPE_Laborers'),
 (0.01312496463491489, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.012464410297654256, 'OCCUPATION_TYPE_Core staff'),
 (0.011303186081648238, 'OCCUPATION_TYPE_Sales staff'),
 (0.01053817477520548, 'NAME_INCOME_TYPE_State servant'),
 (0.00989954367011294, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.009891587066023066, 'OCCUPATION_TYPE_Managers'),
 (0.00892156

## Gradient Boosted Tree Model

In [18]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))


Learning rate:  0.05
Accuracy score (training): 0.541
Accuracy score (validation): 0.524
Learning rate:  0.1
Accuracy score (training): 0.547
Accuracy score (validation): 0.531
Learning rate:  0.25
Accuracy score (training): 0.558
Accuracy score (validation): 0.539
Learning rate:  0.5
Accuracy score (training): 0.574
Accuracy score (validation): 0.556
Learning rate:  0.75
Accuracy score (training): 0.573
Accuracy score (validation): 0.550
Learning rate:  1
Accuracy score (training): 0.593
Accuracy score (validation): 0.576


In [19]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=1, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)

cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4261,2818
Actual 1,1046,990


Accuracy Score : 0.5760833790455293
Classification Report
              precision    recall  f1-score   support

           0       0.80      0.60      0.69      7079
           1       0.26      0.49      0.34      2036

    accuracy                           0.58      9115
   macro avg       0.53      0.54      0.51      9115
weighted avg       0.68      0.58      0.61      9115



## Second Oversampling Technique: SMOTE

In [20]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train_scaled, y_train)
Counter(y_resampled)


Counter({'STATUS_y': 1})

## Logistic Regression

In [21]:
# Train the Logistic Regression model using the resampled data
log_model = LogisticRegression(solver='lbfgs', random_state=1)
log_model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
# Making predictions using the testing data.
predictions = log_model.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3683,3396
Actual 1,1009,1027


Accuracy Score : 0.5167306637410861
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.52      0.63      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.52      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.52      0.56      9115



## Decision Tree

In [23]:
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5932,1147
Actual 1,995,1041


Accuracy Score : 0.7650027427317608
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.48      0.51      0.49      2036

    accuracy                           0.77      9115
   macro avg       0.67      0.67      0.67      9115
weighted avg       0.77      0.77      0.77      9115



## Random Forest Model

In [24]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5981,1098
Actual 1,973,1063


Accuracy Score : 0.7727921009325288
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7079
           1       0.49      0.52      0.51      2036

    accuracy                           0.77      9115
   macro avg       0.68      0.68      0.68      9115
weighted avg       0.78      0.77      0.78      9115



[(0.2414271305165444, 'AGE'),
 (0.19323825085000354, 'AMT_INCOME_TOTAL'),
 (0.15689303307759755, 'EMPLOYMENT_PERIOD'),
 (0.036181568642070304, 'CNT_FAM_MEMBERS'),
 (0.034838137769655414, 'CNT_CHILDREN'),
 (0.03101211615182868, 'FLAG_OWN_REALTY_Y'),
 (0.030763565386238305, 'FLAG_OWN_CAR_Y'),
 (0.0291536367012491, 'NAME_INCOME_TYPE_Working'),
 (0.026184563490192995, 'CODE_GENDER_M'),
 (0.0202196821934919, 'NAME_FAMILY_STATUS_Married'),
 (0.01577692672344784, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014924307790380129, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.014878845339820685, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.013655826349676476, 'OCCUPATION_TYPE_Laborers'),
 (0.011831049845456711, 'OCCUPATION_TYPE_Core staff'),
 (0.011723546660751096, 'OCCUPATION_TYPE_Sales staff'),
 (0.009796947865996912, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.009771116330248358, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.009767209884515777, 'OCCUPATION_TYPE_Managers'),
 (0.0095

In [25]:
# # Import pickle library
# import pickle

# # save the model to disk
# filename = 'randomForest_SMOTE.sav'
# pickle.dump(rf_model, open(filename, 'wb'))


## Gradient Boost Tree

In [26]:
## Gradient Boot Tree
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
   classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=learning_rate,
   max_features=5,
   max_depth=3,
   random_state=0)
   classifier.fit(X_resampled, y_resampled)
   print("Learning rate: ", learning_rate)
   print("Accuracy score (training): {0:.3f}".format(
       classifier.score(
           X_train_scaled,
           y_train)))
   print("Accuracy score (validation): {0:.3f}".format(
       classifier.score(
           X_test_scaled,
           y_test)))

Learning rate:  0.05
Accuracy score (training): 0.576
Accuracy score (validation): 0.564
Learning rate:  0.1
Accuracy score (training): 0.572
Accuracy score (validation): 0.552
Learning rate:  0.25
Accuracy score (training): 0.598
Accuracy score (validation): 0.578
Learning rate:  0.5
Accuracy score (training): 0.605
Accuracy score (validation): 0.581
Learning rate:  0.75
Accuracy score (training): 0.612
Accuracy score (validation): 0.584
Learning rate:  1
Accuracy score (training): 0.619
Accuracy score (validation): 0.598


In [27]:
GB_classifier = GradientBoostingClassifier(n_estimators=20,
   learning_rate=0.75, max_features=5, max_depth=3, random_state=0)

GB_classifier.fit(X_resampled, y_resampled)
predictions = GB_classifier.predict(X_test_scaled)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
   cm, index=["Actual 0", "Actual 1"],
   columns=["Predicted 0", "Predicted 1"]
)
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4490,2589
Actual 1,1199,837


Accuracy Score : 0.5844212835984641
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.63      0.70      7079
           1       0.24      0.41      0.31      2036

    accuracy                           0.58      9115
   macro avg       0.52      0.52      0.50      9115
weighted avg       0.67      0.58      0.61      9115



## First Undersampling Technique: ClusterCentroids

In [28]:
# Resample the data using the ClusterCentroids resampler
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({'STATUS_y': 1})

## Logistic Regression Model

In [29]:
# Train the Logistic Regression model using the resampled data
model_under = LogisticRegression(solver='lbfgs', random_state=1)
model_under.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [30]:
# Making predictions using the testing data.
y_pred = model_under.predict(X_test_scaled)

# Calculated the accuracy score
acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Displaying results
print("Undersampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))

Undersampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115



## Decision Tree Model

In [31]:
## Decision Tree
# Creating the decision tree classifier instance.
tree_model = tree.DecisionTreeClassifier()
# Fitting the model.
tree_model.fit(X_resampled, y_resampled)

predictions = tree_model.predict(X_test_scaled)
acc_score = accuracy_score(y_test, predictions)

# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4810,2269
Actual 1,699,1337


Accuracy Score : 0.6743828853538124
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.76      7079
           1       0.37      0.66      0.47      2036

    accuracy                           0.67      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.67      0.70      9115



## Random Forest Model

In [32]:
## Random Forest
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128) 
# Fitting the model
rf_model = rf_model.fit(X_resampled, y_resampled)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)



Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,4810,2269
Actual 1,690,1346


Accuracy Score : 0.6753702687877126
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.68      0.76      7079
           1       0.37      0.66      0.48      2036

    accuracy                           0.68      9115
   macro avg       0.62      0.67      0.62      9115
weighted avg       0.76      0.68      0.70      9115



[(0.23686407625280734, 'AGE'),
 (0.18525961429438353, 'AMT_INCOME_TOTAL'),
 (0.18399300657100465, 'EMPLOYMENT_PERIOD'),
 (0.03415894625022409, 'CNT_FAM_MEMBERS'),
 (0.028913915709783022, 'FLAG_OWN_CAR_Y'),
 (0.027111531871883674, 'CNT_CHILDREN'),
 (0.02440135195035264, 'CODE_GENDER_M'),
 (0.023478460513502514, 'FLAG_OWN_REALTY_Y'),
 (0.020645934114714395, 'NAME_INCOME_TYPE_Working'),
 (0.01822471108253514, 'NAME_FAMILY_STATUS_Married'),
 (0.01585358490086493, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.015434831168523677, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.013240038732775334, 'OCCUPATION_TYPE_No Occupation Type'),
 (0.012876199531768818, 'OCCUPATION_TYPE_Laborers'),
 (0.012648192854890746, 'NAME_HOUSING_TYPE_House / apartment'),
 (0.011690703080955557, 'OCCUPATION_TYPE_Core staff'),
 (0.011147706270873307, 'NAME_INCOME_TYPE_State servant'),
 (0.010714098716313783, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.01024500473870663, 'OCCUPATION_TYPE_Sales staff'),


## First Combination (Over and Under) Sampling Technique: SMOTEENN

# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled,y_train)
Counter(y_resampled)

## Logistic Regression Model

In [33]:
# Train the Logistic Regression model using the resampled data
model_comb = LogisticRegression(solver='lbfgs', random_state=1)
model_comb.fit(X_resampled, y_resampled)

y_pred = model_comb.predict(X_test_scaled)

acc_score = accuracy_score(y_test, y_pred)

# Display the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create df for confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual High Risk", "Actual Low Risk"], columns=["Predicted High Risk", "Predicted Low Risk"])

# Print the imbalanced classification report
print("Combination (Over and Under) Sampling")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(classification_report(y_test, y_pred))


Combination (Over and Under) Sampling


Unnamed: 0,Predicted High Risk,Predicted Low Risk
Actual High Risk,3654,3425
Actual Low Risk,1014,1022


Accuracy Score : 0.5130005485463521
              precision    recall  f1-score   support

           0       0.78      0.52      0.62      7079
           1       0.23      0.50      0.32      2036

    accuracy                           0.51      9115
   macro avg       0.51      0.51      0.47      9115
weighted avg       0.66      0.51      0.55      9115

