# BankLoan Dataset
Responsibility : Yahia Ehab

## Imports

In [56]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For data preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# For implementing ensemble models
from sklearn.ensemble import (
    RandomForestClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# For model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns

## Dataset Selection

In [57]:
df = pd.read_csv("./AMLAss1Datasets/bankloan.csv")  # Suitable for regression

## Preprocessing

* Dataset has no nulls
* Choosing only columns that we will use
* Scaling the numerical features 
    ```python
    StandardScaler() #Scaling data to fit a standard normal distribution
    ```
* Encoding the categorical features
    ```python
    label_encoder = LabelEncoder() #Convert categorical variables into numerical format
    ```
* Removing outliers using `IQR` method

In [58]:
df = df[
    [
        "Age", "Experience", "Income",
        "Family", "CCAvg", "Education",
        "Mortgage", "Personal.Loan", "Securities.Account",
        "CD.Account", "Online", "CreditCard",
    ]
]

# Renaming the columns
df.columns = [
    "age", "experience", "income", "family",
    "cc_avg","education", "mortgage", 
    "personal_loan", "securities_account", 
    "cd_account", "online", "credit_card",
]

In [59]:
# Scaling the numerical features
scaler = StandardScaler()
df[["age", "experience", "income", "cc_avg", "mortgage"]] = scaler.fit_transform(
    df[["age", "experience", "income", "cc_avg", "mortgage"]]
)

# Encoding the categorical features
label_encoder = LabelEncoder()
df["education"] = label_encoder.fit_transform(df["education"])

In [60]:
df.head()

Unnamed: 0,age,experience,income,family,cc_avg,education,mortgage,personal_loan,securities_account,cd_account,online,credit_card
0,-1.774417,-1.666078,-0.538229,4,-0.193385,0,-0.555524,0,1,0,0,0
1,-0.029524,-0.09633,-0.864109,3,-0.250611,0,-0.555524,0,1,0,0,0
2,-0.552992,-0.445163,-1.363793,1,-0.536736,0,-0.555524,0,0,0,0,0
3,-0.90197,-0.968413,0.569765,1,0.436091,1,-0.555524,0,0,0,0,0
4,-0.90197,-1.055621,-0.62513,4,-0.536736,1,-0.555524,0,0,0,0,1


## Model Implementation &  Hyperparameter Turning

References :
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
* https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html 
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
* https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

### Train Test split

In [61]:
X = df.drop("personal_loan", axis=1)
y = df["personal_loan"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Random Forrest Classifier

In [62]:
# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Model evaluation
print("Random Forest Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred,zero_division=1))
print("Recall:", recall_score(y_test, y_pred,zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred,zero_division=1))

Random Forest Classifier
Accuracy: 0.991
Precision: 0.9897959183673469
Recall: 0.9238095238095239
F1 Score: 0.9556650246305419


In [63]:
# Hyperparameters
param_dist = {
    "n_estimators": [50, 100, 150, 200, 250],
    "max_depth": [3, 5, 7, 9, 11, 13],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_split": [2, 4, 6, 8, 10],
    "min_samples_leaf": [1, 2, 4, 6, 8],
    "bootstrap": [True, False],
}

In [64]:
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

rf_random_search = RandomizedSearchCV(
    rf_model, param_distributions=param_dist, n_iter=50, cv=5, random_state=42
)

rf_random_search.fit(X_train, y_train)

rf_random_search.best_params_

{'n_estimators': 250,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 11,
 'bootstrap': False}

In [66]:
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

rf_grid_search = GridSearchCV(rf_model, param_grid=param_dist, cv=5, n_jobs=-1)

rf_grid_search.fit(X_train, y_train)

rf_grid_search.best_params_

KeyboardInterrupt: 

In [None]:
# Random Forest Classifier with best hyperparameters
rf_model_best = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
)

rf_model_best.fit(X_train, y_train)
y_pred = rf_model_best.predict(X_test)

# Model evaluation
print("Random Forest Classifier with best hyperparameters")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred,zero_division=1))
print("Recall:", recall_score(y_test, y_pred,zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred,zero_division=1))


Random Forest Classifier with best hyperparameters
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


### AdaBoost

In [None]:
# AdaBoost Classifier
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train, y_train)
y_pred = ada_model.predict(X_test)

# Model evaluation
print("AdaBoost Classifier")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred,zero_division=1))
print("Recall:", recall_score(y_test, y_pred,zero_division=1))
print("F1 Score:", f1_score(y_test, y_pred,zero_division=1))

AdaBoost Classifier
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [None]:
# Hyperparameters
param_dist = {
    "n_estimators": [50, 100, 150, 200, 250],
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1],
}

In [None]:
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

ada_random_search = RandomizedSearchCV(
    ada_model, param_distributions=param_dist, n_iter=100, cv=5, random_state=42
)

ada_random_search.fit(X_train, y_train)

ada_random_search.best_params_

{'n_estimators': 50, 'learning_rate': 0.01}

In [None]:
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

ada_grid_search = GridSearchCV(ada_model, param_grid=param_dist, cv=5)

ada_grid_search.fit(X_train, y_train)

ada_grid_search.best_params_