# Business Understanding

The goal of loan approval prediction is to predict whether a loan application from a new applicant should be approved or rejected.




In [1]:
!pip install --upgrade pip
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(font_scale=1)
pd.set_option ('display.max_columns', None)
pd.set_option ('display.max_rows', None)

# Import warnings to suppress warning messages

import warnings
warnings.filterwarnings("ignore")

# 1. Data Understanding

In [3]:
# Initializing dataset
df = pd.read_csv("loan_data_2007_2014.csv", index_col=0)
print(f'\n Shape of the Dataset: {df.shape[0]} rows and {df.shape[1]} columns')
df.head(5)

FileNotFoundError: [Errno 2] No such file or directory: 'loan_data_2007_2014.csv'

In [None]:
# Understand the data structure
df.info()

In [None]:
#Checking the shape of the dataframe
df.shape
print(f'row : {df.shape[0]}')
print(f'column : {df.shape[1]}')

# Exploring the descriptive statistics of the dataframe
df.describe().T

# 2. Exploratory Data Analysis

- Numerical and categorical features
- Data Cleaning
- Imbalanced data and outliers

## 2.1 Numerical & Categorical Features

In [None]:
# Separate numerical & categorical feature columns
numerical = df.select_dtypes (include = ['int64', 'float64'])
categorical = df.select_dtypes (include = 'object')

#Convert columns to list
numerical_col = numerical.columns.to_list()
categorical_col = categorical.columns.to_list()

### 2.1.1 Numerical Features

In [None]:
# Explore numerical features' descriptive
print(f'rows: {numerical.shape[0]}')
print (f'columns: {numerical.shape[1]}')

numerical.describe().T


### 2.1.2 Categorical Features

In [None]:
#Exploring the categorical features' descriptive

print (f'rows: {categorical.shape[0]}')
print (f'columns: {categorical.shape[1]}')

categorical.describe().T

## 2.2 Data Cleaning

Checking and handling:
- Duplicate values
- Missing values
- Irrelevant Features
- Data leakage
- Uninformative features

### 2.2.1 Duplicate Values

### 2.2.2 Missing Values

In [None]:
# Per-feature missing-value summary: absolute count, dtype and share of rows
sum_null = df.isna().sum()
missing_percent = sum_null * 100 / len(df)
data_type = list(df.dtypes)

# One row per feature, ordered from the most to the least incomplete
data_isnull = pd.DataFrame(
    {"total_null": sum_null, "data_type": data_type, "missing_%": missing_percent}
).sort_values("missing_%", ascending=False)

# Display only the features that actually contain missing values
data_isnull_sort = data_isnull.loc[data_isnull["missing_%"] > 0].reset_index()
data_isnull_sort

#### Handling Missing Values

##### a. Dropping 50% missing values

There are 21 columns in which at least 50% of the values are missing. They are therefore dropped from the dataframe.

In [None]:
# Drop features with missing value ≥ 50%

col_null = data_isnull.loc[data_isnull["missing_%"]>=50].index.tolist()
df.drop(columns = col_null, inplace = True)

In [None]:
df.shape
print(f'row : {df.shape[0]}')
print(f'column : {df.shape[1]}')
print(f'numericals_features : {df.select_dtypes(include=["int64", "float64"]).shape[1]}')
print (f'categorical_features : {df.select_dtypes(include = "object").shape[1]}')
df.head()

##### b. Replacing Missing Values with 0

There are features with missing values that can be replaced with 0:

- tot_coll_amt     = Total collection amounts ever owed -> assumption: a missing value means the customer has never owed any collection amount.
- tot_cur_bal	     = Total current balance of all accounts  -> assumption: customers don't have any current balance.
- total_rev_hi_lim = Total revolving high credit/credit limit -> assumption: customers don't have a revolving limit.

In [None]:
# Replacing the selected features with missing values with 0

for item in ["tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim"]:
    df[item] = df[item].fillna(0)

df[['tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']].head(3)

##### c. Handling missing values in numerical features

Since the numerical features are in continuous values, the median of each of these features is used to replace the missing values

In [None]:
# Replacing the missing values in numerical features with median
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

# Re-select from df AFTER filling: a frame captured before the fill is an
# independent copy and would still display the original NaNs.
numericals = df[numerical_cols]

print(f'numerical_features : {df.select_dtypes(include=["int64", "float64"]).shape[1]}')
numericals.head(3)

##### d. Handling missing values in categorical features

The mode of each of categorical features is used to replace the missing values

In [None]:
# Replacing the missing values in categorical features with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for item in categorical_cols:
    # mode() can return several values on ties; the first one is used
    df[item] = df[item].fillna(df[item].mode().iloc[0])

# Re-select from df AFTER filling: a frame captured before the fill is an
# independent copy and would still display the original NaNs.
categoricals = df[categorical_cols]

print(f'categorical_features : {df.select_dtypes(include="object").shape[1]}')
categoricals.head(3)

### 2.2.3 Irrelevant Features


Some features are irrelevant for modelling: they consist of (almost) unique entries, have no association with the target feature, or could introduce bias. These features are:

- id = A unique LC assigned ID for the loan listing. -> the unique entries with no association to the target feature
- member_id = A unique LC assigned Id for the borrower member. -> too many unique entries
- grade = LC assigned loan grade -> Instead, using sub_grade since it has more complete information
- emp_title = The job title supplied by the Borrower when applying for the loan -> too many unique entries
- url = URL for the LC page with listing data -> too many unique entries, no association with the target feature
- zip_code = The first 3 numbers of the zip code provided by the borrower in the loan application. -> can lead to bias
- addr_state = The state provided by the borrower in the loan application -> Can lead to bias
- title = The loan title provided by the borrower -> too many unique entries
- inq_last_6mths = The number of inquiries in the past 6 months.
- issue_d = The month in which the loan was funded

In [None]:
# drop the irrelevant features
df.drop(columns =["id","member_id","grade","emp_title","url", 
                       "zip_code", "addr_state","title", 
                       "inq_last_6mths","issue_d"], inplace = True)

### 2.2.4 Data Leakage

Some features are only recorded after the loan has been approved, so they are not available at the time the model makes a prediction. Using them would cause data leakage: unrealistically high accuracy during training and testing, but poor performance on real-world predictions.

- funded_amnt = The total amount committed to that loan at that point in time.
- funded_amnt_inv = The total amount committed to that loan at that point in time by investor
- out_prncp = Remaining outstanding principal for total amount funded
- out_prncp_inv = Remaining outstanding principal for portion of total amount funded by investors
- total_pymnt = Payments received to date for total amount funded
- total_pymnt_inv = Payments received to date for portion of total amount funded by investors
- total_rec_prncp = Principal received to date
- total_rec_int = Interest received to date
- total_rec_late_fee = Late fees received to date
- recoveries = Post charge-off gross recovery
- collection_recovery_fee = post charge off collection fee
- tot_coll_amt = Total collection amounts ever owed
- last_pymnt_amnt = Last total payment amount received
- revol_util = Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.        
- collections_12_mths_ex_med = Number of collections in 12 months excluding medical collections
- last_pymnt_d = Last month's payment was received
- next_pymnt_d = Next scheduled payment date
- last_credit_pull_d = The most recent month LC pulled credit for this loan
- initial_list_status = The initial listing status of the loan. Possible values are – Whole, Fractional
- total_rev_hi_lim = total revolving high credit/credit limit
  

In [None]:
# Drop the data leakage features
df.drop(columns = ['funded_amnt', 'funded_amnt_inv', 'out_prncp', 'out_prncp_inv', 
                   'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 
                   'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 
                   'tot_coll_amt', 'last_pymnt_amnt', 'revol_util', 'collections_12_mths_ex_med',
                   'last_pymnt_d','next_pymnt_d','last_credit_pull_d','initial_list_status',
                   'total_rev_hi_lim' ], inplace= True)

### 2.2.5 Uninformative Features for Classification

There are categorical features that don't give enough information for classification predictions due to their lack of heterogeneity in values. 

- pymnt_plan = Indicates if a payment plan has been put in place for the loan -> 99.998% has the same value
- policy_code = publicly available policy_code=1, new products not publicly available policy_code=2" -> only 1 unique
- application_type = Indicates whether the loan is an individual application or a joint application with two co-borrowers -> only 1 unique


In [None]:
fig, axs = plt.subplots(1, 3, figsize=(8, 5))

# Flatten the axes array to easily iterate
axs = axs.flatten()

# List of features to plot
features = ["pymnt_plan", "policy_code", "application_type"]

# Loop through features and axes
for i, feature in enumerate(features):
    df[feature].value_counts().plot.pie(
        ax=axs[i], 
        autopct='%.3f%%',
        startangle=90,
        ylabel="",  # hide default label
        legend=False
    )
    axs[i].set_title(feature)

# Overall title
fig.suptitle('Distribution of Uniformative Features', fontsize=25)
plt.tight_layout()
plt.show()


Thus, these three features need to be dropped since they bring very little information as features to make predictions

In [None]:
# drop the selected features

df.drop(columns= ["pymnt_plan", "policy_code", "application_type"], inplace = True)

In [None]:
# Checking the latest dataframe

print(f'numericals_features : {df.select_dtypes(include=["int64", "float64"]).shape[1]}')
print (f'categorical_features : {df.select_dtypes(include = "object").shape[1]}')

df.info()

In [None]:
# Checking and removing duplicates in the dataset
df.duplicated().sum()

No duplicate values

## 2.3 Univariate Analysis

### 2.3.1 Numerical Features Univariate Analysis

In [None]:
from scipy.stats import skew, kurtosis, probplot
from IPython.display import display_html


# Separate the latest numerical from categorical features
num_cols = df.select_dtypes(include=[np.number]).columns

# Statistics for numerical features
for col in num_cols:
    data = df[col].dropna()

    # Quantile Statistics
    quantile_stats = {
        'Minimum': data.min(),
        '5-th Percentile': np.percentile(data, 5),
        'Q1': data.quantile(0.25),
        'Median': data.median(),
        'Q3': data.quantile(0.75),
        '95-th Percentile': np.percentile(data, 95),
        'Maximum': data.max(),
        'Range': data.max() - data.min(),
        'IQR': data.quantile(0.75) - data.quantile(0.25)
    }

    # Descriptive Statistics
    descriptive_stats = {
        'Mean': data.mean(),
        'Standard Deviation': data.std(),
        'Variance': data.var(),
        'Sum': data.sum(),
        'Skewness': skew(data),
        'Kurtosis': kurtosis(data),
        'Coefficient of Variation': data.std() / data.mean()
    }

    # Convert to DataFrames
    quantile_df = pd.DataFrame(quantile_stats.items(), columns=["Statistic", "Value"])
    descriptive_df = pd.DataFrame(descriptive_stats.items(), columns=["Statistic", "Value"])

    # Convert DataFrames to HTML tables
    q_html = quantile_df.to_html(index=False)
    d_html = descriptive_df.to_html(index=False)

    # Display Side by Side
    print(f"\n=== Statistics for {col} ===")
    display_html(f"""
        <div style='display: flex; gap: 40px;'>
            <div><h3>Quantile Statistics</h3>{q_html}</div>
            <div><h3>Descriptive Statistics</h3>{d_html}</div>
        </div>
    """, raw=True)

    # Plots
    plt.figure(figsize=(16, 4))

    # Histogram + KDE
    plt.subplot(1, 3, 1)
    sns.histplot(data, kde=True)
    plt.title(f'Histogram & KDE: {col}')

    # Boxplot
    plt.subplot(1, 3, 2)
    sns.boxplot(x=data)
    plt.title(f'Boxplot: {col}')

    # Q-Q Plot
    plt.subplot(1, 3, 3)
    probplot(data, dist="norm", plot=plt)
    plt.title(f'Q-Q Plot: {col}')

    plt.tight_layout()
    plt.show()

In [None]:
from scipy.stats import skew

#Checking the skewness

def check_skewness_outlier_effect(df):
    """
    Compare the skewness of every numeric column before and after IQR outlier removal.

    For each numeric column the NaNs are dropped, the skewness is computed, the
    values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed and the skewness is
    computed again.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; only columns selected by ``np.number`` are analysed.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns 'Feature', 'Skewness Before',
        'Interpretation Before', 'Skewness After', 'Interpretation After' and
        'Change' (before minus after). Skewness is NaN and the interpretation is
        "Undefined" when the data has at most one distinct value.
    """

    def _safe_skew(values):
        # Skewness is 0/0 for empty or constant data. This happens routinely after
        # filtering when IQR == 0, because only the single dominant value survives.
        if values.nunique() <= 1:
            return np.nan
        return skew(values)

    def interpret_skew(value):
        # NaN must be handled first: every comparison with NaN is False, so it
        # would otherwise fall through to "Highly Skewed".
        if np.isnan(value):
            return "Undefined"
        magnitude = abs(value)
        if magnitude < 0.5:
            return "Symmetric"
        if magnitude < 1:
            return "Moderately Skewed"
        return "Highly Skewed"

    skew_report = []
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        data = df[col].dropna()

        # Skewness before
        original_skew = _safe_skew(data)

        # Outlier removal using IQR
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        filtered_data = data[(data >= lower_bound) & (data <= upper_bound)]

        # Skewness after
        filtered_skew = _safe_skew(filtered_data)

        skew_report.append({
            'Feature': col,
            'Skewness Before': round(original_skew, 4),
            'Interpretation Before': interpret_skew(original_skew),
            'Skewness After': round(filtered_skew, 4),
            'Interpretation After': interpret_skew(filtered_skew),
            'Change': round(original_skew - filtered_skew, 4)
        })

    return pd.DataFrame(skew_report)


In [None]:
report = check_skewness_outlier_effect(df)
print(report)

In [None]:
numerical_dataset = df.select_dtypes(include=['number'])
correlation_ = numerical_dataset.corr()

plt.figure(figsize=(25, 15))
sns.heatmap(correlation_, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()

### 2.3.2 Categorical Features Univariate Analysis

In [None]:
# Define the latest categorical features
cat_cols = df.select_dtypes(exclude=[np.number]).columns

# statistic function
def _cat_stats_core(series: pd.Series) -> pd.DataFrame:
    s_nn = series.dropna()
    if s_nn.empty:
        return pd.DataFrame({
            "Metric": [
                "Unique (cardinality)", "Mode (top category)", "Mode count", 
                "Mode (%)", "Entropy (bits)", "Gini diversity", 
                "Avg. length", "Min length", "Max length"
            ],
            "Value": [np.nan]*9
        })
    
    vc = s_nn.value_counts()
    mode_val = vc.index[0]
    mode_count = int(vc.iloc[0])
    mode_pct = mode_count / s_nn.shape[0] * 100
    p = (vc / s_nn.shape[0]).values
    entropy = -np.sum(p * np.log2(p))
    gini = 1 - np.sum(p**2)
    lens = s_nn.astype(str).str.len()
    avg_len, min_len, max_len = lens.mean(), lens.min(), lens.max()

    return pd.DataFrame({
        "Metric": [
            "Unique (cardinality)", "Mode (top category)", "Mode count",
            "Mode (%)", "Entropy (bits)", "Gini diversity",
            "Avg. length", "Min length", "Max length"
        ],
        "Value": [
            s_nn.nunique(), mode_val, mode_count, round(mode_pct, 2),
            round(entropy, 4), round(gini, 4),
            round(avg_len, 2), min_len, max_len
        ]
    })

# Main function
def analyze_categorical_features(df, top_n_bar=20, top_n_table=50, top_n_pie=10):
    """
    Display summary statistics, a value table and a bar chart for every categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; columns of dtype ``object`` or ``category`` are analysed.
    top_n_bar : int, default 20
        Number of most frequent categories drawn in the bar chart.
    top_n_table : int, default 50
        Number of most frequent categories listed in the value table.
    top_n_pie : int, default 10
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    None
        Output is rendered through ``display_html`` and matplotlib.
    """
    cat_cols = df.select_dtypes(include=["object", "category"]).columns
    if len(cat_cols) == 0:
        print("No categorical features found.")
        return

    for col in cat_cols:
        s = df[col]
        s_nn = s.dropna()

        # Stats table
        stats_df = _cat_stats_core(s)

        # Value table. rename_axis() names the category column explicitly, so the
        # result does not depend on how the installed pandas version labels the
        # index produced by value_counts().
        vc = s_nn.value_counts()
        vt = (
            vc.head(top_n_table)
              .rename_axis(col)
              .rename("Count")
              .to_frame()
              .assign(Percentage=lambda d: (d["Count"] / len(s_nn) * 100).round(2))
              .reset_index()
        )

        # Display tables side-by-side
        display_html(
            f"""
            <div style="display:flex; gap:40px; align-items:flex-start;">
                <div style="flex:0 0 360px;">
                    <h3>Stats for {col}</h3>
                    {stats_df.to_html(index=False)}
                </div>
                <div style="flex:1;">
                    <h3>Value Table (top {min(top_n_table, len(vc))})</h3>
                    {vt.to_html(index=False)}
                </div>
            </div>
            """,
            raw=True
        )

        # Bar plot on its own full-width axes (a 1x2 grid with only the first
        # panel drawn would leave half of the figure blank).
        fig, ax = plt.subplots(figsize=(14, 5))
        # Reverse so the most frequent category ends up at the top of the chart
        vc_bar = vc.head(top_n_bar)[::-1]
        ax.barh(vc_bar.index.astype(str), vc_bar.values)
        ax.set_title(f'Bar Plot (Top {min(top_n_bar, len(vc))}): {col}')
        ax.set_xlabel('Count')
        ax.set_ylabel('Category')

        fig.tight_layout()
        plt.show()


analyze_categorical_features(df)



In [None]:
from scipy.stats import entropy
import pandas as pd

def check_categorical_imbalance(df, dominance_threshold=3, entropy_threshold=1.0):
    """
    Flag categorical columns whose category distribution is imbalanced.

    A column is flagged "Imbalanced" when the ratio between its most and least
    frequent category is at least ``dominance_threshold`` or when the entropy of
    its category counts (``scipy.stats.entropy``, natural logarithm) is at most
    ``entropy_threshold``. Columns without any observed value are reported with
    the flag "No Data".

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; columns of dtype ``object`` or ``category`` are analysed.
    dominance_threshold : float, default 3
        Minimum max/min count ratio that marks a column as imbalanced.
    entropy_threshold : float, default 1.0
        Maximum entropy that marks a column as imbalanced.

    Returns
    -------
    pandas.DataFrame
        One row per categorical column.
    """
    cat_report = []
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    for col in categorical_cols:
        counts = df[col].value_counts()

        # An all-missing column has no category to report; idxmax() would raise
        # on it, so emit a placeholder row instead of aborting the whole report.
        if counts.sum() == 0:
            cat_report.append({
                'Feature': col,
                'Unique Categories': 0,
                'Entropy': None,
                'Dominance Ratio': None,
                'Most Common Category': None,
                'Most Common Freq (%)': None,
                'Imbalance Flag': "No Data"
            })
            continue

        # The ratio is left undefined when some category has a zero count
        dominance_ratio = counts.max() / counts.min() if counts.min() > 0 else None
        ent = entropy(counts)

        # Determine imbalance flag
        if (dominance_ratio and dominance_ratio >= dominance_threshold) or (ent <= entropy_threshold):
            flag = "Imbalanced"
        else:
            flag = "Balanced"

        cat_report.append({
            'Feature': col,
            'Unique Categories': len(counts),
            'Entropy': round(ent, 3),
            'Dominance Ratio': round(dominance_ratio, 2) if dominance_ratio else None,
            'Most Common Category': counts.idxmax(),
            'Most Common Freq (%)': round((counts.max()/counts.sum())*100, 2),
            'Imbalance Flag': flag
        })
    return pd.DataFrame(cat_report)

report = check_categorical_imbalance(df)
print(report)

## 2.4 Bivariate Analysis

In [None]:

from IPython.display import display_html

def bivariate_analysis(df, target="loan_status"):
    """
    Compare every feature against the loan outcome for finished loans.

    Only rows whose ``target`` is "Fully Paid" or "Charged Off" are kept; they
    are relabelled "Good Loan" and "Bad Loan". For each numeric feature a
    per-class ``describe()`` table, a boxplot and a KDE plot are shown; for each
    categorical feature a count/percentage crosstab and a grouped bar chart.
    The caller's dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the ``target`` column.
    target : str, default "loan_status"
        Name of the loan-status column.

    Returns
    -------
    None
        Output is rendered through ``display_html`` and matplotlib.
    """
    class_labels = {"Fully Paid": "Good Loan", "Charged Off": "Bad Loan"}
    # Single source of truth for class order and colours, shared by all plots
    class_colors = {"Good Loan": "green", "Bad Loan": "red"}
    class_order = list(class_colors)

    # Filter first, then copy, so the relabelling below writes to an independent frame
    df = df[df[target].isin(list(class_labels))].copy()
    df[target] = df[target].map(class_labels)

    # Separate features
    num_cols = df.select_dtypes(include="number").columns.drop(target, errors="ignore")
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.drop(target, errors="ignore")

    # --- Numerical Features ---
    for col in num_cols:
        print(f"\n=== Numerical Feature: {col} vs {target} ===")

        # Stats per class
        stats_good = df[df[target] == "Good Loan"][col].describe().round(2)
        stats_bad = df[df[target] == "Bad Loan"][col].describe().round(2)
        stats_df = pd.DataFrame({"Good Loan": stats_good, "Bad Loan": stats_bad})

        # Display side-by-side table
        display_html(f"""
        <h4>Descriptive Stats for {col}</h4>
        {stats_df.to_html()}
        """, raw=True)

        fig, (ax_box, ax_kde) = plt.subplots(1, 2, figsize=(14, 5))

        # Boxplot
        sns.boxplot(x=target, y=col, data=df, order=class_order, palette="Set2", ax=ax_box)
        ax_box.set_title(f"Boxplot of {col} by Loan Status")

        # KDE Plot
        for label, color in class_colors.items():
            sns.kdeplot(df[df[target] == label][col], label=label, fill=True,
                        alpha=0.4, color=color, ax=ax_kde)
        ax_kde.set_title(f"KDE of {col} by Loan Status")
        ax_kde.legend()

        fig.tight_layout()
        plt.show()

    # --- Categorical Features ---
    for col in cat_cols:
        print(f"\n=== Categorical Feature: {col} vs {target} ===")

        # Crosstab for counts & percentage. crosstab sorts its columns
        # alphabetically ("Bad Loan" first), so the order is pinned explicitly;
        # otherwise the positional colours below would be swapped.
        ct = pd.crosstab(df[col], df[target]).reindex(columns=class_order, fill_value=0)
        pct = ct.div(ct.sum(axis=1), axis=0).mul(100).round(2)
        combined = ct.astype(str) + " (" + pct.astype(str) + "%)"

        display_html(f"""
        <h4>Counts & Percentages for {col}</h4>
        {combined.to_html()}
        """, raw=True)

        # Bar Plot, drawn on explicit axes so no additional empty figure is created
        fig, ax = plt.subplots(figsize=(10, 5))
        ct.plot(kind="bar", stacked=False, ax=ax,
                color=[class_colors[name] for name in ct.columns])
        ax.set_title(f"{col} Distribution by Loan Status")
        ax.set_ylabel("Count")
        ax.set_xlabel(col)
        ax.tick_params(axis="x", labelrotation=45)
        fig.tight_layout()
        plt.show()


In [None]:
bivariate_analysis(df, target="loan_status")

# 3. Feature Engineering

- Feature transformation
- Feature Encoding
- Feature selection
- Scaling

## 3.1 Feature Transformation

Transforming categorical features to be numerical features.
- term
- earliest_cr_line

### 3.1.1 Feature: term

Converting the term value from 36 months or 60 months to be 36 or 60

In [None]:
#Convert feature term
df['term'] = df['term'].apply(lambda x: int(x.split()[0]))
df['term'] = df['term'].astype(int)

### 3.1.2 Feature: earliest_cr_line  

Extracting the feature earliest_cr_line date for the year only with 4-digit year format

In [None]:
# Convert strings like "Jan-85" to proper datetime first
parsed_cr_line = pd.to_datetime(df['earliest_cr_line'], format='%b-%y')

# Extract the 4-digit year.
# `%y` maps the two-digit years 00-68 to 20xx, so "Jan-62" is parsed as 2062.
# The data file is loan_data_2007_2014.csv, so a credit line opened after 2014
# is impossible: any later year is a 19xx date shifted forward by one century.
LAST_ISSUE_YEAR = 2014
cr_line_year = parsed_cr_line.dt.year
df['earliest_cr_line'] = cr_line_year.where(cr_line_year <= LAST_ISSUE_YEAR, cr_line_year - 100)

## 3.2 Feature Encoding

Feature encoding:
- Label encoding
- One-hot encoding

### 3.2.1 Label Encoding:
- Feature: sub_grade
- Feature: emp_length
- Feature: verification_status

#### a. Feature: sub_grade

Label Encoding scale: A1 = 35 (the best) → G5 = 1 (the worst)

In [None]:
# Data Sub-Grade
sub_grades = [
    "A1", "A2", "A3", "A4", "A5",
    "B1", "B2", "B3", "B4", "B5",
    "C1", "C2", "C3", "C4", "C5",
    "D1", "D2", "D3", "D4", "D5",
    "E1", "E2", "E3", "E4", "E5",
    "F1", "F2", "F3", "F4", "F5",
    "G1", "G2", "G3", "G4", "G5"]

# Label Encoding
sub_grade_labels = {grade: i for i, grade in enumerate(reversed(sub_grades), start=1)}
df['sub_grade'] = df['sub_grade'].map(sub_grade_labels)

#### b. Feature: emp_length

Convert emp_length to an integer.
- 0 and <1 year are considered 0
- 10+ years are considered 10
- the rest are the same

In [None]:
emp_length = {
    '0' : 0,
    '< 1 year': 0,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10
}

# Mapping the new 'emp_lenght' into the dataset
df['emp_length'] = df['emp_length'].map(emp_length)

#### c. Feature: verification_status

Convert verification_status to an integer with label encoding:
- Verified = 0
- Not Verified = 1
- Source Verified = 2

In [None]:
verification_status = {
    'Verified': 0,
    'Not Verified': 1,
    'Source Verified': 2
}

# Mapping the new 'verification_status' into the dataset
df['verification_status'] = df['verification_status'].map(verification_status)

### 3.2.2 One-Hot Encoding:
- Feature: home_ownership
- Feature: purpose

#### Feature: home_ownership

In [None]:
# Changing home_ownership to 4 categories
home_ownership = {
    'OTHER': 'OTHER',
    'NONE': 'OTHER',
    'ANY': 'OTHER'
}

# Mapping the new 'home_ownership' into the dataset
df['home_ownership'] = df['home_ownership'].replace(home_ownership)


#### One-Hot Encoding Feature: home_ownership & Feature: purpose

In [None]:
df = pd.get_dummies(df, columns=['home_ownership',
                                    'purpose'], drop_first=True)


## Final check on latest dataframe (df)

In [None]:
df.head()

In [None]:
df.info()

Categorical features have been transformed into numerical. Thus, all features are in numerical type now

## 3.3 Correlation

In [None]:
numerical_dataset = df.select_dtypes(include=['number'])
correlation_ = numerical_dataset.corr()

plt.figure(figsize=(25, 15))
sns.heatmap(correlation_, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()

The correlation heatmap shows that the features 'loan_amnt' and 'installment' are strongly correlated (0.95). Thus, only one of them is needed, and 'installment' will be dropped.

## 3.4 TARGET FEATURE

In [None]:
# New Dataframe with values in loan_status only 'Fully Paid' and 'Charged Off'

loan_data= df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()

#Mapping the target feature
loan_data['target'] = loan_data['loan_status'].map({
    'Fully Paid': 0,
    'Charged Off': 1,
})

In [None]:
loan_data.drop(columns= ['loan_status'], inplace = True)

In [None]:
loan_data['target'].value_counts()

In [None]:
# Separate Features & Target
# 'installment' is removed as announced in section 3.3: it is almost collinear
# with 'loan_amnt' (correlation 0.95), so only one of the two is kept.
X = loan_data.drop(columns=['target']).drop(columns=['installment'], errors='ignore')
y = loan_data['target']

## 3.5 Best Features

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [None]:
selector = SelectKBest(score_func=mutual_info_classif, k=20)  
X_new = selector.fit_transform(X, y)

In [None]:
scores = selector.scores_
feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'Score': scores
})

# Sort by highest score
feature_scores = feature_scores.sort_values(by='Score', ascending=False)
print(feature_scores)

### Split Train and Test Dataset

In [None]:
from functools import partial

from sklearn.model_selection import train_test_split

# Split BEFORE selecting features: a selector fitted on the full data has already
# seen the test labels, which leaks information and inflates the test scores.
# stratify=y keeps the good/bad loan ratio identical in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the selector on the training partition only, then apply it to both.
# mutual_info_classif is randomised, so it is seeded for reproducible scores.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=20)
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)

# 4. Modelling

In [None]:
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install imblearn

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score, roc_curve, auc, make_scorer,confusion_matrix,precision_recall_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

#XGB
import xgboost as xgb

import joblib

#hyperparameter
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, KFold, cross_val_score, cross_validate

from scipy.stats import uniform, randint

#import itertools
import itertools

### Defining Functions and Metrics

#### Evaluation Report

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Build a two-row (Train / Test) table of classification metrics for a fitted model.

    For each partition the table holds precision, recall and F1 for the classes
    reported under the keys "0" and "1", their macro averages, the accuracy, the
    ROC-AUC and the raw output of ``precision_recall_curve`` (stored as one
    object in the "precision_recall" column).

    The model must provide ``predict`` and ``predict_proba``.
    """
    stat_names = (("precision", "precision"), ("recall", "recall"), ("f1", "f1-score"))

    def _metrics_for(features, labels):
        predicted = model.predict(features)
        # Second predict_proba column; presumably the probability of class 1
        positive_prob = model.predict_proba(features)[:, 1]
        report = classification_report(labels, predicted, output_dict=True)

        row = {}
        # Per-class metrics first, then the macro averages (column order matters)
        for short, full in stat_names:
            for cls in ("0", "1"):
                row[f"{short}_{cls}"] = report[cls][full]
        for short, full in stat_names:
            row[f"macro_avg_{short}"] = report["macro avg"][full]

        row["accuracy"] = accuracy_score(labels, predicted)
        row["ROC-AUC"] = roc_auc_score(labels, positive_prob)
        row["precision_recall"] = precision_recall_curve(labels, positive_prob)
        return row

    # Convert the two metric dictionaries to a dataframe
    return pd.DataFrame(
        [_metrics_for(X_train, y_train), _metrics_for(X_test, y_test)],
        index=["Train", "Test"],
    )

#### Confusion Matrix

In [None]:
#confusion matrix function
def plot_confusion_matrix(c_matrix,
                          target_names,
                          title = 'Confusion Matrix',
                          cmap= None,
                          normalize= True):
    """
    Plot a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    c_matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix of raw counts.
    target_names : sequence of str or None
        Tick labels for both axes; no ticks are set when None.
    title : str, default 'Confusion Matrix'
        Figure title.
    cmap : matplotlib colormap or None
        Colormap; 'coolwarm' is used when None.
    normalize : bool, default True
        If True, each row is divided by its sum, and both the colours and the
        annotations show these proportions. If False, raw counts are shown.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    c_matrix = np.asarray(c_matrix)

    # Accuracy always comes from the raw counts:
    # np.trace = sum of all the elements of the diagonal of the matrix
    accuracy = np.trace(c_matrix) / np.sum(c_matrix).astype('float')
    misclass = 1 - accuracy

    # Get colormap instance
    if cmap is None:
        cmap = plt.get_cmap('coolwarm')

    # Normalize BEFORE drawing so colours, colorbar and annotations all describe
    # the same numbers. Rows without any sample stay at 0 instead of dividing by zero.
    if normalize:
        row_totals = c_matrix.sum(axis=1, keepdims=True).astype('float')
        shown = np.divide(c_matrix, row_totals,
                          out=np.zeros(c_matrix.shape, dtype='float'),
                          where=row_totals != 0)
    else:
        shown = c_matrix

    # Image size
    plt.figure(figsize=(8,6))

    # Display the matrix as an image, one flat-coloured square per cell
    plt.imshow(shown, interpolation='nearest', cmap= cmap)

    plt.title(title)

    # Add a colorbar to the plot
    plt.colorbar()

    # Set tick locations
    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells above the threshold get white text. Dividing by 1 would make the
    # threshold equal to the maximum, so that no cell could ever exceed it.
    threshold = shown.max()/1.5 if normalize else shown.max()/2

    # itertools.product yields every (row, column) index pair of the matrix
    for i,j in itertools.product(range(shown.shape[0]), range(shown.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(shown[i,j]),
                     horizontalalignment = 'center',
                     color = 'white' if shown[i,j] > threshold else 'black')
        else:
            plt.text(j, i, "{:,}".format(shown[i,j]),
                     horizontalalignment = 'center',
                     color = 'white' if shown[i,j] > threshold else 'black')
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

#### Hyperparameter Tuning Function

In [None]:
# Hyperparameter tuning helper built on RandomizedSearchCV

def tune_clf_hyperparameters_random(clf, param_distributions, X_train, y_train,
                                    scoring='recall', n_splits=3, n_iter=10, random_state=42):
    """Run a randomized hyperparameter search and return the refitted winner.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``RandomizedSearchCV`` as ``estimator``.
    param_distributions : dict
        Mapping of parameter name to a list of values or a distribution
        object exposing ``rvs`` (e.g. from ``scipy.stats``).
    X_train, y_train
        Training features and labels the search is fitted on.
    scoring : str, default 'recall'
        Metric used to rank the sampled parameter combinations.
    n_splits : int, default 3
        Number of cross-validation folds.
    n_iter : int, default 10
        Number of parameter combinations sampled.
    random_state : int, default 42
        Seed for the parameter sampling.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` from the fitted search.
    """
    search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        # Honour the n_splits argument instead of a hard-coded fold count.
        # An integer cv on a classifier means unshuffled stratified k-fold.
        cv=n_splits,
        scoring=scoring,
        n_jobs=-1,
        random_state=random_state,
        # With a single string `scoring`, any truthy refit simply refits the
        # best candidate on that metric; the name only matters for
        # multi-metric (dict) scoring.
        refit='recall',
    )

    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_


## 4.1 Decision Tree

### 4.1.1 Model

In [None]:
# Baseline decision tree; 'balanced' weights classes inversely to their frequency
dt_base = DecisionTreeClassifier(random_state=0, class_weight='balanced')
dt_base.fit(X_train, y_train)

### 4.1.2 Hyperparameter Tuning with RandomizedSearch

In [None]:
# Search space for the decision tree.
# scipy.stats.randint(low, high) draws integers from [low, high).
param_dist_dt = {
    # Depth starts at 1: DecisionTreeClassifier rejects max_depth=0, so a
    # lower bound of 0 would waste search iterations on failed fits.
    'max_depth': randint(1, 5),
    'splitter': ['best'],
    'min_samples_split': randint(2, 5),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': randint(1, 5),
}

In [None]:
# Randomized search for the decision tree; yields the refitted best model and its parameters
best_dt, best_dt_hyperparams = tune_clf_hyperparameters_random(
    clf=dt_base,
    param_distributions=param_dist_dt,
    X_train=X_train,
    y_train=y_train,
)

In [None]:
# Show the parameter combination selected by the randomized search
print('Decision Tree Optimal Hyperparameters: \n', best_dt_hyperparams)

### 4.1.3 Decision Tree Evaluation

In [None]:
# Predictions from the tuned decision tree on both splits
y_pred_test = best_dt.predict(X_test)
y_pred_train = best_dt.predict(X_train)

# Per-class precision / recall / F1, train split first and then test
print(
    'Train Classification Report: \n',
    classification_report(y_train, y_pred_train),
)
print(
    'Test Classification Report: \n',
    classification_report(y_test, y_pred_test),
)

In [None]:
# Summarise the tuned decision tree on the train and test splits.
# NOTE(review): evaluate_model is defined outside this section; its result is
# concatenated with pd.concat further down, so presumably a DataFrame/Series — confirm.
dt_evaluation = evaluate_model(best_dt, X_train, y_train, X_test, y_test)
# Bare last expression so the notebook renders the table
dt_evaluation

In [None]:
# Confusion matrix for the decision tree's test-set predictions.
# Predictions are taken straight from best_dt rather than from the shared
# `y_pred_test` name, which every model's evaluation cell rebinds; this keeps
# the plot tied to the right model even if cells are run out of order.
cm_dt = confusion_matrix(y_test, best_dt.predict(X_test))

# NOTE(review): target_names assumes label 0 = good loan and 1 = bad loan —
# confirm against the target encoding.
plot_confusion_matrix(
    cm_dt,
    target_names=['Good Loan', 'Bad Loan'],
    title='Confusion Matrix',
    normalize=True
)

## 4.2 Random Forest

### 4.2.1 Model

In [None]:
# Baseline random forest; class 1 is weighted 5x relative to class 0
rf_base = RandomForestClassifier(random_state=0, class_weight={0: 1, 1: 5})
rf_base.fit(X_train, y_train)  # fit() returns the estimator itself, so no rebinding is needed
rf_base

### 4.2.2 Hyperparameter Tuning

In [None]:
# Search space for the random forest.
# scipy.stats.randint(low, high) draws integers from [low, high).
param_dist_rf = {
    'n_estimators': [100],
    # scikit-learn spells this criterion 'log_loss' (available from 1.1);
    # 'logloss' is not a valid value and makes those candidates fail.
    'criterion': ['gini', 'entropy', 'log_loss'],
    # Depth starts at 1: max_depth=0 is rejected by the estimator.
    'max_depth': randint(1, 5),
    'min_samples_split': randint(2, 5),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False],
}

In [None]:
# Randomized search for the random forest; yields the refitted best model and its parameters
best_rf, best_rf_hyperparams = tune_clf_hyperparameters_random(
    clf=rf_base,
    param_distributions=param_dist_rf,
    X_train=X_train,
    y_train=y_train,
)


In [None]:
# Show the parameter combination selected by the randomized search
print('RF Optimal Hyperparameters: \n', best_rf_hyperparams)

### 4.2.3 Evaluation

In [None]:
# Predictions from the tuned random forest on both splits
y_pred_test = best_rf.predict(X_test)
y_pred_train = best_rf.predict(X_train)

# Per-class precision / recall / F1, train split first and then test
print(
    'Train Classification Report: \n',
    classification_report(y_train, y_pred_train),
)
print(
    'Test Classification Report: \n',
    classification_report(y_test, y_pred_test),
)

In [None]:
# Summarise the tuned random forest on the train and test splits.
# NOTE(review): evaluate_model is defined outside this section; its result is
# concatenated with pd.concat further down, so presumably a DataFrame/Series — confirm.
rf_evaluation = evaluate_model(best_rf, X_train, y_train, X_test, y_test)
# Bare last expression so the notebook renders the table
rf_evaluation

In [None]:
# Confusion matrix for the random forest's test-set predictions.
# Predictions are taken straight from best_rf rather than from the shared
# `y_pred_test` name, which every model's evaluation cell rebinds; this keeps
# the plot tied to the right model even if cells are run out of order.
cm_rf = confusion_matrix(y_test, best_rf.predict(X_test))

# NOTE(review): target_names assumes label 0 = good loan and 1 = bad loan —
# confirm against the target encoding.
plot_confusion_matrix(
    cm_rf,
    target_names=['Good Loan', 'Bad Loan'],
    title='Confusion Matrix',
    normalize=True
)

## 4.3 XGBoost

In [None]:
# Hyperparameter tuning with RandomizedSearchCV (shuffled stratified folds).
# NOTE: this redefines the function of the same name declared earlier in the
# notebook; every call made after this cell runs uses this version
# (n_iter defaults to 20 and the folds are shuffled).

def tune_clf_hyperparameters_random(clf, param_distributions, X_train, y_train,
                                    scoring='recall', n_splits=3, n_iter=20, random_state=42):
    """Run a randomized hyperparameter search and return the refitted winner.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``RandomizedSearchCV`` as ``estimator``.
    param_distributions : dict
        Mapping of parameter name to a list of values or a distribution
        object exposing ``rvs`` (e.g. from ``scipy.stats``).
    X_train, y_train
        Training features and labels the search is fitted on.
    scoring : str, default 'recall'
        Metric used to rank the sampled parameter combinations.
    n_splits : int, default 3
        Number of stratified cross-validation folds.
    n_iter : int, default 20
        Number of parameter combinations sampled.
    random_state : int, default 42
        Seed shared by the fold shuffling and the parameter sampling.

    Returns
    -------
    tuple
        ``(best_estimator_, best_params_)`` from the fitted search.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=folds,
        scoring=scoring,
        n_jobs=-1,
        random_state=random_state,
        # The best candidate is chosen by `scoring` and refitted on all of
        # X_train. The earlier value 'roc' is not a scorer name and only
        # acted as a truthy flag, so True states the actual behaviour.
        refit=True,
    )

    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_


### 4.3.1 Model

In [None]:
import xgboost as xgb

# Base XGBoost classifier handed to the randomized search below (left unfitted here).
# NOTE(review): scale_pos_weight=4.3 presumably mirrors the class ratio — confirm.
xgb_base = xgb.XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='aucpr',
    scale_pos_weight=4.3,
)

### 4.3.2 Hyperparameter Tuning

In [None]:
# Search space for XGBoost.
# scipy.stats conventions: randint(low, high) draws integers from [low, high);
# uniform(loc, scale) draws floats from [loc, loc + scale] — the second
# argument is a width, not an upper bound.
xgb_param_dist = {
    'max_depth': randint(5, 10),            # integers 5..9
    'learning_rate': uniform(0.01, 0.99),   # [0.01, 1.0]; XGBoost documents eta in [0, 1]
    'n_estimators': randint(50, 200),       # integers 50..199
    'subsample': uniform(0.4, 0.6),         # [0.4, 1.0]; subsample must lie in (0, 1]
    'objective': ['binary:logistic'],
    'colsample_bytree': [0.3, 0.6, 1.0],
}

In [None]:
# Randomized search for XGBoost; yields the refitted best model and its parameters
best_xgb, best_xgb_hyperparams = tune_clf_hyperparameters_random(
    clf=xgb_base,
    param_distributions=xgb_param_dist,
    X_train=X_train,
    y_train=y_train,
)


In [None]:
# Show the parameter combination selected by the randomized search
print('XGB Optimal Hyperparameters: \n', best_xgb_hyperparams)

### 4.3.3 Evaluation

In [None]:
# Predictions from the tuned XGBoost model on both splits
y_pred_test = best_xgb.predict(X_test)
y_pred_train = best_xgb.predict(X_train)

# Per-class precision / recall / F1, train split first and then test
print(
    'Train Classification Report: \n',
    classification_report(y_train, y_pred_train),
)
print(
    'Test Classification Report: \n',
    classification_report(y_test, y_pred_test),
)

In [None]:
# Summarise the tuned XGBoost model on the train and test splits.
# NOTE(review): evaluate_model is defined outside this section; its result is
# concatenated with pd.concat further down, so presumably a DataFrame/Series — confirm.
xgb_evaluation = evaluate_model(best_xgb, X_train, y_train, X_test, y_test)
# Bare last expression so the notebook renders the table
xgb_evaluation

In [None]:
# Confusion matrix for the XGBoost model's test-set predictions.
# Predictions are taken straight from best_xgb rather than from the shared
# `y_pred_test` name, which every model's evaluation cell rebinds; this keeps
# the plot tied to the right model even if cells are run out of order.
cm_xgb = confusion_matrix(y_test, best_xgb.predict(X_test))

# NOTE(review): target_names assumes label 0 = good loan and 1 = bad loan —
# confirm against the target encoding.
plot_confusion_matrix(
    cm_xgb,
    target_names=['Good Loan', 'Bad Loan'],
    title='Confusion Matrix',
    normalize=True
)

## 4.4 Stacking

### 4.4.1 Model

#### Random Forest as Base Estimator

In [None]:
# Base learner(s) for the stacking ensemble, as (name, estimator) pairs.
# NOTE(review): the hyperparameters look like a pasted search result — confirm their origin.
estimators = [('rf', RandomForestClassifier(
    class_weight='balanced',
    criterion='gini',
    max_depth=1,
    min_samples_split=3,
    min_samples_leaf=4,
    bootstrap=False,
    # Seeded like every other estimator in this notebook so that the stacked
    # model's results are reproducible across runs.
    random_state=42,
))]

#### XGB as Final Estimator

In [None]:
# Meta-learner of the stacking ensemble.
# NOTE(review): the tuned values presumably come from the XGBoost randomized search above — confirm.
final_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric="auc",
    scale_pos_weight=4.3,
    use_label_encoder=False,
    random_state=42,
    n_estimators=184,
    max_depth=7,
    learning_rate=0.0564,
    subsample=0.655786,
    colsample_bytree=0.3,
)


#### Model Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

# passthrough=True feeds the original features to the final estimator
# alongside the base estimators' predictions.
stack_model = StackingClassifier(
    final_estimator=final_estimator,
    estimators=estimators,
    n_jobs=-1,
    passthrough=True,
)

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

# Only the stacked model is active in the pipeline; the scaling (RobustScaler)
# and oversampling (SMOTE) steps are left disabled, so those imports are unused here.
pipeline = Pipeline(steps=[('model', stack_model)])
pipeline.fit(X_train, y_train)  # fit() returns the pipeline itself, so no rebinding is needed
pipeline


### 4.4.2 Evaluation

In [None]:
# Predictions from the fitted stacking pipeline on both splits
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Per-class precision / recall / F1, train split first and then test
print(
    "Train Classification Report:\n",
    classification_report(y_train, y_pred_train),
)
print(
    "Test Classification Report:\n",
    classification_report(y_test, y_pred_test),
)

In [None]:
# Summarise the stacking pipeline on the train and test splits.
# NOTE(review): evaluate_model is defined outside this section; its result is
# concatenated with pd.concat further down, so presumably a DataFrame/Series — confirm.
pp_evaluation = evaluate_model(pipeline, X_train, y_train, X_test, y_test)
# Bare last expression so the notebook renders the table
pp_evaluation

In [None]:
# Confusion matrix for the stacking pipeline's test-set predictions.
# Predictions are taken straight from the pipeline rather than from the shared
# `y_pred_test` name, which every model's evaluation cell rebinds; this keeps
# the plot tied to the right model even if cells are run out of order.
cm_pp = confusion_matrix(y_test, pipeline.predict(X_test))

# NOTE(review): target_names assumes label 0 = good loan and 1 = bad loan —
# confirm against the target encoding.
plot_confusion_matrix(
    cm_pp,
    target_names=['Good Loan', 'Bad Loan'],
    title='Confusion Matrix',
    normalize=True
)

# Model Comparison

In [None]:
# Stack the per-model evaluation results; `keys` labels each model's block
# in the combined index.
# NOTE(review): despite the `_train` suffix, evaluate_model receives both the
# train and test splits — confirm what the table actually contains.
comparison_df_train = pd.concat(
    [dt_evaluation, rf_evaluation, xgb_evaluation, pp_evaluation],
    keys=["Decision Tree", "Random Forest", "XGBoost", "Stacking Random Forest & XGB"]
)

In [None]:
# Render the combined model-comparison table
comparison_df_train