In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('Loan_dataset_v1.csv')

In [4]:
df.sample(5)

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Default,...,EmploymentType_Unemployed,MaritalStatus_Married,MaritalStatus_Single,HasMortgage_Yes,HasDependents_Yes,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_Yes
29162,53,133701,48839,623,116,1,3.09,48,0.1,0,...,True,True,False,True,False,False,False,True,False,False
158482,19,90338,217191,570,30,4,11.01,12,0.63,0,...,False,True,False,True,False,False,True,False,False,False
92243,28,97318,204559,597,78,2,10.65,60,0.35,0,...,False,False,True,False,False,False,False,False,True,True
144753,67,100641,113461,764,78,2,24.08,24,0.25,0,...,False,False,False,True,True,False,False,True,False,True
127367,68,53661,34654,615,52,3,21.91,24,0.29,0,...,False,False,False,True,True,False,True,False,False,False


In [6]:
# df = df.drop('LoanID',axis=1)
df = df.drop(columns=[
    'LoanPurpose_Business',
    'LoanPurpose_Education',
    'LoanPurpose_Home',
    'LoanPurpose_Other'
],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255347 entries, 0 to 255346
Data columns (total 21 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Age                           255347 non-null  int64  
 1   Income                        255347 non-null  int64  
 2   LoanAmount                    255347 non-null  int64  
 3   CreditScore                   255347 non-null  int64  
 4   MonthsEmployed                255347 non-null  int64  
 5   NumCreditLines                255347 non-null  int64  
 6   InterestRate                  255347 non-null  float64
 7   LoanTerm                      255347 non-null  int64  
 8   DTIRatio                      255347 non-null  float64
 9   Default                       255347 non-null  int64  
 10  Education_High School         255347 non-null  bool   
 11  Education_Master's            255347 non-null  bool   
 12  Education_PhD                 255347 non-nul

In [11]:
df['Default'].value_counts()

Default
0    225694
1     29653
Name: count, dtype: int64

## Data Preparation
   Features and Target

In [12]:
X = df.drop('Default', axis=1)
y = df['Default']


## Train-Test Split

We’ll stratify to maintain class distribution:

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


## Handling Imbalance

For tree-based models (Decision Tree, Random Forest, XGBoost), we can use class weights instead of oversampling.
For Naive Bayes, we may need SMOTE or undersampling because it’s not tree-based.

### Class weights example for trees:

In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = dict(zip(np.unique(y_train), compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)))
print(class_weights)  # Example output: {0: 0.55, 1: 4.2}


{np.int64(0): np.float64(0.5656918944365983), np.int64(1): np.float64(4.30564454936346)}


### Scaling

Decision Tree, Random Forest, XGBoost: Scaling is not required.

Naive Bayes: GaussianNB expects scaled numeric values for better performance.

In [26]:
# from sklearn.metrics import classification_report

# def evaluate_model(model, X_test, y_test):
#     y_pred = model.predict(X_test)
#     print(classification_report(y_test, y_pred, target_names=['Non-Default', 'Default']))

from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, confusion_matrix

def evaluate_model(model, X_test, y_test):
    # Predict class labels
    y_pred = model.predict(X_test)
    
    # Predict probabilities for ROC and PR curves
    if hasattr(model, "predict_proba"):
        y_probs = model.predict_proba(X_test)[:, 1]  # Probability for class 1
    else:  # Some models (like SVM) may not have predict_proba
        y_probs = model.decision_function(X_test)
    
    # Classification report
    print("=== Classification Report ===")
    print(classification_report(y_test, y_pred, target_names=['Non-Default', 'Default']))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print("=== Confusion Matrix ===")
    print(cm)
    print(f"TN: {tn}, FP: {fp}, FN: {fn}, TP: {tp}")
    
    # ROC-AUC
    roc_auc = roc_auc_score(y_test, y_probs)
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    
    # PR-AUC
    precision, recall, _ = precision_recall_curve(y_test, y_probs)
    pr_auc = auc(recall, precision)
    print(f"PR-AUC Score: {pr_auc:.4f}")




### Step 1: Prepare Data for Naive Bayes

In [16]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Separate features and target
X = df.drop('Default', axis=1)
y = df['Default']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features only
num_cols = ['Age','Income','LoanAmount','CreditScore','MonthsEmployed','NumCreditLines','InterestRate','LoanTerm','DTIRatio']
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# SMOTE to balance classes
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_train_res.value_counts())


Before SMOTE: Default
0    180555
1     23722
Name: count, dtype: int64
After SMOTE: Default
0    180555
1    180555
Name: count, dtype: int64


In [23]:
# Initialize model
nb_model = GaussianNB()

# Train model
nb_model.fit(X_train_res, y_train_res)


0,1,2
,"priors  priors: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",
,"var_smoothing  var_smoothing: float, default=1e-9 Portion of the largest variance of all features that is added to variances for calculation stability. .. versionadded:: 0.20",1e-09


In [27]:
# Evaluate
# y_pred = nb_model.predict(X_test)
# print(classification_report(y_test, y_pred, target_names=['Non-Default', 'Default']))

# Use the evaluate_model function we created
evaluate_model(nb_model, X_test, y_test)


=== Classification Report ===
              precision    recall  f1-score   support

 Non-Default       0.94      0.69      0.79     45139
     Default       0.22      0.65      0.32      5931

    accuracy                           0.68     51070
   macro avg       0.58      0.67      0.56     51070
weighted avg       0.85      0.68      0.74     51070

=== Confusion Matrix ===
[[31061 14078]
 [ 2055  3876]]
TN: 31061, FP: 14078, FN: 2055, TP: 3876
ROC-AUC Score: 0.7314
PR-AUC Score: 0.2766


In [29]:
from sklearn.model_selection import train_test_split

# Features and target
X = df.drop('Default', axis=1)
y = df['Default']

# Train-test split (stratified to keep class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set class distribution:")
print(y_train.value_counts())
print("Test set class distribution:")
print(y_test.value_counts())


Training set class distribution:
Default
0    180555
1     23722
Name: count, dtype: int64
Test set class distribution:
Default
0    45139
1     5931
Name: count, dtype: int64


In [30]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree with class weights
dt_model = DecisionTreeClassifier(
    max_depth=None,      # Let tree grow fully (you can tune this later)
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',
    random_state=42
)


In [31]:
# Train the Decision Tree
dt_model.fit(X_train, y_train)


0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",10
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [32]:
evaluate_model(dt_model, X_test, y_test)


=== Classification Report ===
              precision    recall  f1-score   support

 Non-Default       0.91      0.78      0.84     45139
     Default       0.19      0.39      0.25      5931

    accuracy                           0.73     51070
   macro avg       0.55      0.59      0.55     51070
weighted avg       0.82      0.73      0.77     51070

=== Confusion Matrix ===
[[35042 10097]
 [ 3596  2335]]
TN: 35042, FP: 10097, FN: 3596, TP: 2335
ROC-AUC Score: 0.5933
PR-AUC Score: 0.2146


## HyperParameter Tuning

In [33]:
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [10, 50, 100],
    'min_samples_leaf': [5, 10, 20],
    'criterion': ['gini', 'entropy']
}

grid = GridSearchCV(
    estimator=DecisionTreeClassifier(class_weight='balanced', random_state=42),
    param_grid=params,
    scoring='average_precision',  # PR-AUC
    cv=3,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best Params:", grid.best_params_)


Best Params: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 20, 'min_samples_split': 100}


In [34]:
# Use the best estimator from GridSearch
best_dt_model = grid.best_estimator_

# Evaluate on test set
evaluate_model(best_dt_model, X_test, y_test)


=== Classification Report ===
              precision    recall  f1-score   support

 Non-Default       0.94      0.66      0.78     45139
     Default       0.20      0.66      0.31      5931

    accuracy                           0.66     51070
   macro avg       0.57      0.66      0.54     51070
weighted avg       0.85      0.66      0.72     51070

=== Confusion Matrix ===
[[29898 15241]
 [ 2023  3908]]
TN: 29898, FP: 15241, FN: 2023, TP: 3908
ROC-AUC Score: 0.7123
PR-AUC Score: 0.2707


## Random Forest

In [35]:
from sklearn.model_selection import train_test_split

# Features and target
X = df.drop('Default', axis=1)
y = df['Default']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set class distribution:")
print(y_train.value_counts())


Training set class distribution:
Default
0    180555
1     23722
Name: count, dtype: int64


In [36]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,         # number of trees
    max_depth=None,           # let trees grow fully, can tune later
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight='balanced',  # handle class imbalance
    random_state=42,
    n_jobs=-1                 # use all cores
)


In [37]:
rf_model.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",10
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",5
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [38]:
evaluate_model(rf_model, X_test, y_test)


=== Classification Report ===
              precision    recall  f1-score   support

 Non-Default       0.90      0.96      0.93     45139
     Default       0.40      0.23      0.29      5931

    accuracy                           0.87     51070
   macro avg       0.65      0.59      0.61     51070
weighted avg       0.85      0.87      0.86     51070

=== Confusion Matrix ===
[[43152  1987]
 [ 4579  1352]]
TN: 43152, FP: 1987, FN: 4579, TP: 1352
ROC-AUC Score: 0.7491
PR-AUC Score: 0.3180


## HyperParameter Tuning:

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [10, 50, 100],
    'min_samples_leaf': [5, 20, 50],
    'criterion': ['gini', 'entropy']
}

grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1),
    param_grid=param_grid,
    scoring='average_precision',  # PR-AUC
    cv=3,
    n_jobs=-1
)

grid_rf.fit(X_train, y_train)
print("Best Params:", grid_rf.best_params_)
