In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, classification_report, confusion_matrix

In [3]:

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:

train_data_path = '/content/drive/My Drive/DSW_ML_Assignment-Dec24/train_data.xlsx'
test_data_path = '/content/drive/My Drive/DSW_ML_Assignment-Dec24/test_data.xlsx'

In [5]:
train_data = pd.read_excel(train_data_path)
test_data = pd.read_excel(test_data_path)

In [6]:

for data in [train_data, test_data]:
    data['transaction_year'] = data['transaction_date'].dt.year
    data['transaction_month'] = data['transaction_date'].dt.month

In [7]:

categorical_cols = ['sub_grade', 'term', 'home_ownership', 'purpose',
                    'application_type', 'verification_status']

In [8]:
train_data_encoded = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
test_data_encoded = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

In [9]:

missing_cols = set(train_data_encoded.columns) - set(test_data_encoded.columns)
for col in missing_cols:
    test_data_encoded[col] = 0

missing_cols = set(test_data_encoded.columns) - set(train_data_encoded.columns)
for col in missing_cols:
    train_data_encoded[col] = 0

In [10]:

train_data_encoded = train_data_encoded.reindex(sorted(train_data_encoded.columns), axis=1)
test_data_encoded = test_data_encoded.reindex(sorted(test_data_encoded.columns), axis=1)

In [11]:

feature_cols = [col for col in train_data_encoded.columns if col not in ['loan_status', 'customer_id', 'transaction_date']]
X_train = train_data_encoded[feature_cols]
y_train = train_data_encoded['loan_status']
X_test = test_data_encoded[feature_cols]
y_test = test_data_encoded['loan_status']

In [12]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_dist = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 6],
    'subsample': [0.8],
    'colsample_bytree': [0.8]
}


random_search = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
    param_distributions=param_dist,
    scoring='f1',
    cv=2,
    n_iter=5,
    verbose=2
)


random_search.fit(X_train_scaled, y_train)


best_xgb_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)


xgb_predictions = best_xgb_model.predict(X_test_scaled)
xgb_probabilities


Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=100, subsample=0.8; total time=   1.1s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, subsample=0.8; total time=   1.8s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=6, n_estimators=100, subsample=0.8; total time=   4.3s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=6, n_estimators=150, subsample=0.8; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=6, n_estimators=150, subsample=0.8; total time=   2.0s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=150, subsample=0.8; total time=   1.4s
[CV] END colsample_bytree=0.8, learning_rate=0.05, max_depth=3, n_estimators=150, subsample=0.8; total time=   1.4s
[CV] END colsa

array([0.91413784, 0.8769307 , 0.5842513 , ..., 0.30043003, 0.77617514,
       0.5215945 ], dtype=float32)

In [26]:

xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=6, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)

Parameters: { "use_label_encoder" } are not used.



In [27]:

xgb_predictions = xgb_model.predict(X_test_scaled)
xgb_probabilities = xgb_model.predict_proba(X_test_scaled)[:, 1]

In [28]:

xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_roc_auc = roc_auc_score(y_test, xgb_probabilities)
xgb_f1 = f1_score(y_test, xgb_predictions)

In [29]:
print("\n--- XGBoost Model Evaluation ---")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"ROC-AUC: {xgb_roc_auc:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, xgb_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, xgb_predictions))


--- XGBoost Model Evaluation ---
Accuracy: 0.6805
ROC-AUC: 0.6958
F1-Score: 0.7761

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.35      0.44      3055
           1       0.70      0.87      0.78      5400

    accuracy                           0.68      8455
   macro avg       0.65      0.61      0.61      8455
weighted avg       0.67      0.68      0.66      8455


Confusion Matrix:
[[1073 1982]
 [ 719 4681]]


In [30]:

logistic_model = LogisticRegression(max_iter=500, random_state=42)
logistic_model.fit(X_train_scaled, y_train)


logistic_predictions = logistic_model.predict(X_test_scaled)
logistic_probabilities = logistic_model.predict_proba(X_test_scaled)[:, 1]


logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_roc_auc = roc_auc_score(y_test, logistic_probabilities)
logistic_f1 = f1_score(y_test, logistic_predictions)

print("\n--- Logistic Regression Model Evaluation ---")
print(f"Accuracy: {logistic_accuracy:.4f}")
print(f"ROC-AUC: {logistic_roc_auc:.4f}")
print(f"F1-Score: {logistic_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, logistic_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, logistic_predictions))


--- Logistic Regression Model Evaluation ---
Accuracy: 0.6744
ROC-AUC: 0.6933
F1-Score: 0.7756

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.31      0.41      3055
           1       0.69      0.88      0.78      5400

    accuracy                           0.67      8455
   macro avg       0.64      0.60      0.59      8455
weighted avg       0.66      0.67      0.64      8455


Confusion Matrix:
[[ 945 2110]
 [ 643 4757]]


In [31]:

rf_model = RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42)
rf_model.fit(X_train_scaled, y_train)


rf_predictions = rf_model.predict(X_test_scaled)
rf_probabilities = rf_model.predict_proba(X_test_scaled)[:, 1]


rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_roc_auc = roc_auc_score(y_test, rf_probabilities)
rf_f1 = f1_score(y_test, rf_predictions)

print("\n--- Random Forest Model Evaluation ---")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"ROC-AUC: {rf_roc_auc:.4f}")
print(f"F1-Score: {rf_f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, rf_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, rf_predictions))


--- Random Forest Model Evaluation ---
Accuracy: 0.6778
ROC-AUC: 0.6996
F1-Score: 0.7867

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.23      0.34      3055
           1       0.68      0.93      0.79      5400

    accuracy                           0.68      8455
   macro avg       0.67      0.58      0.56      8455
weighted avg       0.67      0.68      0.63      8455


Confusion Matrix:
[[ 708 2347]
 [ 377 5023]]


In [32]:

logistic_model_l2 = LogisticRegression(penalty='l2', max_iter=500, random_state=42)
logistic_model_l2.fit(X_train_scaled, y_train)


logistic_predictions_l2 = logistic_model_l2.predict(X_test_scaled)
logistic_probabilities_l2 = logistic_model_l2.predict_proba(X_test_scaled)[:, 1]

logistic_accuracy_l2 = accuracy_score(y_test, logistic_predictions_l2)
logistic_roc_auc_l2 = roc_auc_score(y_test, logistic_probabilities_l2)
logistic_f1_l2 = f1_score(y_test, logistic_predictions_l2)

print("\n--- Logistic Regression with L2 Regularization Model Evaluation ---")
print(f"Accuracy: {logistic_accuracy_l2:.4f}")
print(f"ROC-AUC: {logistic_roc_auc_l2:.4f}")
print(f"F1-Score: {logistic_f1_l2:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, logistic_predictions_l2))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, logistic_predictions_l2))



--- Logistic Regression with L2 Regularization Model Evaluation ---
Accuracy: 0.6744
ROC-AUC: 0.6933
F1-Score: 0.7756

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.31      0.41      3055
           1       0.69      0.88      0.78      5400

    accuracy                           0.67      8455
   macro avg       0.64      0.60      0.59      8455
weighted avg       0.66      0.67      0.64      8455


Confusion Matrix:
[[ 945 2110]
 [ 643 4757]]


XGBoost outperformed other models on the given dataset due to several key advantages. It effectively handles class imbalance using the scale_pos_weight parameter, ensuring accurate predictions for the minority class. XGBoost also captures complex, non-linear relationships between features, such as loan amount and CIBIL score, which models like Logistic Regression cannot. It provides insights into feature importance, highlighting factors like annual income and loan amount that drive defaults. Additionally, it handles missing values automatically, reduces preprocessing, and includes L2 regularization to prevent overfitting. Optimized for speed and scalability, XGBoost is well-suited for large datasets and offers extensive hyperparameter tuning, enhancing its accuracy and performance.