In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib


In [2]:
# Read the training data; the path is relative to the notebook's working directory.
train_csv = "dataset/train.csv"
dataset = pd.read_csv(train_csv)

# Report (rows, columns), then preview the first five records as the cell output.
print("Dataset shape:", dataset.shape)
dataset.head(5)


Dataset shape: (593994, 13)


Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.1,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.2,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.1,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.7,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0


In [3]:
# Number of missing values in each column.
missing_per_column = dataset.isna().sum()
print(missing_per_column)

# Share of each label value (fractions of all rows, not raw counts).
print("\nTarget distribution:")
target_share = dataset['loan_paid_back'].value_counts(normalize=True)
print(target_share)


id                      0
annual_income           0
debt_to_income_ratio    0
credit_score            0
loan_amount             0
interest_rate           0
gender                  0
marital_status          0
education_level         0
employment_status       0
loan_purpose            0
grade_subgrade          0
loan_paid_back          0
dtype: int64

Target distribution:
loan_paid_back
1.0    0.79882
0.0    0.20118
Name: proportion, dtype: float64


In [4]:
# Remove the `id` column so it does not end up in the feature matrix built from
# this frame. errors='ignore' makes the cell safe to re-run once the column is gone.
dataset = dataset.drop(['id'], axis=1, errors='ignore')


In [5]:
TARGET = 'loan_paid_back'

# Separate label and features. `drop` returns a new frame, so `dataset` itself
# keeps its original (string) categorical values after this cell.
y = dataset[TARGET]
X = dataset.drop(columns=[TARGET])

# Every column that is not int64/float64 is treated as categorical.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = [name for name in X.columns if name not in num_cols]

# Replace each categorical column with integer codes.
# NOTE: a single LabelEncoder is refit for every column, so once this statement
# finishes `le` only remembers the classes of the last column in `cat_cols`.
le = LabelEncoder()
X = X.assign(**{name: le.fit_transform(X[name].astype(str)) for name in cat_cols})

print("Categorical features encoded:", len(cat_cols))


Categorical features encoded: 6


In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the label ratio
# the same in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape)
print("Test :", X_test.shape)


Train: (475195, 11)
Test : (118799, 11)


In [7]:
# Standardise every feature column. The scaler is fit on the training split
# only and then applied unchanged to the test split, so no test-set statistics
# leak into the transformation.
scaler = StandardScaler()

# StandardScaler returns a bare ndarray; wrap it back into a DataFrame so the
# column names survive. The original row labels are passed as `index` as well:
# without them the scaled frames would get a fresh 0..n-1 index and would no
# longer align by label with y_train / y_test, which keep the labels assigned
# by train_test_split.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)


In [8]:
# Candidate classifiers, keyed by the display name that the parameter grids and
# the training loop use to look them up.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, class_weight='balanced')
models["Random Forest"] = RandomForestClassifier(random_state=42)
models["Gradient Boosting"] = GradientBoostingClassifier(random_state=42)


In [9]:
# Hyper-parameter search space for each candidate; keys must match `models`.
param_grids = {
    # Inverse regularisation strength.
    "Logistic Regression": dict(
        C=[0.01, 0.1, 1, 10],
    ),
    # 2 x 3 x 2 = 12 combinations.
    "Random Forest": dict(
        n_estimators=[100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
    ),
    # 2 x 2 x 2 = 8 combinations.
    "Gradient Boosting": dict(
        n_estimators=[100, 200],
        learning_rate=[0.05, 0.1],
        max_depth=[3, 5],
    ),
}


In [10]:
# Tuned estimator per model name, plus (name, test accuracy) pairs that feed
# the summary table.
best_models = {}
results = []

# One shuffled, stratified 5-fold splitter shared by every search so all
# candidates are compared on identical folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, candidate in models.items():
    print(f"\nTraining {name}...")

    # Exhaustive search over this model's grid, scored by cross-validated
    # accuracy and run on all available cores.
    search = GridSearchCV(
        candidate,
        param_grids[name],
        scoring='accuracy',
        n_jobs=-1,
        cv=cv,
    )
    search.fit(X_train_scaled, y_train)

    # With GridSearchCV's default refit, best_estimator_ is the winning
    # configuration retrained on the whole training split.
    tuned = search.best_estimator_
    best_models[name] = tuned
    print("Best Parameters:", search.best_params_)

    # Accuracy on the held-out test split.
    test_accuracy = accuracy_score(y_test, tuned.predict(X_test_scaled))
    results.append((name, test_accuracy))
    print("Test Accuracy:", test_accuracy)



Training Logistic Regression...
Best Parameters: {'C': 0.01}
Test Accuracy: 0.8299480635358883

Training Random Forest...
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}
Test Accuracy: 0.9027853769812877

Training Gradient Boosting...
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Test Accuracy: 0.9060345625804931


In [11]:
# One row per model: (name, test accuracy) as collected by the training loop.
summary_columns = ['Model', 'Accuracy']
results_df = pd.DataFrame(data=results, columns=summary_columns)

# Show the leaderboard, best first. `results_df` itself stays in insertion order.
results_df.sort_values('Accuracy', ascending=False)


Unnamed: 0,Model,Accuracy
2,Gradient Boosting,0.906035
1,Random Forest,0.902785
0,Logistic Regression,0.829948


In [12]:
# Rank the models by their recorded accuracy and take the top one.
ranked = results_df.sort_values(by='Accuracy', ascending=False)
best_model_name = ranked['Model'].iloc[0]

# Look up the tuned estimator that was stored under that name.
best_model = best_models[best_model_name]

print("BEST MODEL:", best_model_name)


BEST MODEL: Gradient Boosting


In [13]:
# Predict once per split and reuse the predictions for every metric below.
train_pred = best_model.predict(X_train_scaled)
test_pred = best_model.predict(X_test_scaled)

# Comparing train and test accuracy gives a quick read on over-fitting.
train_acc = accuracy_score(y_train, train_pred)
test_acc = accuracy_score(y_test, test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy : {test_acc:.4f}")

# Per-class precision / recall / F1 on the test split.
print("\nClassification Report:")
print(classification_report(y_test, test_pred))


Train Accuracy: 0.9089
Test Accuracy : 0.9060

Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.61      0.72     23900
         1.0       0.91      0.98      0.94     94899

    accuracy                           0.91    118799
   macro avg       0.90      0.79      0.83    118799
weighted avg       0.90      0.91      0.90    118799



In [17]:
# Build one fitted LabelEncoder per categorical feature so the string -> code
# mapping used for training can be replayed on raw inputs at inference time.
#
# The encoders are fit on the RAW string columns of `dataset`, not on `X`:
# by this point `X` already holds integer codes, so fitting on it would yield
# encoders that only know integers and cannot transform values such as
# "Female" or "C3". Fitting on `dataset[col].astype(str)` reproduces exactly
# the mapping applied to `X` earlier, because LabelEncoder always assigns codes
# in sorted order of the unique values it sees.
#
# `X` is deliberately left untouched here; it is already encoded.
# (LabelEncoder is imported in the notebook's first cell.)
encoders = {}

# Every column that was label-encoded for training, including grade_subgrade.
categorical_cols = ['gender', 'education_level', 'marital_status',
                    'employment_status', 'loan_purpose', 'grade_subgrade']

for col in categorical_cols:
    le = LabelEncoder()
    le.fit(dataset[col].astype(str))
    encoders[col] = le


In [18]:
# Everything needed to reproduce predictions outside this notebook, keyed by
# the file it is written to (relative to the working directory).
artifacts = {
    "best_loan_model.pkl": best_model,             # tuned winning estimator
    "scaler.pkl": scaler,                          # fitted StandardScaler
    "feature_columns.pkl": X.columns.tolist(),     # feature names, in training order
    "scaled_dataset.pkl": X_train_scaled,          # scaled training features
    "encoders.pkl": encoders,                      # per-column label encoders
}

for filename, payload in artifacts.items():
    joblib.dump(payload, filename)

print("All artifacts saved successfully!")


All artifacts saved successfully!


In [None]:
# Example: score the first 5 rows of the held-out test set end to end
# (raw encoded features -> scaler -> model).
new_samples = X_test.iloc[:5]

# scaler.transform returns a bare ndarray. The model was fitted on a DataFrame
# with column names, so wrap the scaled values back into a DataFrame with the
# same columns; otherwise scikit-learn warns that X has no valid feature names
# and cannot check that the columns arrive in the training order.
new_samples_scaled = pd.DataFrame(
    scaler.transform(new_samples),
    columns=new_samples.columns,
    index=new_samples.index,
)

predictions = best_model.predict(new_samples_scaled)
# Column 1 of predict_proba belongs to the second entry of best_model.classes_,
# i.e. the probability of loan_paid_back == 1.0 for the 0.0 / 1.0 target.
probabilities = best_model.predict_proba(new_samples_scaled)[:, 1]

baseline_df = pd.DataFrame({
    "Prediction": predictions,
    "Probability": probabilities
})

baseline_df
