In [115]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [116]:
df = pd.read_csv("Dataset_A_loan.csv")
df.head()
df.describe()

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,45000.0,42750.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0,45000.0
mean,27.764178,80182.23,5.410333,9583.157556,11.006606,0.139725,5.867489,632.608756,0.222222
std,6.045108,73311.58,6.063532,6314.886691,2.978808,0.087212,3.879702,50.435865,0.415744
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47267.25,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67055.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95821.25,8.0,12237.25,12.99,0.19,8.0,670.0,0.0
max,144.0,5556399.0,125.0,35000.0,20.0,0.66,30.0,850.0,1.0


In [117]:
df.dtypes

person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                         float64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

In [118]:
categorical_cols = df.select_dtypes(include=['object']).columns

for col in categorical_cols:
    unique_vals = df[col].dropna().unique()
    print(f"{col}: {unique_vals}")

person_gender: ['female' 'male' 'Male' 'fe male']
person_education: ['Master' 'High School' 'Bachelor' 'Associate' 'Doctorate']
person_home_ownership: ['RENT' 'OWN' 'MORTGAGE' 'OTHER']
loan_intent: ['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']
previous_loan_defaults_on_file: ['No' 'Yes']


In [None]:
gender_mapping = {
    'female': 'Female',
    'fe male': 'Female',
    'male': 'Male',
    'Male': 'Male'
}

education_mapping = {
    'master': 'Master',
    'high school': 'HighSchool',
    'bachelor': 'Bachelor',
    'associate': 'Associate',
    'doctorate': 'Doctorate'
}

home_ownership_mapping = {
    'RENT': 'Rent',
    'OWN': 'Own',
    'MORTGAGE': 'Mortgage',
    'OTHER': 'Other'
}

loan_intent_mapping = {
    'PERSONAL': 'Personal',
    'EDUCATION': 'Education',
    'MEDICAL': 'Medical',
    'VENTURE': 'Venture',
    'HOMEIMPROVEMENT': 'Home Improvement',
    'DEBTCONSOLIDATION': 'Debt Consolidation'
}

loan_default_mapping = {
    'No': 'No',
    'Yes': 'Yes'
}

df['person_gender'] = df['person_gender'].str.lower().map(gender_mapping)

df['person_education'] = df['person_education'].str.lower().map(education_mapping)

df['person_home_ownership'] = df['person_home_ownership'].map(home_ownership_mapping)

df['loan_intent'] = df['loan_intent'].map(loan_intent_mapping)

df['previous_loan_defaults_on_file'] = df['previous_loan_defaults_on_file'].map(loan_default_mapping)


In [120]:
for col in categorical_cols:
    unique_vals = df[col].dropna().unique()
    print(f"{col}: {unique_vals}")

person_gender: ['Female' 'Male']
person_education: ['Master' 'HighSchool' 'Bachelor' 'Associate' 'Doctorate']
person_home_ownership: ['Rent' 'Own' 'Mortgage' 'Other']
loan_intent: ['Personal' 'Education' 'Medical' 'Venture' 'HomeImprovement'
 'DebtConsolidation']
previous_loan_defaults_on_file: ['No' 'Yes']


In [121]:
x = df.drop('loan_status', axis=1)
y = df['loan_status']

In [122]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

for col in categorical_cols:
    le = LabelEncoder()
    x[col] = le.fit_transform(x[col])

In [123]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, train_size=0.7)

Splitting the data before preprocessing to help keep consistency

In [124]:
print(y_train.isnull().sum())
print(y_test.isnull().sum())

0
0


In [125]:
numeric_cols = x_train.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    if x_train[col].isnull().sum() > 0:
        x_train[col] = x_train[col].fillna(x_train[col].median())
    if x_test[col].isnull().sum() > 0:
        x_test[col] = x_test[col].fillna(x_test[col].median())


for col in categorical_cols:
    if x_train[col].isnull().sum() > 0:
        x_train[col] = x_train[col].fillna(x_train[col].mode()[0])
    if x_test[col].isnull().sum() > 0:
        x_test[col] = x_test[col].fillna(x_test[col].mode()[0])


In [126]:
numeric_features = ['person_age', 'person_income', 'person_emp_exp', 
                    'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                    'cb_person_cred_hist_length', 'credit_score']

scaler = StandardScaler()
x_train[numeric_features] = scaler.fit_transform(x_train[numeric_features])
x_test[numeric_features] = scaler.fit_transform(x_test[numeric_features])

In [127]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=0)
rf_model.fit(x_train, y_train)

In [128]:
y_pred_rf = rf_model.predict(x_test)
y_prob_rf = rf_model.predict_proba(x_test)[:, 1]

In [129]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob_rf):.4f}")
print(classification_report(y_test, y_pred_rf))


Random Forest Results:
Accuracy: 0.9087
AUC-ROC: 0.9614
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10534
           1       0.80      0.78      0.79      2967

    accuracy                           0.91     13501
   macro avg       0.87      0.86      0.87     13501
weighted avg       0.91      0.91      0.91     13501



In [130]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(x_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [131]:
y_pred_xgb = xgb_model.predict(x_test)
y_prob_xgb = xgb_model.predict_proba(x_test)[:, 1]

In [132]:
print("\nXGBoost Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob_xgb):.4f}")
print(classification_report(y_test, y_pred_xgb))


XGBoost Results:
Accuracy: 0.9107
AUC-ROC: 0.9612
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     10534
           1       0.80      0.79      0.80      2967

    accuracy                           0.91     13501
   macro avg       0.87      0.87      0.87     13501
weighted avg       0.91      0.91      0.91     13501



In [133]:
import pickle

with open('xgboost_model.pkl', 'wb') as model_file:
  pickle.dump(xgb_model, model_file)