In [1]:
# --- Install dependencies ---
!pip install pandas numpy scikit-learn xgboost mlflow shap

# --- Import basic libraries ---
import pandas as pd
import numpy as np


Collecting mlflow
  Downloading mlflow-3.5.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.5.1 (from mlflow)
  Downloading mlflow_skinny-3.5.1-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.5.1 (from mlflow)
  Downloading mlflow_tracing-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.5.1->mlflow)
  Downloading databricks_sdk-0.73.0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting gra

In [2]:
df = pd.read_csv('/content/emi_prediction_dataset.csv')
df.head()

  df = pd.read_csv('/content/emi_prediction_dataset.csv')


Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,existing_loans,current_emi_amount,credit_score,bank_balance,emergency_fund,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi
0,38.0,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Yes,23700.0,660.0,303200.0,70200.0,Personal Loan EMI,850000.0,15,Not_Eligible,500.0
1,38.0,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,Yes,4100.0,714.0,92500.0,26900.0,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0
2,38.0,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,No,0.0,650.0,672100.0,324200.0,Education EMI,306000.0,16,Eligible,27775.0
3,58.0,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,No,0.0,685.0,440900.0,178100.0,Vehicle EMI,304000.0,83,Eligible,16170.0
4,48.0,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,No,0.0,770.0,97300.0,28200.0,Home Appliances EMI,252000.0,7,Not_Eligible,500.0


Data cleaning

In [3]:
def clean_numeric(df, cols):
    for c in cols:
        df[c] = (
            df[c]
            .astype(str)
            .str.replace(r'[^\d.\-]', '', regex=True)
            .replace('', np.nan)
        )
        df[c] = pd.to_numeric(df[c], errors='coerce')
    return df

numeric_cols = [
    'age','monthly_salary','years_of_employment','monthly_rent',
    'school_fees','college_fees','travel_expenses','groceries_utilities',
    'other_monthly_expenses','existing_loans','current_emi_amount',
    'credit_score','bank_balance','emergency_fund','requested_amount',
    'requested_tenure','max_monthly_emi'
]

df = clean_numeric(df, [c for c in numeric_cols if c in df.columns])
df.drop_duplicates(inplace=True)
df.fillna(df.median(numeric_only=True), inplace=True)
print(df.info())


  .replace('', np.nan)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404800 entries, 0 to 404799
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   age                     404800 non-null  float64
 1   gender                  404800 non-null  object 
 2   marital_status          404800 non-null  object 
 3   education               402396 non-null  object 
 4   monthly_salary          404800 non-null  float64
 5   employment_type         404800 non-null  object 
 6   years_of_employment     404800 non-null  float64
 7   company_type            404800 non-null  object 
 8   house_type              404800 non-null  object 
 9   monthly_rent            404800 non-null  float64
 10  family_size             404800 non-null  int64  
 11  dependents              404800 non-null  int64  
 12  school_fees             404800 non-null  float64
 13  college_fees            404800 non-null  float64
 14  travel_expenses     

# **Feature Engineering **
the process of transforming raw data into features that are more suitable for machine learning models to learn from, aiming to improve model performance

In [4]:
def add_features(df):
    df['total_expenses'] = df[['monthly_rent','school_fees','college_fees',
                               'travel_expenses','groceries_utilities',
                               'other_monthly_expenses','current_emi_amount']].sum(axis=1)
    df['debt_to_income'] = df['current_emi_amount'] / (df['monthly_salary'] + 1)
    df['expense_to_income'] = df['total_expenses'] / (df['monthly_salary'] + 1)
    df['savings_ratio'] = df['emergency_fund'] / (df['monthly_salary'] + 1)
    df['affordability_ratio'] = np.clip(0.3 - df['expense_to_income'], 0, 1)
    return df

df = add_features(df)
df.head()


Unnamed: 0,age,gender,marital_status,education,monthly_salary,employment_type,years_of_employment,company_type,house_type,monthly_rent,...,emi_scenario,requested_amount,requested_tenure,emi_eligibility,max_monthly_emi,total_expenses,debt_to_income,expense_to_income,savings_ratio,affordability_ratio
0,38.0,Female,Married,Professional,82600.0,Private,0.9,Mid-size,Rented,20000.0,...,Personal Loan EMI,850000.0,15,Not_Eligible,500.0,83600.0,0.286921,1.012094,0.849869,0.0
1,38.0,Female,Married,Graduate,21500.0,Private,7.0,MNC,Family,0.0,...,E-commerce Shopping EMI,128000.0,19,Not_Eligible,700.0,19500.0,0.190689,0.906935,1.251105,0.0
2,38.0,Male,Married,Professional,86100.0,Private,5.8,Startup,Own,0.0,...,Education EMI,306000.0,16,Eligible,27775.0,35600.0,0.0,0.413468,3.765345,0.0
3,58.0,Female,Married,High School,66800.0,Private,2.2,Mid-size,Own,0.0,...,Vehicle EMI,304000.0,83,Eligible,16170.0,37400.0,0.0,0.559872,2.666128,0.0
4,48.0,Female,Married,Professional,57300.0,Private,3.4,Mid-size,Family,0.0,...,Home Appliances EMI,252000.0,7,Not_Eligible,500.0,58600.0,0.0,1.02267,0.492138,0.0


Train Classification Model (EMI Eligibility)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import numpy as np
import pandas as pd

# ---------------------------
# Step 1: Prepare target encoding
# ---------------------------
target_class = 'emi_eligibility'

# Encode target labels ('Eligible', 'High_Risk', 'Not_Eligible') → (0, 1, 2)
label_encoder = LabelEncoder()
df[target_class] = label_encoder.fit_transform(df[target_class])

# Split data
X = df.drop(columns=[target_class, 'max_monthly_emi'])
y = df[target_class]

num_cols = X.select_dtypes(include=['number']).columns
cat_cols = [c for c in X.columns if c not in num_cols]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# ---------------------------
# Step 2: Preprocessing pipeline
# ---------------------------
preproc = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# ---------------------------
# Step 3: XGBoost model
# ---------------------------
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

pipe_clf = Pipeline([
    ('preproc', preproc),
    ('clf', xgb_clf)
])

# ---------------------------
# Step 4: Train
# ---------------------------
pipe_clf.fit(X_train, y_train)

# ---------------------------
# Step 5: Evaluate
# ---------------------------
y_pred = pipe_clf.predict(X_test)

# Decode predictions back to labels for readability
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

print(classification_report(y_test_labels, y_pred_labels))


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

    Eligible       0.91      0.98      0.95     14889
   High_Risk       0.84      0.11      0.20      3497
Not_Eligible       0.97      1.00      0.98     62574

    accuracy                           0.96     80960
   macro avg       0.91      0.70      0.71     80960
weighted avg       0.95      0.96      0.94     80960



Train Regression Model (Max EMI Amount)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

target_reg = 'max_monthly_emi'
X = df.drop(columns=[target_reg, 'emi_eligibility'])
y = df[target_reg]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preproc = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

rf_reg = RandomForestRegressor(n_estimators=200, random_state=42)
pipe_reg = Pipeline([('preproc', preproc), ('reg', rf_reg)])
pipe_reg.fit(X_train, y_train)

preds = pipe_reg.predict(X_test)
print('RMSE:', np.sqrt(mean_squared_error(y_test, preds)))
print('MAE:', mean_absolute_error(y_test, preds))
print('R2:', r2_score(y_test, preds))


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Save Model

In [None]:
import joblib

# Save the classification model
joblib.dump(pipe_clf, 'emi_eligibility_classifier.joblib')
print("EMI eligibility classifier saved as 'emi_eligibility_classifier.joblib'")

# Save the regression model
joblib.dump(pipe_reg, 'max_emi_regressor.joblib')
print("Max EMI regressor saved as 'max_emi_regressor.joblib'")