In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load data
df = pd.read_csv("train.csv")

In [3]:
num_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

In [4]:
df[num_cols].skew()

annual_income           1.719509
debt_to_income_ratio    1.406680
credit_score           -0.166993
loan_amount             0.207360
interest_rate           0.049945
dtype: float64

In [5]:
# Apply log transformation to reduce skewness
#these columns are right skewed
df['annual_income_log'] = np.log1p(df['annual_income'])
df['dti_log'] = np.log1p(df['debt_to_income_ratio'])

## Feature ENgineering

In [6]:
df['income_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
df['payment_burden'] = (df['loan_amount'] * df['interest_rate']) / (df['annual_income'] + 1)


In [7]:
def credit_bucket(x):
    if x >= 750: return "Excellent"
    elif x >= 700: return "Good"
    elif x >= 650: return "Fair"
    else: return "Poor"

df['credit_bucket'] = df['credit_score'].apply(credit_bucket)


In [8]:
df['credit_score_sq'] = df['credit_score'] ** 2


### 4. Categorical Feature Encoding

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593994 entries, 0 to 593993
Data columns (total 23 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    593994 non-null  int64  
 1   annual_income         593994 non-null  float64
 2   debt_to_income_ratio  593994 non-null  float64
 3   credit_score          593994 non-null  int64  
 4   loan_amount           593994 non-null  float64
 5   interest_rate         593994 non-null  float64
 6   gender                593994 non-null  object 
 7   marital_status        593994 non-null  object 
 8   education_level       593994 non-null  object 
 9   employment_status     593994 non-null  object 
 10  loan_purpose          593994 non-null  object 
 11  grade_subgrade        593994 non-null  object 
 12  loan_paid_back        593994 non-null  float64
 13  annual_income_log     593994 non-null  float64
 14  dti_log               593994 non-null  float64
 15  

In [30]:
df['loan_paid_back'].value_counts()     

loan_paid_back
1.0    474494
0.0    119500
Name: count, dtype: int64

In [None]:
# Convert letter-grade to numeric ranking
df['grade'] = df['grade_subgrade'].str[0]
df['subgrade_num'] = df['grade_subgrade'].str[1:].astype(int)

In [31]:
from sklearn.model_selection import KFold

# 1. Ordinal Encoding
df['education_rank'] = df['education_level'].map({
    "PhD":4, "Master's":3, "Bachelor's":2, "High School":1, "Other":0
})

df['credit_bucket_rank'] = df['credit_bucket'].map({
    "Excellent":4, "Good":3, "Fair":2, "Poor":1
})

# 2. Target Encoding function
def target_encode(train, col, target='loan_paid_back'):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    global_mean = train[target].mean()
    te = pd.Series(index=train.index, dtype=float)

    for tr_idx, val_idx in kf.split(train):
        train_fold = train.iloc[tr_idx]
        val_fold = train.iloc[val_idx]
        mapping = train_fold.groupby(col)[target].mean()
        te.iloc[val_idx] = val_fold[col].map(mapping).fillna(global_mean)
    return te

# Target encoding
df['grade_te'] = target_encode(df, 'grade')
df['loan_purpose_te'] = target_encode(df, 'loan_purpose')
df['employment_status_te'] = target_encode(df, 'employment_status')

# 3. One-hot encoding
df = pd.get_dummies(df, columns=['gender','marital_status'], drop_first=True)


In [34]:
df.head()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,grade_subgrade,loan_paid_back,annual_income_log,dti_log,...,education_rank,credit_bucket_rank,grade_te,loan_purpose_te,employment_status_te,gender_Male,gender_Other,marital_status_Married,marital_status_Single,marital_status_Widowed
0,0,29367.99,0.084,736,2528.42,13.67,C3,1.0,10.287695,0.080658,...,1,3,0.847562,0.802012,0.899948,False,False,False,True,False
1,1,22108.02,0.166,636,4593.1,12.92,D3,0.0,10.003741,0.153579,...,3,1,0.715457,0.796389,0.894201,True,False,True,False,False
2,2,49566.2,0.097,694,17005.15,9.76,C5,1.0,10.811085,0.092579,...,1,2,0.847562,0.797286,0.894132,True,False,False,True,False
3,3,46858.25,0.065,533,4682.48,16.1,F1,1.0,10.754904,0.062975,...,1,1,0.626994,0.797795,0.894269,False,False,False,True,False
4,4,25496.7,0.053,665,12184.43,10.21,D1,1.0,10.146344,0.051643,...,1,2,0.715457,0.802117,0.894201,True,False,True,False,False


In [33]:
# Drop original categorical columns
df.drop(columns=['loan_purpose','employment_status','grade','education_level','credit_bucket'], inplace=True)

In [35]:
# ---------------------------
# 5) Additional engineered numeric features (as previously suggested)
# ---------------------------
df['income_loan_ratio'] = df['annual_income'] / (df['loan_amount'] + 1)
df['payment_burden'] = (df['loan_amount'] * df['interest_rate']) / (df['annual_income'] + 1)
df['credit_x_dti'] = df['credit_score'] * df['debt_to_income_ratio']
df['loan_x_rate'] = df['loan_amount'] * df['interest_rate']

engineered_numeric = ['income_loan_ratio', 'payment_burden', 'credit_x_dti', 'loan_x_rate']

In [37]:
num_cols_extended=['annual_income_log', 'dti_log', 'credit_score', 'loan_amount', 'interest_rate']

In [38]:
num_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']

In [49]:
# full_training_pipeline.py
import os
import random
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import lightgbm as lgb
import xgboost as xgb
# import shap
import matplotlib.pyplot as plt
import seaborn as sns

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [48]:
# !python -m pip install shap

In [41]:
# ---------------------------
# 6) Feature list for modeling
# ---------------------------
features = (num_cols_extended +
            ['education_rank', 'credit_bucket_rank', 'grade_rank'] +
            [c for c in df.columns if c.endswith('_te')] +   # all TE features
            ['gender_Male','gender_Other'] if 'gender_Male' in df.columns else []  # include dummies if exist
           )
# Add marital dummies if present
marital_cols = [c for c in df.columns if c.startswith('marital_status_')]
features += marital_cols

In [43]:
engineered_numeric = ['income_loan_ratio', 'payment_burden', 'credit_x_dti', 'loan_x_rate']

In [44]:
features += engineered_numeric

In [51]:
# Impute numeric missing values if any (median)
num_imputer = SimpleImputer(strategy='median')
df[num_cols_extended] = num_imputer.fit_transform(df[num_cols_extended])
# Impute any produced NaN/inf
df[engineered_numeric] = df[engineered_numeric].replace([np.inf, -np.inf], np.nan)
# df[engineered_numeric] = num_imputer.transform(df[engineered_numeric])

In [52]:
# Ensure unique
features = [f for i,f in enumerate(features) if f not in features[:i]]
print("Number of features:", len(features))

# If any features missing (dummy variable absence), build from available list
features = [f for f in features if f in df.columns]
print("Final features used:", features)

# Standard scaling numeric features for XGBoost (optional for LightGBM not necessary but okay)
scaler = StandardScaler()
scale_cols = [c for c in features if c in num_cols_extended + engineered_numeric + ['education_rank','credit_bucket_rank','grade_rank']]
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# Save imputers/scaler for production
joblib.dump(num_imputer, "num_imputer.joblib")
joblib.dump(scaler, "scaler.joblib")

Number of features: 20
Final features used: ['annual_income_log', 'dti_log', 'credit_score', 'loan_amount', 'interest_rate', 'education_rank', 'credit_bucket_rank', 'grade_te', 'loan_purpose_te', 'employment_status_te', 'gender_Male', 'gender_Other', 'marital_status_Married', 'marital_status_Single', 'marital_status_Widowed', 'income_loan_ratio', 'payment_burden', 'credit_x_dti', 'loan_x_rate']


['scaler.joblib']

In [53]:
TARGET = "loan_paid_back"
ID_COL = "id"

In [None]:
# ---------------------------
# 7) Cross-validated training and evaluation function
# ---------------------------
def cv_train_models(data, features, target, n_splits=5, seed=RANDOM_STATE):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_preds_lgb = np.zeros(data.shape[0])
    oof_preds_xgb = np.zeros(data.shape[0])
    metrics = {'lgb': [], 'xgb': []}
    fold = 0

    for train_idx, val_idx in skf.split(data, data[target]):
        fold += 1
        print(f"\n--- Fold {fold} ---")
        X_tr, X_val = data.iloc[train_idx][features], data.iloc[val_idx][features]
        y_tr, y_val = data.iloc[train_idx][target], data.iloc[val_idx][target]

        # LightGBM dataset
        lgb_train = lgb.Dataset(X_tr, label=y_tr)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

        lgb_params = {
            "objective": "binary",
            "metric": "auc",
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "num_leaves": 31,
            "max_depth": -1,
            "seed": RANDOM_STATE,
            "verbosity": -1,
            "n_jobs": -1,
        }

        print("Training LightGBM...")
        lgb_model = lgb.train(
            params=lgb_params,
            train_set=lgb_train,
            num_boost_round=5000,
            valid_sets=[lgb_train, lgb_val],
            callbacks=[lgb.log_evaluation(period=200)]
)

        lgb_pred = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
        oof_preds_lgb[val_idx] = lgb_pred

        # XGBoost
        xgb_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'eta': 0.05,
            'max_depth': 6,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'seed': RANDOM_STATE,
            'verbosity': 0,
        }

        print("Training XGBoost...")
        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        xgb_model = xgb.train(params=xgb_params,
                              dtrain=dtrain,
                              num_boost_round=5000,
                              evals=[(dtrain, 'train'), (dval, 'valid')],
                              early_stopping_rounds=100,
                              verbose_eval=200)

        xgb_pred = xgb_model.predict(xgb.DMatrix(X_val), ntree_limit=xgb_model.best_ntree_limit)
        oof_preds_xgb[val_idx] = xgb_pred

        # Metrics per fold
        for name, preds in [('lgb', lgb_pred), ('xgb', xgb_pred)]:
            auc = roc_auc_score(y_val, preds)
            pr_auc = average_precision_score(y_val, preds)
            pred_labels = (preds >= 0.5).astype(int)
            acc = accuracy_score(y_val, pred_labels)
            prec = precision_score(y_val, pred_labels, zero_division=0)
            rec = recall_score(y_val, pred_labels, zero_division=0)
            f1 = f1_score(y_val, pred_labels, zero_division=0)
            metrics[name].append({'auc':auc, 'pr_auc':pr_auc, 'accuracy':acc, 'precision':prec, 'recall':rec, 'f1':f1})
            print(f"{name.upper()} Fold {fold} — AUC: {auc:.4f}, PR-AUC: {pr_auc:.4f}, F1: {f1:.4f}")

        # Save per-fold models if desired
        joblib.dump(lgb_model, f"lgb_model_fold{fold}.pkl")
        xgb_model.save_model(f"xgb_model_fold{fold}.json")

    # Aggregate metrics
    import statistics
    results = {}
    for m in metrics:
        results[m] = {k: np.mean([fold_m[k] for fold_m in metrics[m]]) for k in metrics[m][0].keys()}
    print("\nCross-validated results:")
    print(results)

    return oof_preds_lgb, oof_preds_xgb, results

oof_lgb, oof_xgb, cv_results = cv_train_models(df, features, TARGET, n_splits=5)


--- Fold 1 ---
Training LightGBM...


TypeError: train() got an unexpected keyword argument 'verbose_eval'