<a href="https://colab.research.google.com/github/theabhishekc/Hackathon/blob/main/hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install pandas numpy scikit-learn lightgbm optuna joblib



Importing the necessary Libraries


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
import joblib
from lightgbm import early_stopping

Data Collection

In [None]:
print("Loading dataset...")
# Load your dataset - REPLACE THIS WITH YOUR ACTUAL DATASET PATH
# NOTE(review): "/content/..." looks like a Colab session path (the notebook
# carries an "Open in Colab" badge) - confirm the file is uploaded there first.
df = pd.read_csv("/content/Credit-Risk-Dataset.csv")  # Change to your actual file path

# Quick sanity check of what was loaded. In the captured output below,
# describe() shows person_age max 144, person_emp_length max 123, and a
# person_emp_length count (31686) below the row count (32581), i.e. the raw
# data contains implausible values and missing entries.
print(f"Dataset loaded with shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(df.describe())

Loading dataset...
Dataset loaded with shape: (32581, 11)
Columns: ['person_age', 'person_income', 'person_home_ownership', 'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt', 'loan_status', 'loan_percent_income', 'cb_person_default_on_file', 'cb_person_cred_hist_length']
         person_age  person_income  person_emp_length     loan_amnt  \
count  32581.000000   3.258100e+04       31686.000000  32581.000000   
mean      27.734600   6.607485e+04           4.789686   9589.371106   
std        6.348078   6.198312e+04           4.142630   6322.086646   
min       20.000000   4.000000e+03           0.000000    500.000000   
25%       23.000000   3.850000e+04           2.000000   5000.000000   
50%       26.000000   5.500000e+04           4.000000   8000.000000   
75%       30.000000   7.920000e+04           7.000000  12200.000000   
max      144.000000   6.000000e+06         123.000000  35000.000000   

        loan_status  loan_percent_income  cb_person_cred_hist_length  
count

Feature Engineering


In [None]:
def create_new_features(df):
    """
    Append derived ratio / interaction columns to a copy of the applicant frame.

    Reads ``person_income``, ``loan_amnt``, ``loan_percent_income``,
    ``person_emp_length``, ``person_age`` and ``cb_person_cred_hist_length``.
    The input frame is not modified; a new frame with eight extra columns
    (in the order listed below) is returned.
    """
    income = df['person_income']
    loan = df['loan_amnt']
    pct_of_income = df['loan_percent_income']
    tenure = df['person_emp_length']
    age = df['person_age']
    history = df['cb_person_cred_hist_length']

    # assign() returns a new frame, so the caller's data stays untouched.
    # The "+ 1" terms keep the denominators away from zero for zero-valued inputs.
    return df.assign(
        # Income relative to the requested loan amount
        liquidity_ratio=income / (loan + 1),
        # Income weighted by the share of it the loan represents
        debt_burden=income * pct_of_income,
        # Employment length relative to age
        experience_ratio=tenure / (age + 1),
        # Longer employment at higher income is treated as more stable
        income_stability=tenure * income,
        # Credit history length relative to years since age 18
        credit_density=history / (age - 18 + 1),
        # Income left after a loan-size-weighted burden, in thousands
        risk_capacity=(income - loan * pct_of_income) / 1000,
        # Squared loan-to-income share (non-linear term)
        dti_squared=pct_of_income ** 2,
        # NOTE(review): same expression as liquidity_ratio, so this column is an
        # exact duplicate; kept so the output column set stays unchanged.
        income_to_loan_ratio=income / (loan + 1),
    )



Data Preprocessing

In [None]:
def preprocess_data(df, target='loan_status', test_size=0.2, random_state=42):
    """
    Build encoded, scaled train/test splits from the raw applicant frame.

    Pipeline order: feature engineering (``create_new_features``), label
    encoding of object-dtype columns (fitted on the whole frame, before the
    split), stratified train/test split, then standard scaling of the
    int64/float64 columns (fitted on the training split only).

    Args:
        df: Raw applicant data including the ``target`` column.
        target: Name of the label column.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed for the split.

    Returns:
        ``(X_train, X_test, y_train, y_test, label_encoders, scaler,
        numeric_features, categorical_features)``.
    """
    engineered = create_new_features(df)

    features = engineered.drop(columns=[target])
    labels = engineered[target]

    # Column lists are captured before encoding, so the encoded categoricals
    # are never part of the scaled set.
    numeric_features = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = features.select_dtypes(include=['object']).columns.tolist()

    # One encoder per categorical column, kept so the mapping can be reused later.
    label_encoders = {name: LabelEncoder() for name in categorical_features}
    for name, encoder in label_encoders.items():
        features[name] = encoder.fit_transform(features[name].astype(str))

    # Stratify on the label so both splits keep the same class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(X_train[numeric_features])
    X_train[numeric_features] = scaler.transform(X_train[numeric_features])
    X_test[numeric_features] = scaler.transform(X_test[numeric_features])

    return (
        X_train, X_test, y_train, y_test,
        label_encoders, scaler, numeric_features, categorical_features,
    )

# Run the preprocessing pipeline with its defaults (target='loan_status',
# test_size=0.2, random_state=42) and keep the fitted encoders/scaler around
# so they can be saved alongside the model later.
print("Preprocessing data and creating new features...")
X_train, X_test, y_train, y_test, label_encoders, scaler, numeric_features, categorical_features = preprocess_data(df)

# Report split sizes as a sanity check on the train/test split.
print("Data preprocessing completed!")
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

Preprocessing data and creating new features...
Data preprocessing completed!
Training set shape: (26064, 18)
Test set shape: (6517, 18)


Model Development


In [None]:
def train_optimal_model(X_train, y_train, X_test, y_test, n_trials=50):
    """
    Tune LightGBM hyperparameters with Optuna, then fit the final classifier.

    The search maximises the cross-validated ROC-AUC returned by ``objective``
    on the training data. The final model is then fitted with the best
    parameters, using early stopping on a stratified validation split carved
    out of the training data.

    Args:
        X_train, y_train: Training features and labels; used for the search,
            the final fit and the early-stopping validation split.
        X_test, y_test: Accepted for backward compatibility only. They are
            deliberately NOT used for early stopping: choosing the number of
            boosting rounds on the test set would leak it into the model and
            inflate any score later computed on it.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``(final_model, best_cv_auc, best_params)``.
    """
    # Hyperparameter optimization (seeded sampler for reproducible searches)
    study = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=42)
    )
    study.optimize(
        lambda trial: objective(trial, X_train, y_train),
        n_trials=n_trials,
        show_progress_bar=True
    )

    print(f"Best ROC-AUC: {study.best_trial.value:.4f}")

    # Best sampled parameters plus the fixed settings used during the search
    best_params = dict(study.best_trial.params)
    best_params.update({
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42
    })

    # Early stopping needs held-out data, but it must not be the test set.
    # Trade-off: the final model is fitted on 80% of the training rows.
    X_fit, X_stop, y_fit, y_stop = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    final_model = lgb.LGBMClassifier(**best_params)
    final_model.fit(
        X_fit, y_fit,
        eval_set=[(X_stop, y_stop)],
        callbacks=[early_stopping(100, verbose=True)]  # callbacks replace early_stopping_rounds
    )

    return final_model, study.best_trial.value, best_params



Optimizing the model (Hyperparameter tuning)

In [None]:
def objective(trial, X, y, n_folds=5):
    """
    Optuna objective: mean ROC-AUC of a LightGBM classifier under stratified
    k-fold cross-validation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X, y: Features and labels; indexed positionally with ``.iloc``.
        n_folds: Number of cross-validation folds.

    Returns:
        Mean of the per-fold validation ROC-AUC scores.
    """
    # Fixed settings first, then the sampled search space. The order of the
    # suggest_* calls is kept stable so seeded searches stay reproducible.
    # NOTE(review): 'subsample' is sampled but no bagging frequency is set;
    # LightGBM may ignore row subsampling in that case - confirm.
    search_space = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'min_split_gain': trial.suggest_float('min_split_gain', 1e-8, 0.1, log=True),
    }

    def _fold_auc(fit_idx, holdout_idx):
        """Fit on one fold's training rows and return the hold-out ROC-AUC."""
        X_fit, X_holdout = X.iloc[fit_idx], X.iloc[holdout_idx]
        y_fit, y_holdout = y.iloc[fit_idx], y.iloc[holdout_idx]

        clf = lgb.LGBMClassifier(**search_space, random_state=42)
        clf.fit(
            X_fit, y_fit,
            eval_set=[(X_holdout, y_holdout)],
            # The hold-out fold drives early stopping and is also the scored fold.
            callbacks=[early_stopping(100, verbose=False)]
        )
        return roc_auc_score(y_holdout, clf.predict_proba(X_holdout)[:, 1])

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    return np.mean([_fold_auc(fit_idx, holdout_idx)
                    for fit_idx, holdout_idx in splitter.split(X, y)])

# Kick off the Optuna search and final fit. n_trials=30 overrides the
# function's default of 50.
print("\nStarting hyperparameter optimization...")
model, best_score, best_params = train_optimal_model(X_train, y_train, X_test, y_test, n_trials=30)

[I 2025-08-22 17:48:43,780] A new study created in memory with name: no-name-370709e0-565e-4a45-a0dc-c23ea6a41598



Starting hyperparameter optimization...


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-08-22 17:49:01,265] Trial 0 finished with value: 0.9438537809968637 and parameters: {'n_estimators': 812, 'learning_rate': 0.24517932047070642, 'num_leaves': 2200, 'max_depth': 8, 'min_child_samples': 35, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'reg_alpha': 0.6245760287469893, 'reg_lambda': 0.002570603566117598, 'min_split_gain': 0.000904707195756838}. Best is trial 0 with value: 0.9438537809968637.
[I 2025-08-22 17:49:04,715] Trial 1 finished with value: 0.9465050204512527 and parameters: {'n_estimators': 139, 'learning_rate': 0.2652261985899886, 'num_leaves': 2500, 'max_depth': 5, 'min_child_samples': 40, 'subsample': 0.5917022549267169, 'colsample_bytree': 0.6521211214797689, 'reg_alpha': 0.00052821153945323, 'reg_lambda': 7.71800699380605e-05, 'min_split_gain': 1.092959278721938e-06}. Best is trial 1 with value: 0.9465050204512527.
[I 2025-08-22 17:49:34,558] Trial 2 finished with value: 0.9349938728729409 and parameters: {'n_estimators': 12

Model Validation


In [None]:
# Make predictions
print("\nMaking predictions...")
# Column 1 of predict_proba is used as the default probability
# (presumably the loan_status == 1 class - confirm against model.classes_).
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Hard class labels; not referenced again in the cells visible here.
y_pred = model.predict(X_test)

# Evaluate model: ROC-AUC is computed from the probabilities, not the labels
test_auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nFinal Test ROC-AUC Score: {test_auc:.4f}")


Making predictions...

Final Test ROC-AUC Score: 0.9511


In [None]:
# Feature importance: pair each training column with the fitted model's
# importance score and rank from most to least important.
# NOTE(review): the integer values in the output suggest split counts
# (LightGBM's default importance type) rather than gain - confirm.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
             feature  importance
1      person_income         744
10   liquidity_ratio         355
4        loan_intent         310
13  income_stability         297
5         loan_grade         296
12  experience_ratio         288
11       debt_burden         268
15     risk_capacity         205
14    credit_density         200
6          loan_amnt         196


Probability of Default (PD) Estimation for all applicants

In [None]:
print("\nGenerating PD values for all applicants...")
# Score every applicant (train + test rows).
# NOTE: PDs for training rows are in-sample predictions and therefore optimistic.
X_all = pd.concat([X_train, X_test])

# train_test_split shuffles rows, so X_all is NOT in df's row order. Keep each
# prediction attached to its original row label and reorder to df's index;
# assigning the raw array positionally would attach PDs to the wrong applicants.
pd_by_row = pd.Series(model.predict_proba(X_all)[:, 1], index=X_all.index)
all_predictions = pd_by_row.reindex(df.index).to_numpy()
assert len(all_predictions) == len(df), "one PD is expected per applicant"
assert not np.isnan(all_predictions).any(), "every applicant row must receive a PD"

# Create final results dataframe (original, unscaled columns + PD)
final_results = df.copy()
final_results['PD'] = all_predictions
print(final_results.head())


Generating PD values for all applicants...
   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_status  loan_percent_income  \
0    PERSONAL          D      35000            1                 0.59   
1   EDUCATION          B       1000            0                 0.10   
2     MEDICAL          C       5500            1                 0.57   
3     MEDICAL          C      35000            1                 0.53   
4     MEDICAL          C      35000            1                 0.55   

  cb_person_default_on_file  cb_person_cred_hist_length        PD  
0         

Business Intelligence and Deployment Framework

# **Interest Rate Assignment Strategy**
(addressing question 2)

Approach: Tiered, non-linear pricing model → aligns with profitability, competitiveness, and risk-adjustment.

**Core Objectives:**



*   Competitiveness – attract highly creditworthy applicants with low rates.
*   Risk-Based Pricing – ensure profitability across varying risk levels.


*   Behavioral Incentives – offer acceptable higher rates to riskier applicants (instead of rejecting).


**Interest Rate Function (Capped: 7%–18%):**

Tier 1 (Super-Prime, PD ≤ 10%): Gentle slope (7 + 15*PD). Competitive rates for best borrowers.

Tier 2 (Prime, 10% < PD ≤ 40%): Steeper slope (8.5 + 20*(PD - 0.10)). Balances risk-reward, protects margins.

Tier 3 (Sub-Prime, PD > 40%): Flatter slope (14.5 + 6*(PD - 0.40)). Soft decline strategy; caps excessive risk-taking.

Rationale: Optimizes for customer acquisition and risk-adjusted profitability, unlike linear models.

In [None]:
def assign_interest_rate(pd_value):
    """
    Map a probability of default (PD) to an interest rate in percent.

    Piecewise-linear, continuous at the tier boundaries, capped to [7, 18]:
      - PD <= 0.10        : 7   + 15 * PD           (low risk)
      - 0.10 < PD <= 0.40 : 8.5 + 20 * (PD - 0.10)  (medium risk)
      - PD > 0.40         : 14.5 + 6 * (PD - 0.40)  (high risk)

    Args:
        pd_value: Probability of default as a fraction (0.25 means 25%).

    Returns:
        The rate in percent, or NaN when ``pd_value`` is NaN.
    """
    # NaN is the only value not equal to itself. Without this guard a missing
    # PD fails every comparison and the final clamp silently returns 7.0,
    # i.e. the best available rate for an applicant with unknown risk.
    if pd_value != pd_value:
        return float('nan')

    if pd_value <= 0.10:  # Low Risk Tier
        rate = 7 + (15 * pd_value)
    elif pd_value <= 0.40:  # Medium Risk Tier
        rate = 8.5 + (20 * (pd_value - 0.10))
    else:  # High Risk Tier
        rate = 14.5 + (6 * (pd_value - 0.40))

    # Ensure the rate is capped between 7% and 18%
    return max(7.0, min(rate, 18.0))

# Assign interest rates based on PD: the tiered pricing function is applied
# to each applicant's PD and stored as a percentage.
final_results['assigned_interest_rate'] = final_results['PD'].apply(assign_interest_rate)
print(final_results.head())

   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_status  loan_percent_income  \
0    PERSONAL          D      35000            1                 0.59   
1   EDUCATION          B       1000            0                 0.10   
2     MEDICAL          C       5500            1                 0.57   
3     MEDICAL          C      35000            1                 0.53   
4     MEDICAL          C      35000            1                 0.55   

  cb_person_default_on_file  cb_person_cred_hist_length        PD  \
0                         Y                          

# **Maximum Profit Calculation & Portfolio Optimization**
(addressing question 3)

**Focus:** Maximize expected profit, not just minimize default risk.

Expected Profit Formula per Applicant:

------------------------------------------------------
$$E[\text{Profit}] = \left(\text{Loan Amount} \times \frac{\text{Rate}}{100}\right) \times (1 - PD) \;-\; \left(\text{Loan Amount} \times LGD \times PD\right)$$

---------------------------------------------------------------------------
With LGD = 60% (0.6).

# Portfolio Optimization Steps:


1.   Predict PD for each applicant (model-generated).
2.   Assign Interest Rate via tiered function.
3.   Calculate E[Profit] for all applicants.
4.   Rank by Profitability (descending).
5.   Apply Constraint – accept only top 30% of applicants.
6.   Compute Portfolio Value – sum of E[Profit] for top 30%.









In [None]:
def calculate_expected_profit(loan_amnt, interest_rate, pd_value, lgd=0.6):
    """
    Expected profit of a loan given its default probability.

    E[profit] = interest earned * (1 - PD) - loan amount * LGD * PD

    Args:
        loan_amnt: Principal of the loan.
        interest_rate: Rate in percent (12.5 means 12.5%).
        pd_value: Probability of default as a fraction.
        lgd: Loss given default, as a fraction of the principal.

    Returns:
        Expected profit, in the same currency units as ``loan_amnt``.
    """
    interest_income = loan_amnt * (interest_rate / 100)  # earned if the loan is repaid
    default_loss = -loan_amnt * lgd  # negative: principal lost on default

    repay_probability = 1 - pd_value
    return (interest_income * repay_probability) + (default_loss * pd_value)

# Calculate expected profit for every applicant in a single vectorised call.
# calculate_expected_profit only uses arithmetic operators, so it accepts whole
# columns; this gives the same values as a row-wise apply(axis=1) without a
# Python-level call per row.
final_results['expected_profit'] = calculate_expected_profit(
    final_results['loan_amnt'],
    final_results['assigned_interest_rate'],
    final_results['PD'],
)

print("\nSample predictions with PD values and interest rates:")
print(final_results[['person_age', 'person_income', 'loan_amnt', 'loan_status',
                    'PD', 'assigned_interest_rate', 'expected_profit']].head(10))


Sample predictions with PD values and interest rates:
   person_age  person_income  loan_amnt  loan_status        PD  \
0          22          59000      35000            1  0.007830   
1          21           9600       1000            0  0.301612   
2          25           9600       5500            1  0.098366   
3          23          65500      35000            1  0.990906   
4          24          54400      35000            1  0.000641   
5          21           9900       2500            1  0.017298   
6          26          77100      35000            1  0.038293   
7          24          78956      35000            1  0.980873   
8          24          83000      35000            1  0.024701   
9          21          10000       1600            1  0.134671   

   assigned_interest_rate  expected_profit  
0                7.117453      2307.168500  
1               12.532250       -93.443823  
2                8.475486        95.691454  
3               18.000000    -20751.73

Finding the top 30% most profitable applicants

In [None]:
print("\nSelecting top 30% most profitable applicants...")
# Rank applicants by expected profit, best first.
final_results_sorted = final_results.sort_values('expected_profit', ascending=False)
# Keep the first 30% of rows; int() truncates, so the count is rounded down.
# The cut is by rank only - there is no check that the selected rows have
# positive expected profit.
top_30_percent = final_results_sorted.head(int(len(final_results_sorted) * 0.3))
# Portfolio value = sum of expected profit over the accepted applicants.
total_expected_profit = top_30_percent['expected_profit'].sum()
print(f"Total expected profit from top 30% applicants: ₹{total_expected_profit:,.2f}")
print(f"Number of applicants in top 30%: {len(top_30_percent)}")
print(f"Average PD in top 30%: {top_30_percent['PD'].mean():.4f}")
print(f"Average interest rate in top 30%: {top_30_percent['assigned_interest_rate'].mean():.2f}%")



Selecting top 30% most profitable applicants...
Total expected profit from top 30% applicants: ₹7,873,174.08
Number of applicants in top 30%: 9774
Average PD in top 30%: 0.0228
Average interest rate in top 30%: 7.34%


Saving model and preprocessing artifacts


In [None]:
print("\nSaving model and preprocessing artifacts...")
# Persist everything needed to score new applicants: the fitted model, the
# per-column label encoders, the fitted scaler and the chosen hyperparameters.
# Files are written relative to the current working directory.
# NOTE(review): joblib files are pickle-based - only load them from trusted sources.
joblib.dump(model, 'lightgbm_credit_risk_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(best_params, 'best_hyperparameters.pkl')

# Save final results (index=False drops the row index from the CSVs)
final_results.to_csv('loan_applicants_with_predictions.csv', index=False)
top_30_percent.to_csv('top_30_percent_profitable_applicants.csv', index=False)

# Summary of what was written above
print("\nAll files saved successfully!")
print("Files created:")
print("- lightgbm_credit_risk_model.pkl (Trained model)")
print("- label_encoders.pkl (Categorical encoders)")
print("- feature_scaler.pkl (Feature scaler)")
print("- best_hyperparameters.pkl (Optimal hyperparameters)")
print("- loan_applicants_with_predictions.csv (All applicants with predictions)")
print("- top_30_percent_profitable_applicants.csv (Top 30% most profitable applicants)")


Saving model and preprocessing artifacts...

All files saved successfully!
Files created:
- lightgbm_credit_risk_model.pkl (Trained model)
- label_encoders.pkl (Categorical encoders)
- feature_scaler.pkl (Feature scaler)
- best_hyperparameters.pkl (Optimal hyperparameters)
- loan_applicants_with_predictions.csv (All applicants with predictions)
- top_30_percent_profitable_applicants.csv (Top 30% most profitable applicants)
