In [1]:
#==========================================
# Title:  Climate Credit Risk Model
# Author: Vachan
# Email : vachan@iitb.ac.in
#==========================================

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from itertools import combinations
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
def simulate_future_data(base_data, best_model_vars, years_to_simulate=4,
                         random_state=None, base_year=2021):
    """
    Simulates future data for each customer_id by rolling simple trends forward.

    Every simulated year starts from a copy of the previous year's rows and, for
    each variable that is listed in ``best_model_vars``, applies:
      - 'ltv': multiplied by a per-row factor drawn uniformly from [0.95, 0.99)
      - 'average_age': increased by 1
      - 'credit_score': multiplied by a per-row factor drawn uniformly from
        [1.0, 1.02), then capped at 900
      - 'last_six_month_defaulted_no': redrawn as 0/1 with P(1) = 0.02
      - 'credit_history': increased by 1 (coerced to numeric first; values that
        cannot be parsed become 0)

    Args:
        base_data (pd.DataFrame): The original dataframe to base simulations on.
                                  Must contain a 'customer_id' column. It is not modified.
        best_model_vars (list): The list of variables to simulate.
        years_to_simulate (int): The number of future years to simulate
                                 (e.g., 4 for 2022-2025 with the default base year).
        random_state (None | int | np.random.RandomState | np.random.Generator):
            Source of randomness. None (default) uses the global ``np.random``
            state, exactly as before, so ``np.random.seed`` still applies. An int
            seeds a private ``RandomState``; an object exposing ``uniform`` and
            ``choice`` is used as-is.
        base_year (int): Year assigned to the base rows when ``base_data`` has no
                         'year' column, and the year the simulated years count from.

    Returns:
        pd.DataFrame: The base rows followed by one block of rows per simulated
                      year, concatenated with a fresh integer index.

    Raises:
        ValueError: If ``base_data`` has no 'customer_id' column.
    """
    print(
        "--- Starting Future Data Simulation (per customer) for "
        f"{base_year + 1}-{base_year + years_to_simulate} ---"
    )

    if 'customer_id' not in base_data.columns:
        raise ValueError("The base data must contain a 'customer_id' column.")

    # Resolve the random source once. Falling back to the np.random module keeps
    # the historical behaviour (draws come from the global stream).
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'uniform') and hasattr(random_state, 'choice'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Start with the base-year data; work on a copy so the caller's frame is untouched.
    last_known_data = base_data.copy()
    if 'year' not in last_known_data.columns:
        last_known_data['year'] = base_year

    # Ensure credit_history is numeric for simulation
    if 'credit_history' in best_model_vars:
        last_known_data['credit_history'] = pd.to_numeric(
            last_known_data['credit_history'], errors='coerce'
        ).fillna(0)

    future_data_list = [last_known_data]  # Include the base year in the list

    for year_offset in range(1, years_to_simulate + 1):
        future_year = base_year + year_offset
        print(f"Simulating data for year: {future_year}")

        future_df = last_known_data.copy()
        future_df['year'] = future_year
        n_rows = len(future_df)

        # --- Simulation Logic (applied to the previous year's data) ---
        # The order of the random draws below is kept stable so that a given
        # seed always yields the same simulated portfolio.
        if 'ltv' in best_model_vars:
            future_df['ltv'] = future_df['ltv'] * rng.uniform(0.95, 0.99, size=n_rows)

        if 'average_age' in best_model_vars:
            future_df['average_age'] = future_df['average_age'] + 1

        if 'credit_score' in best_model_vars:
            grown_score = future_df['credit_score'] * rng.uniform(1.0, 1.02, size=n_rows)
            future_df['credit_score'] = grown_score.clip(upper=900)

        if 'last_six_month_defaulted_no' in best_model_vars:
            future_df['last_six_month_defaulted_no'] = rng.choice(
                [0, 1], size=n_rows, p=[0.98, 0.02]
            )

        if 'credit_history' in best_model_vars:
            future_df['credit_history'] = future_df['credit_history'] + 1

        future_data_list.append(future_df)
        # future_df is never mutated after this point and the next iteration
        # copies it again, so no extra defensive copy is needed here.
        last_known_data = future_df

    full_simulated_df = pd.concat(future_data_list, ignore_index=True)

    print("\n--- Simulation Complete ---")
    return full_simulated_df


In [4]:
def load_ngfs_data_for_country(file_path, country_name, ngfs_variable_vector, years_to_extract):
    """
    Reads an NGFS Excel workbook and reshapes one country's series into a
    year-by-scenario table.

    Args:
        file_path: Path handed to ``pd.read_excel``.
        country_name: Value that the 'Region' column must equal.
        ngfs_variable_vector: Values of the 'Variable' column to keep.
        years_to_extract: Years to pull; each is matched (as a string) against
                          the workbook's stripped column headers.

    Returns:
        pd.DataFrame: One row per (year, Scenario) pair with one column per
        retained variable and 'year' as an integer. Repeated entries for the
        same (year, Scenario, Variable) are combined by ``pivot_table``'s
        default aggregation. An empty DataFrame is returned when the workbook
        cannot be read, lacks the 'Region'/'Scenario'/'Variable' columns, has
        no matching rows, or has none of the requested year columns.
    """
    print(f"--- Loading NGFS data from {file_path} for {country_name} ---")

    key_columns = ['Region', 'Scenario', 'Variable']

    try:
        raw = pd.read_excel(file_path)
        # Headers may be ints (years) or padded strings; normalise to clean text.
        raw.columns = [str(header).strip() for header in raw.columns]

        absent = [name for name in key_columns if name not in raw.columns]
        if absent:
            raise ValueError(f"The Excel file must contain {key_columns} columns.")
    except Exception as e:
        # Any read/validation problem is reported and signalled via an empty frame.
        print(f"Error loading or parsing the Excel file: {e}")
        return pd.DataFrame()

    is_country = raw['Region'] == country_name
    is_wanted_variable = raw['Variable'].isin(ngfs_variable_vector)
    selected = raw[is_country & is_wanted_variable]
    if selected.empty:
        return pd.DataFrame()

    # Keep only the requested years that actually exist as columns.
    available_years = [label for label in map(str, years_to_extract) if label in selected.columns]
    if not available_years:
        return pd.DataFrame()

    # Wide (one column per year) -> long (one row per observation).
    long_form = pd.melt(
        selected,
        id_vars=key_columns,
        value_vars=available_years,
        var_name='year',
        value_name='value',
    )

    # Long -> one column per variable, keyed by year and scenario.
    wide_form = pd.pivot_table(
        long_form,
        index=['year', 'Scenario'],
        columns='Variable',
        values='value',
    ).reset_index()
    wide_form['year'] = pd.to_numeric(wide_form['year']).astype(int)

    print("NGFS data loaded and transformed successfully.")
    return wide_form



In [5]:
def train_and_predict_risk_drivers(customer_data, explanatory_data, dependent_vars, explanatory_vars):
    """
    Trains one OLS model per risk driver on the 'Baseline' scenario and predicts
    2026 values for every scenario.

    Customer rows are first averaged per year, so each model is fitted on one
    observation per year rather than on customer-level data. For each risk
    driver, every subset of ``explanatory_vars`` that leaves at least
    ``len(subset) + 2`` complete rows is fitted with an intercept, and the fit
    with the lowest AIC is kept. That model is then applied to the 2026 rows of
    ``explanatory_data`` for all scenarios.

    Side effects:
        Prints progress, writes 'predicted_risk_drivers_2026.csv' to the current
        working directory, and prints a Scenario x risk-driver summary table.

    Args:
        customer_data (pd.DataFrame): Must contain 'year' and every column in
                                      ``dependent_vars``.
        explanatory_data (pd.DataFrame): Must contain 'year', 'Scenario' and
                                         every column in ``explanatory_vars``.
        dependent_vars (list): Risk-driver columns to model.
        explanatory_vars (list): Candidate regressor columns.

    Returns:
        pd.DataFrame or None: Long-format predictions with columns 'Scenario',
        'risk_driver' and 'predicted_value_2026'. None when there is no 2026
        explanatory data or when no model could be fitted.
    """
    print("\n\n--- Starting Model Training and Prediction ---")

    # Aggregate customer data by year to prevent memory errors
    avg_customer_data = customer_data.groupby('year')[dependent_vars].mean().reset_index()

    # Prepare training data (Baseline only, restricted to years present in customer_data)
    merged_data = pd.merge(avg_customer_data, explanatory_data, on='year', how='left')
    train_data = merged_data[merged_data['Scenario'] == 'Baseline'].copy()

    # Prepare prediction input data (2026, all scenarios)
    prediction_inputs = explanatory_data[explanatory_data['year'] == 2026].copy()

    if prediction_inputs.empty:
        print("CRITICAL WARNING: No NGFS data found for the prediction year (2026). Cannot proceed.")
        return None

    if train_data.empty:
        print("WARNING: No 'Baseline' rows are available for training; no model can be fitted.")

    # A subset of size k needs at least k + 2 complete rows, and dropping NaNs can
    # only shrink the row count, so larger subsets would always be skipped.
    # Capping here avoids enumerating them without changing which model wins.
    max_subset_size = min(len(explanatory_vars), len(train_data) - 2)

    all_predictions = []

    for dep_var in dependent_vars:
        print(f"\n==================== Processing Risk Driver: {dep_var} ====================")

        # Find the Best Model using 'Baseline' Data
        lowest_aic = float('inf')
        best_model = None
        best_combination = None
        failed_fits = 0
        last_fit_error = None

        print("\n--- Finding best model on 'Baseline' data (2021-2025) ---")
        for subset_size in range(1, max_subset_size + 1):
            for combo in combinations(explanatory_vars, subset_size):
                combo_list = list(combo)

                model_data = train_data[[dep_var] + combo_list].dropna()
                if len(model_data) < len(combo_list) + 2:
                    continue

                # has_constant='add' forces an intercept column even if a
                # regressor happens to be constant over the training rows.
                X_train = sm.add_constant(model_data[combo_list], has_constant='add')
                y_train = model_data[dep_var]

                try:
                    model = sm.OLS(y_train, X_train).fit()
                    if model.aic < lowest_aic:
                        lowest_aic = model.aic
                        best_model = model
                        best_combination = combo_list
                except Exception as fit_error:
                    # Best-effort search: one failing subset must not abort the
                    # whole run, but it should not vanish without a trace either.
                    failed_fits += 1
                    last_fit_error = fit_error

        if failed_fits:
            print(
                f"Note: {failed_fits} combination(s) failed to fit and were skipped "
                f"(last error: {last_fit_error})."
            )

        if best_model is None:
            print(f"Could not find a suitable model for '{dep_var}'. Skipping.")
            continue

        print(f"Best model for '{dep_var}' found with combination: {best_combination}")
        print(f"Lowest AIC: {lowest_aic:.4f}")

        # Predict for 2026 under ALL Scenarios
        print("\n--- Predicting 2026 values for all scenarios ---")

        X_predict = sm.add_constant(prediction_inputs[best_combination], has_constant='add')
        predictions = best_model.predict(X_predict)

        # predictions shares prediction_inputs' index, so the assignment aligns row by row.
        scenario_predictions = prediction_inputs[['Scenario']].copy()
        scenario_predictions['risk_driver'] = dep_var
        scenario_predictions['predicted_value_2026'] = predictions
        all_predictions.append(scenario_predictions)

    # --- Display and Save Final Predictions ---
    if not all_predictions:
        print("\nNo predictions were generated.")
        return None

    final_predictions_df = pd.concat(all_predictions, ignore_index=True)

    # Save the results to a CSV file
    predicted_drivers_file = 'predicted_risk_drivers_2026.csv'
    final_predictions_df.to_csv(predicted_drivers_file, index=False)
    print(f"\n--- Predicted risk drivers for 2026 saved to {predicted_drivers_file} ---")

    print("\n\n==================== FINAL 2026 PREDICTIONS ====================")
    prediction_summary = final_predictions_df.pivot_table(
        index='Scenario',
        columns='risk_driver',
        values='predicted_value_2026'
    )
    print(prediction_summary)

    return final_predictions_df



In [6]:
# Driver cell: load the customer portfolio and NGFS scenario data, simulate the
# portfolio forward to 2025, then fit the risk-driver models and predict 2026.
if __name__ == '__main__':
    try:
        # --- Define File Paths and Variables ---
        customer_data_file = "C:/Users/vacha/OneDrive - Indian Institute of Technology Bombay/Climate Finance/Retail Portfoilio Research/Auto Risk Kaggle/data.csv"
        ngfs_data_file = "C:/Users/vacha/OneDrive - Indian Institute of Technology Bombay/Climate Finance/Retail Portfoilio Research/GEME3_IIASA_2025_05_02.xlsx" # <--- CHANGE THIS PATH

        # NGFS series offered to the model search as candidate regressors.
        ngfs_variables_to_use = [
            'Imports|Crude Oil',
            'Crude Oil YoY Growth',
            'Expenditure|Household|Transport services',
            'Investment|Energy',
            'Exports|EV Transport Equipment',
            'Production|EV Transport Equipment',
            'Investments|EV Transport Equipment',
            'Expenditure|Household|Purchase of vehicles',
            'Power Generation Technologies|Oil fired',
            'Unemployment Rate'
        ]

        # Customer-level risk drivers that are simulated and then modelled.
        best_model_variables = [
            'ltv', 'average_age', 'credit_score',
            'last_six_month_defaulted_no', 'credit_history']

        # The simulation draws random numbers; a fixed seed makes
        # Restart Kernel -> Run All reproduce the same portfolio and predictions.
        SIMULATION_SEED = 42

        # --- 1. Load Data ---
        df = pd.read_csv(customer_data_file)

        # Load NGFS data for training (2021-2025) and prediction (2026)
        explanatory_df = load_ngfs_data_for_country(
            file_path=ngfs_data_file,
            country_name='India - IND',
            ngfs_variable_vector=ngfs_variables_to_use,
            years_to_extract=[2021, 2022, 2023, 2024, 2025, 2026]
        )

        if explanatory_df.empty:
            raise ValueError("Failed to load explanatory data. Check file path, country name, and variable names.")

        # Drop customers with a missing value in any modelled risk driver.
        df = df.dropna(subset=best_model_variables)

        # --- 2. Simulate Customer-Level Data up to 2025 ---
        np.random.seed(SIMULATION_SEED)
        customer_data_simulated = simulate_future_data(df, best_model_variables, years_to_simulate=4)

        # --- 3. Train Models on Baseline and Predict for 2026 ---
        # Every pivoted NGFS column except the two key columns is a candidate regressor.
        explanatory_variable_vector = [col for col in explanatory_df.columns if col not in ['year', 'Scenario']]

        train_and_predict_risk_drivers(
            customer_data_simulated,
            explanatory_df,
            best_model_variables,
            explanatory_variable_vector
        )

    except FileNotFoundError as e:
        print(f"\nError: A required file was not found. Details: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


--- Loading NGFS data from C:/Users/vacha/OneDrive - Indian Institute of Technology Bombay/Climate Finance/Retail Portfoilio Research/GEME3_IIASA_2025_05_02.xlsx for India - IND ---
NGFS data loaded and transformed successfully.
--- Starting Future Data Simulation (per customer) for 2022-2025 ---
Simulating data for year: 2022
Simulating data for year: 2023
Simulating data for year: 2024
Simulating data for year: 2025

--- Simulation Complete ---


--- Starting Model Training and Prediction ---


--- Finding best model on 'Baseline' data (2021-2025) ---
Best model for 'ltv' found with combination: ['Imports|Crude Oil', 'Production|EV Transport Equipment', 'Unemployment Rate']
Lowest AIC: -27.1436

--- Predicting 2026 values for all scenarios ---


--- Finding best model on 'Baseline' data (2021-2025) ---
Best model for 'average_age' found with combination: ['Imports|Crude Oil', 'Production|EV Transport Equipment', 'Unemployment Rate']
Lowest AIC: -34.9337

--- Predicting 2026 values fo