In [1]:
### 04_ReturnForecasting.ipynb
# Objective: Predict next-day returns for each sector, conditioned on market regimes

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import os

In [2]:
# ------------------------
# Step 1: Load Data
# ------------------------
features = pd.read_csv("../data/processed/feature_matrix.csv", parse_dates=['Date'], index_col='Date')
regimes = pd.read_csv("../data/processed/regime_labels.csv", parse_dates=['Date'], index_col='Date')
returns = pd.read_csv("../data/processed/returns_data.csv", parse_dates=['Date'], index_col='Date')

# Align all data
df = features.join(regimes).join(returns)

# Create target: next-day returns
targets = returns.shift(-1).add_suffix("_t+1")
df = df.join(targets)


In [3]:
df.shape

(2209, 56)

In [4]:
# Drop any resulting NaNs
df.dropna(inplace=True)


In [5]:
df.shape

(2208, 56)

In [8]:
# ------------------------
# Step 2: Model Training per Regime (with Sanitation)
# ------------------------

def sanitize_features(X, method='median', cap=5.0):
    """ Replace inf/nan using contextual imputation and optional capping """
    X = X.copy()
    for col in X.columns:
        x = X[col].replace([np.inf, -np.inf], np.nan)

        if method == 'median':
            fill_val = np.nanmedian(x)
        elif method == 'mean':
            fill_val = np.nanmean(x)
        else:
            fill_val = 0.0

        X[col] = np.nan_to_num(x, nan=fill_val)
        
        # Optional: clip to cap outliers (e.g., ±5×std)
        std = np.std(X[col])
        mean = np.mean(X[col])
        if std > 0:
            X[col] = np.clip(X[col], mean - cap * std, mean + cap * std)
        
    return X


sector_list = ['NIFTY_IT', 'NIFTY_BANK', 'NIFTY_FMCG', 'NIFTY_PHARMA', 'NIFTY_AUTO', 'NIFTY_METAL']
results = []
models = {}

for regime in sorted(df['vol_regime'].unique()):
    print(f"\n--- Regime {regime} ---")
    regime_df = df[df['vol_regime'] == regime]

    for sector in sector_list:
        # Extract raw features and sanitize them
        X_raw = regime_df[features.columns]
        X = sanitize_features(X_raw, method='median', cap=5.0)

        y = regime_df[f"{sector}_t+1"]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        model = Ridge(alpha=1.0)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"{sector}: RMSE = {rmse:.5f}, R^2 = {r2:.4f}")
        results.append({'regime': regime, 'sector': sector, 'rmse': rmse, 'r2': r2})

        # Save model for later use
        model_name = f"{sector}_regime{regime}_ridge_model.joblib"
        model_path = os.path.join("../scripts/models", model_name)
        joblib.dump(model, model_path)
        models[(sector, regime)] = model



--- Regime 0 ---
NIFTY_IT: RMSE = 0.02000, R^2 = -2.3558
NIFTY_BANK: RMSE = 0.00789, R^2 = -0.0015
NIFTY_FMCG: RMSE = 0.01137, R^2 = -0.0876
NIFTY_PHARMA: RMSE = 0.00987, R^2 = -0.0986
NIFTY_AUTO: RMSE = 0.00976, R^2 = -0.0066
NIFTY_METAL: RMSE = 0.01371, R^2 = -0.0465

--- Regime 1 ---
NIFTY_IT: RMSE = 0.01302, R^2 = -0.0084
NIFTY_BANK: RMSE = 0.01114, R^2 = -0.0046
NIFTY_FMCG: RMSE = 0.00917, R^2 = -0.0167
NIFTY_PHARMA: RMSE = 0.01344, R^2 = -0.0431
NIFTY_AUTO: RMSE = 0.01301, R^2 = -0.0420
NIFTY_METAL: RMSE = 0.01802, R^2 = -0.0373

--- Regime 2 ---
NIFTY_IT: RMSE = 0.01840, R^2 = -0.0892
NIFTY_BANK: RMSE = 0.02296, R^2 = -0.1118
NIFTY_FMCG: RMSE = 0.01338, R^2 = 0.0213
NIFTY_PHARMA: RMSE = 0.01552, R^2 = -0.0297
NIFTY_AUTO: RMSE = 0.02003, R^2 = -0.0580
NIFTY_METAL: RMSE = 0.02403, R^2 = 0.0152


In [9]:
# ------------------------
# Step 3: Result Summary
# ------------------------
results_df = pd.DataFrame(results)
print("\n=== Summary of Forecasting Results ===")
print(results_df.groupby('regime')[['rmse', 'r2']].mean())



=== Summary of Forecasting Results ===
            rmse        r2
regime                    
0       0.012097 -0.432755
1       0.012966 -0.025351
2       0.019054 -0.042011


In [10]:
# ------------------------
# Step 4: Plot Predicted vs Actual (example sector)
# ------------------------
example_sector = 'NIFTY_IT'
example_regime = 0

regime_df = df[df['vol_regime'] == example_regime]
X = regime_df[features.columns]
y = regime_df[f"{example_sector}_t+1"]
model = models[(example_sector, example_regime)]
y_pred = model.predict(X)

plt.figure(figsize=(10, 5))
plt.plot(y.index, y, label='Actual')
plt.plot(y.index, y_pred, label='Predicted', alpha=0.7)
plt.title(f"{example_sector} - Regime {example_regime} Return Prediction")
plt.legend()
plt.savefig("../results/plots/return_forecast_example.png")
plt.show()


ValueError: Input X contains infinity or a value too large for dtype('float64').

ReturnForecasting ✅ Final Try: XGBoost and MLPRegressor (Side-by-side)

In [11]:
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def sanitize_features(X, method='median', cap=5.0):
    """ Replace inf/nan using contextual imputation and optional capping """
    X = X.copy()
    for col in X.columns:
        x = X[col].replace([np.inf, -np.inf], np.nan)

        fill_val = np.nanmedian(x) if method == 'median' else np.nanmean(x)
        X[col] = np.nan_to_num(x, nan=fill_val)

        std = np.std(X[col])
        mean = np.mean(X[col])
        X[col] = np.clip(X[col], mean - cap * std, mean + cap * std)

    return X

sector_list = ['NIFTY_IT', 'NIFTY_BANK', 'NIFTY_FMCG', 'NIFTY_PHARMA', 'NIFTY_AUTO', 'NIFTY_METAL']
results = []

for regime in sorted(df['vol_regime'].unique()):
    print(f"\n--- Regime {regime} ---")
    regime_df = df[df['vol_regime'] == regime]

    for sector in sector_list:
        X = regime_df[features.columns]
        y = regime_df[f"{sector}_t+1"]
        X = sanitize_features(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Option 1: XGBoost
        xgb_model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.05, random_state=42)
        xgb_model.fit(X_train, y_train)
        y_pred_xgb = xgb_model.predict(X_test)
        r2_xgb = r2_score(y_test, y_pred_xgb)

        # Option 2: MLP Regressor
        mlp_model = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
        mlp_model.fit(X_train, y_train)
        y_pred_mlp = mlp_model.predict(X_test)
        r2_mlp = r2_score(y_test, y_pred_mlp)

        print(f"{sector}: R² XGBoost = {r2_xgb:.4f} | R² MLP = {r2_mlp:.4f}")
        results.append({
            'regime': regime, 'sector': sector,
            'r2_xgb': r2_xgb, 'r2_mlp': r2_mlp
        })



--- Regime 0 ---
NIFTY_IT: R² XGBoost = -241.0908 | R² MLP = -177053.2979
NIFTY_BANK: R² XGBoost = -0.0495 | R² MLP = -333953.9300
NIFTY_FMCG: R² XGBoost = -0.1076 | R² MLP = -173998.1583
NIFTY_PHARMA: R² XGBoost = -0.0951 | R² MLP = -233378.5034
NIFTY_AUTO: R² XGBoost = -0.0676 | R² MLP = -223943.3419
NIFTY_METAL: R² XGBoost = -0.1509 | R² MLP = -114724.5493

--- Regime 1 ---
NIFTY_IT: R² XGBoost = -0.0037 | R² MLP = -46953.7101
NIFTY_BANK: R² XGBoost = -0.0094 | R² MLP = -45492.4320
NIFTY_FMCG: R² XGBoost = -0.1064 | R² MLP = -93067.6856
NIFTY_PHARMA: R² XGBoost = -0.0614 | R² MLP = -33313.4867
NIFTY_AUTO: R² XGBoost = -0.0235 | R² MLP = -47356.9823
NIFTY_METAL: R² XGBoost = -0.0165 | R² MLP = -25194.9149

--- Regime 2 ---
NIFTY_IT: R² XGBoost = -0.2130 | R² MLP = -24292.0259
NIFTY_BANK: R² XGBoost = -0.1208 | R² MLP = -15868.5541
NIFTY_FMCG: R² XGBoost = -0.0879 | R² MLP = -41167.9286
NIFTY_PHARMA: R² XGBoost = -0.0714 | R² MLP = -32167.7371
NIFTY_AUTO: R² XGBoost = -0.0970 | R² ML