<a href="https://colab.research.google.com/github/vivek-varma/Volatality_Prediction_ML/blob/main/Improved_LighGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, math, json, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, f1_score, classification_report, confusion_matrix, log_loss
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from google.colab import drive

In [2]:
Q_LOW  = 0.25
Q_HIGH = 0.75

# Optional HIGH recall booster at inference:
# If p_high >= HIGH_MIN_PROB, force class=HIGH (2), else use argmax. Set to None to disable.
HIGH_MIN_PROB = 0.40

drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
DATA_DIR = "/content/drive/MyDrive/Regime_pred/Data"
OUT_DIR  = "/content/drive/MyDrive/Regime_pred/Models/LGBM_relabel"
os.makedirs(OUT_DIR, exist_ok=True)

CSV_PATH = f"{DATA_DIR}/REGIME_FEATURES_DAILY_LABELED.csv"
SEED     = 1337

In [4]:
df = pd.read_csv(CSV_PATH, parse_dates=["date"]).sort_values("date").reset_index(drop=True)
assert "RV_t1" in df.columns, "RV_t1 not found."


In [5]:
n = len(df)
n_train = int(math.floor(n * 0.60))
n_val   = int(math.floor(n * 0.20))
idx_train = np.arange(0, n_train)
idx_val   = np.arange(n_train, n_train + n_val)
idx_test  = np.arange(n_train + n_val, n)

In [6]:
rv = df["RV_t1"].astype(float).values
q_low_val, q_high_val = np.quantile(rv[idx_train], [Q_LOW, Q_HIGH])

def bucket(rv_):
    if np.isnan(rv_): return np.nan
    if rv_ <= q_low_val:  return 0
    if rv_ <= q_high_val: return 1
    return 2

df["regime_y_new"] = [bucket(v) for v in rv]

print(f"Train-only cuts: q_low={q_low_val:.6g} (p={Q_LOW}), q_high={q_high_val:.6g} (p={Q_HIGH})")
print("Class counts (ALL, new labels):", pd.Series(df["regime_y_new"]).value_counts(dropna=False).to_dict())

Train-only cuts: q_low=2.79454e-05 (p=0.25), q_high=0.000135278 (p=0.75)
Class counts (ALL, new labels): {1: 1876, 2: 848, 0: 754}


In [7]:
drop_cols = ["date", "regime_y"]  # drop any old labels; keep RV_t1 only for features screening below
X = df.drop(columns=[c for c in drop_cols if c in df.columns]).copy()
y = df["regime_y_new"].astype("Int64").values

# convert any objects to category codes (don’t drop!)
obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
for c in obj_cols:
    X[c] = X[c].astype("category").cat.codes.replace(-1, np.nan)

# we do NOT use RV_t1 as input feature to avoid trivial leakage (it’s the target used to create labels)
if "RV_t1" in X.columns:
    X = X.drop(columns=["RV_t1"])

if "regime_y_new" in X.columns:
    X = X.drop(columns=["regime_y_new"])

In [8]:
print(X.head())

         RV   ret_std  nobs  level_1        ret      RV_5     RV_10     RV_21  \
0  0.000007  0.000138   391        0   3.583200  0.000015  0.000013  0.000013   
1  0.000016  0.000205   391        1   1.462809  0.000014  0.000014  0.000013   
2  0.000016  0.000205   391        0  16.556561  0.000014  0.000014  0.000014   
3  0.000029  0.000271   391        0  82.266350  0.000015  0.000016  0.000015   
4  0.000029  0.000271   391        1  -6.147298  0.000020  0.000018  0.000016   

     VOV_21  RV_chg_1  ...  vix3m  vix6m   vvix   S_short    S_long      CURV  \
0  0.000005  0.000000  ...  12.15  14.21  93.97  0.276260  0.169547 -0.059874   
1  0.000005  1.206724  ...  12.41  14.32  94.74  0.231151  0.153908 -0.041667   
2  0.000005  0.000000  ...  12.41  14.32  94.74  0.231151  0.153908 -0.041667   
3  0.000006  0.755095  ...  12.36  14.31  93.70  0.258656  0.157767 -0.060081   
4  0.000007  0.000000  ...  12.36  14.31  93.70  0.258656  0.157767 -0.060081   

       VRP_21  vix_to_vix3

In [9]:
X = X.fillna(method="ffill").fillna(method="bfill")

feature_names = X.columns.tolist()
X = X.values.astype(np.float32)

  X = X.fillna(method="ffill").fillna(method="bfill")


In [10]:
scaler = StandardScaler()
scaler.fit(X[idx_train])
Xz = scaler.transform(X).astype(np.float32)

X_train, y_train = Xz[idx_train], y[idx_train].astype(int)
X_val,   y_val   = Xz[idx_val],   y[idx_val].astype(int)
X_test,  y_test  = Xz[idx_test],  y[idx_test].astype(int)


In [11]:
classes = np.sort(np.unique(y_train))
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight_map = {int(c): float(w) for c, w in zip(classes, weights)}
w_train = np.array([class_weight_map[int(t)] for t in y_train], dtype=np.float32)

print("Class weights:", class_weight_map)

Class weights: {0: 1.3320561941251596, 1: 0.6673064619321817, 2: 1.3320561941251596}


In [12]:
lgbm = LGBMClassifier(
    objective="multiclass",
    num_class=3,
    n_estimators=1200,
    learning_rate=0.035,
    max_depth=5,
    num_leaves=48,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=2.0,
    random_state=SEED,
    n_jobs=-1,
    verbosity=-1,
)

lgbm.fit(
    X_train, y_train,
    sample_weight=w_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[early_stopping(stopping_rounds=200, verbose=False), log_evaluation(period=0)],
)

In [13]:
best_it = getattr(lgbm, "best_iteration_", None)

In [14]:
def predict_with_high_boost(model, Xmat, use_boost=True):
    proba = model.predict_proba(Xmat, num_iteration=best_it)
    pred  = proba.argmax(1)
    if use_boost and HIGH_MIN_PROB is not None:
        high_mask = proba[:, 2] >= HIGH_MIN_PROB
        pred[high_mask] = 2
    return pred, proba

def report_split(name, y_true, y_pred, proba):
    print(f"\n=== {name} ===")
    print("LogLoss:", round(log_loss(y_true, proba, labels=[0,1,2]), 4))
    print("Balanced Acc:", round(balanced_accuracy_score(y_true, y_pred), 4))
    print("Macro F1:", round(f1_score(y_true, y_pred, average='macro'), 4))
    print("\nReport:\n", classification_report(y_true, y_pred, digits=3, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred, labels=[0,1,2]))

In [15]:
y_val_pred,  y_val_proba  = predict_with_high_boost(lgbm, X_val,  use_boost=True)
y_test_pred, y_test_proba = predict_with_high_boost(lgbm, X_test, use_boost=True)

report_split("VALIDATION (boosted)", y_val, y_val_pred, y_val_proba)
report_split("TEST (boosted)",       y_test, y_test_pred, y_test_proba)

# Also show pure argmax (no boost) for comparison
y_val_pred_arg,  y_val_proba_arg  = predict_with_high_boost(lgbm, X_val,  use_boost=False)
y_test_pred_arg, y_test_proba_arg = predict_with_high_boost(lgbm, X_test, use_boost=False)
report_split("VALIDATION (argmax)", y_val, y_val_pred_arg, y_val_proba_arg)
report_split("TEST (argmax)",       y_test, y_test_pred_arg, y_test_proba_arg)




=== VALIDATION (boosted) ===
LogLoss: 0.3898
Balanced Acc: 0.8158
Macro F1: 0.751

Report:
               precision    recall  f1-score   support

           0      0.412     0.778     0.538        36
           1      0.861     0.834     0.847       385
           2      0.902     0.836     0.867       274

    accuracy                          0.832       695
   macro avg      0.725     0.816     0.751       695
weighted avg      0.853     0.832     0.839       695

Confusion matrix:
 [[ 28   8   0]
 [ 39 321  25]
 [  1  44 229]]

=== TEST (boosted) ===
LogLoss: 0.5037
Balanced Acc: 0.7363
Macro F1: 0.7333

Report:
               precision    recall  f1-score   support

           0      0.668     0.781     0.720       196
           1      0.869     0.813     0.840       449
           2      0.667     0.615     0.640        52

    accuracy                          0.789       697
   macro avg      0.735     0.736     0.733       697
weighted avg      0.797     0.789     0.791    



In [16]:
import joblib
joblib.dump(lgbm,  os.path.join(OUT_DIR, "lgbm_model_relabeled.joblib"))
joblib.dump(scaler, os.path.join(OUT_DIR, "lgbm_scaler_relabeled.joblib"))
with open(os.path.join(OUT_DIR, "label_cuts.json"), "w") as f:
    json.dump({"q_low": float(q_low_val), "q_high": float(q_high_val), "Q_LOW": Q_LOW, "Q_HIGH": Q_HIGH,
               "HIGH_MIN_PROB": HIGH_MIN_PROB}, f, indent=2)
with open(os.path.join(OUT_DIR, "features.json"), "w") as f:
    json.dump(feature_names, f, indent=2)

print(f"\n✅ Saved to {OUT_DIR}")


✅ Saved to /content/drive/MyDrive/Regime_pred/Models/LGBM_relabel
