<a href="https://colab.research.google.com/github/vivek-varma/Volatality_Prediction_ML/blob/main/Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [9]:
import pandas as pd
import numpy as np
import os


In [2]:
SAVE_DIR = "/content/drive/MyDrive/Regime_pred/Data"
ES_PATH  = f"{SAVE_DIR}/ES_1min_MASTER_continuous_RTH.csv"

In [4]:
df = pd.read_csv(ES_PATH, parse_dates=['timestamp'])

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

In [7]:
df['date_ct'] = df['timestamp'].dt.tz_convert("America/Chicago").dt.date

In [8]:
out = f"{SAVE_DIR}/ES_1min_MASTER_continuous_RTH_tz.csv"
df.to_csv(out, index=False)
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,symbol,date_ct
0,2018-01-02 14:30:00+00:00,2687.0,2687.25,2684.75,2685.25,24815,ESH8,2018-01-02
1,2018-01-02 14:31:00+00:00,2685.5,2685.5,2684.0,2684.0,11772,ESH8,2018-01-02
2,2018-01-02 14:32:00+00:00,2684.0,2685.75,2684.0,2685.0,6174,ESH8,2018-01-02
3,2018-01-02 14:33:00+00:00,2685.0,2685.0,2683.25,2683.5,6756,ESH8,2018-01-02
4,2018-01-02 14:34:00+00:00,2683.5,2684.0,2682.5,2682.75,6552,ESH8,2018-01-02


In [11]:
BASE = "/content/drive/MyDrive/Regime_pred/Data"
ES_MIN_PATH   = f"{BASE}/ES_1min_MASTER_continuous_RTH.csv"  # or your full-session file
VIX_STACK_PATH= f"{BASE}/VIX_term_stack.csv"                  # cols: date,VIX,VIX3M,VIX6M
VVIX_PATH     = f"{BASE}/VVIX_daily.csv"

OUT_FEATURES  = f"{BASE}/REGIME_FEATURES_DAILY.csv"
OUT_LABELED   = f"{BASE}/REGIME_FEATURES_DAILY_LABELED.csv"

# ---------- A) ES → daily realized-variance features ----------
es = pd.read_csv(ES_MIN_PATH, parse_dates=["timestamp"])
es = es.sort_values("timestamp").copy()

# Ensure tz-aware UTC then compute session date in Chicago time
es["timestamp"] = pd.to_datetime(es["timestamp"], utc=True)
es["date"] = es["timestamp"].dt.tz_convert("America/Chicago").dt.date

# 1-min log returns
es["ret"] = np.log(es["close"]).diff()

# Sanity filter: drop days with too few 1-min bars (RTH ~390; accept >= 300)
g = es.groupby("date")["ret"]
daily = pd.DataFrame({
    "RV":      g.apply(lambda s: np.nansum(np.square(s))),   # realized variance
    "ret_std": g.std(),
    "nobs":    g.size()
}).reset_index()
daily["date"] = pd.to_datetime(daily["date"])
daily = daily[daily["nobs"] >= 300].copy()

# Intraday distribution features
def _safe_skew(x):
    x = pd.Series(x).dropna()
    return x.skew() if len(x) > 5 else np.nan

def _safe_kurt(x):
    x = pd.Series(x).dropna()
    return x.kurt() if len(x) > 5 else np.nan

mom = es.groupby("date")["ret"].apply(lambda s: pd.Series({
    "rskew": _safe_skew(s), "rkurt": _safe_kurt(s)
})).reset_index()
mom["date"] = pd.to_datetime(mom["date"])
rv = daily.merge(mom, on="date", how="left").sort_values("date").reset_index(drop=True)

# Rolling realized-variance stats (use only past info)
rv["RV_5"]   = rv["RV"].rolling(5,  min_periods=3).mean()
rv["RV_10"]  = rv["RV"].rolling(10, min_periods=5).mean()
rv["RV_21"]  = rv["RV"].rolling(21, min_periods=10).mean()
rv["VOV_21"] = rv["RV"].rolling(21, min_periods=10).std()
rv["RV_chg_1"] = rv["RV"].pct_change(1)
rv["RV_chg_5"] = rv["RV"].pct_change(5)

# ---------- B) Implied-vol stack (VIX/VIX3M/VIX6M + VVIX) ----------
vixs = pd.read_csv(VIX_STACK_PATH, parse_dates=["date"]).sort_values("date")
vixs = vixs.rename(columns={"VIX":"vix","VIX3M":"vix3m","VIX6M":"vix6m"})

vvix = pd.DataFrame(columns=["date","vvix"])
if os.path.exists(VVIX_PATH):
    tmp = pd.read_csv(VVIX_PATH, parse_dates=["date"])
    # be forgiving on column name casing
    tmp.columns = [c.lower() for c in tmp.columns]
    vvix = tmp.rename(columns={"vvix":"vvix"})

# Merge stack
feat = rv.merge(vixs, on="date", how="inner")
if len(vvix):
    feat = feat.merge(vvix, on="date", how="left")

# ---------- C) Term structure & VRP features ----------
# Proxies for VX term structure
feat["S_short"] = (feat["vix3m"] - feat["vix"])   / feat["vix"]      # 1m→3m slope
feat["S_long"]  = (feat["vix6m"] - feat["vix3m"]) / feat["vix3m"]    # 3m→6m slope
feat["CURV"]    = (feat["vix6m"] - 2*feat["vix3m"] + feat["vix"]) / feat["vix"]

# Variance Risk Premium (21d realized variance vs. 30d IV proxy)
feat["VRP_21"] = (feat["vix"] ** 2) - feat["RV_21"]

# Helpful level ratios / short rolls
feat["vix_to_vix3m"] = feat["vix"] / feat["vix3m"]
feat["vix_to_vix6m"] = feat["vix"] / feat["vix6m"]
if "vvix" in feat.columns:
    feat["vvix_roll3"] = feat["vvix"].rolling(3).mean()

# Final clean: drop rows missing key inputs
feat = feat.dropna(subset=["RV","RV_21","vix","vix3m","vix6m"]).reset_index(drop=True)
feat.to_csv(OUT_FEATURES, index=False)
print(f"✅ Saved features → {OUT_FEATURES} | rows: {len(feat)}")

# ---------- D) Create next-day regime labels (LOW/MID/HIGH by terciles) ----------
df = feat.copy()
df["RV_t1"] = df["RV"].shift(-1)       # predict t+1 from info up to t

# Use in-sample terciles; you can later fix thresholds from a train period to avoid drift
q_low, q_high = df["RV_t1"].quantile([0.33, 0.66])

def _bucket(rv):
    if pd.isna(rv): return np.nan
    if rv <= q_low:  return 0  # LOW
    if rv <= q_high: return 1  # MID
    return 2                   # HIGH

df["regime_y"] = df["RV_t1"].apply(_bucket).astype("Int64")
df = df.dropna(subset=["regime_y"]).reset_index(drop=True)

# (Optional) remove last day (no t+1 realized)
# already handled by dropna above

df.to_csv(OUT_LABELED, index=False)
print(f"✅ Saved labeled dataset → {OUT_LABELED} | rows: {len(df)}")
print("Class counts:", df["regime_y"].value_counts().to_dict())

# Small preview
display(df.tail(3)[["date","RV","RV_21","vix","vix3m","vix6m","S_short","CURV","VRP_21","vvix" if "vvix" in df.columns else df.columns[0],"regime_y"]])


✅ Saved features → /content/drive/MyDrive/Regime_pred/Data/REGIME_FEATURES_DAILY.csv | rows: 3479
✅ Saved labeled dataset → /content/drive/MyDrive/Regime_pred/Data/REGIME_FEATURES_DAILY_LABELED.csv | rows: 3478
Class counts: {np.int64(2): 1182, np.int64(0): 1148, np.int64(1): 1148}


Unnamed: 0,date,RV,RV_21,vix,vix3m,vix6m,S_short,CURV,VRP_21,vvix,regime_y
3475,2024-12-30,0.000247,0.000346,17.4,18.940001,20.5,0.088506,0.001149,302.759641,103.05,2
3476,2024-12-30,0.000247,0.000357,17.4,18.940001,20.5,0.088506,0.001149,302.75963,103.05,1
3477,2024-12-31,8.6e-05,0.00036,17.35,18.98,20.549999,0.093948,-0.003458,301.022153,104.33,1
