# Notebook 02 — Feature Engineering & ML Dataset Construction

This notebook transforms the raw simulation data into a clean,
machine-learning-ready dataset for quantitative modeling.

We engineer:

- Attention returns (1-step, 5-step)
- Forward labels for classification and regression
- Rolling statistics (volatility, mean)
- Lagged features (microstructure-style)
- Demand/liquidity/imbalance transformations
- Regime-based variables
- Normalized versions of key variables
- A final cleaned ML dataset

This dataset will be used in Notebook 03 for:
- Regime classification
- Short-term attention forecasting
- Feature importance analysis
- Model evaluation


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Location of the raw simulation output, relative to the notebook's directory.
data_path = Path("..").joinpath("data", "attention_simulation.csv")

# Load the whole CSV into memory; every later cell builds on this frame.
df = pd.read_csv(data_path)

# Quick visual check of the first rows.
df.head()


In [None]:
# Work on a separate copy so the column assignments below act on a fresh frame.
df = df.copy()

level = df["attention_level"]

# Backward-looking changes: current level minus the level 1 / 5 rows earlier.
df["attn_return_1"] = level.diff(periods=1)
df["attn_return_5"] = level.diff(periods=5)

# Forward-looking changes (prediction targets): level 1 / 5 rows ahead minus
# the current level. The last 1 / 5 rows have no future value and stay NaN.
level_next_1 = level.shift(-1)
level_next_5 = level.shift(-5)
df["fwd_return_1"] = level_next_1 - level
df["fwd_return_5"] = level_next_5 - level

# Binary classification targets: 1 when the forward change is strictly
# positive, else 0. A NaN forward return compares as False and so becomes 0.
df["label_up_1"] = df["fwd_return_1"].gt(0).astype(int)
df["label_up_5"] = df["fwd_return_5"].gt(0).astype(int)

df.head(10)


In [None]:
# Trailing window length (in rows) shared by all rolling statistics below.
window = 20

# One rolling view over the level is reused for both its mean and its std.
level_rolling = df["attention_level"].rolling(window)
step_return = df["attn_return_1"]

df["roll_mean_20"] = level_rolling.mean()
df["roll_std_20"] = level_rolling.std()

# Dispersion and average magnitude of the 1-step changes over the same window.
df["roll_vol_20"] = step_return.rolling(window).std()
df["roll_absret_20"] = step_return.abs().rolling(window).mean()

# Early rows are NaN until a full window of observations is available.
df[["attention_level", "roll_mean_20", "roll_vol_20"]].head(15)


In [None]:
# Small offset added to liquidity before dividing, so an exact zero in
# attention_liquidity does not produce a division by zero.
liquidity_offset = df["attention_liquidity"] + 1e-6

# Magnitude of the imbalance, ignoring its sign.
df["imbalance_abs"] = df["attention_imbalance"].abs()

# Inverse liquidity and demand per unit of liquidity.
df["liquidity_inv"] = 1 / liquidity_offset
df["demand_liquidity_ratio"] = df["attention_demand"] / liquidity_offset

# Row-over-row first differences of the raw state variables.
df["delta_imbalance"] = df["attention_imbalance"].diff(periods=1)
df["delta_volatility"] = df["volatility"].diff(periods=1)
df["delta_demand"] = df["attention_demand"].diff(periods=1)
df["delta_liquidity"] = df["attention_liquidity"].diff(periods=1)

df.head()


In [None]:
# Row offsets at which past values are exposed as features.
lags = [1, 2, 3, 5, 10]

# Bind the source series once; each is shifted by every lag below.
level_src = df["attention_level"]
return_src = df["attn_return_1"]
imbalance_src = df["attention_imbalance"]
volatility_src = df["volatility"]

for lag in lags:
    # shift(lag) moves values down by `lag` rows, so row t holds the value
    # from row t - lag; the first `lag` rows become NaN.
    df[f"attn_lag_{lag}"] = level_src.shift(lag)
    df[f"ret_lag_{lag}"] = return_src.shift(lag)
    df[f"imbalance_lag_{lag}"] = imbalance_src.shift(lag)
    df[f"vol_lag_{lag}"] = volatility_src.shift(lag)


In [None]:
# Integer encoding of the categorical regime label.
regime_map = {
    "engaged": 0,
    "fatigued": 1,
    "overstimulated": 2,
    "addictive_loop": 3,
    "disengaged": 4,
    "baseline": 5,
}

# Series.map() turns every label that is missing from regime_map into NaN,
# and the subsequent df.dropna() would then discard those rows without any
# notice. Fail loudly instead so a new or misspelled regime name is noticed.
# Rows whose regime is already missing (NaN) are not treated as an error here.
unknown_regimes = sorted(
    map(str, set(df["regime"].dropna().unique()) - set(regime_map))
)
if unknown_regimes:
    raise ValueError(
        f"Unmapped regime labels in 'regime' column: {unknown_regimes}"
    )

df["regime_id"] = df["regime"].map(regime_map)


In [None]:
# Keep only rows with no missing value in any column. This removes the
# leading rows lost to rolling/lag/diff warm-up and the trailing rows that
# have no forward return.
complete_rows = df.dropna(axis=0, how="any")

# Renumber the surviving rows 0..n-1 and discard the old index.
df_clean = complete_rows.reset_index(drop=True)
df_clean.head()


In [None]:
# Columns taken straight from the simulation output.
raw_feature_cols = [
    "attention_level", "boredom", "fatigue", "volatility",
    "attention_imbalance", "attention_liquidity", "attention_demand",
]

# Returns and rolling statistics derived from the attention level.
engineered_feature_cols = [
    "attn_return_1", "attn_return_5",
    "roll_mean_20", "roll_std_20", "roll_vol_20", "roll_absret_20",
]

# Imbalance / liquidity / demand transformations and their first differences.
microstructure_feature_cols = [
    "imbalance_abs", "liquidity_inv", "demand_liquidity_ratio",
    "delta_imbalance", "delta_volatility", "delta_demand", "delta_liquidity",
]

# Lagged copies, grouped per lag in the order they were created.
lagged_feature_cols = [
    name
    for lag in lags
    for name in (
        f"attn_lag_{lag}", f"ret_lag_{lag}",
        f"imbalance_lag_{lag}", f"vol_lag_{lag}",
    )
]

feature_cols = (
    raw_feature_cols
    + engineered_feature_cols
    + microstructure_feature_cols
    + lagged_feature_cols
)

# Targets: binary direction labels, forward returns, and the encoded regime.
label_cols = ["label_up_1", "label_up_5", "fwd_return_1", "fwd_return_5", "regime_id"]

# Final modelling table: features first, targets last.
selected_cols = feature_cols + label_cols
ml_df = df_clean[selected_cols]

ml_df.head()


In [None]:
# Destination for the modelling table, next to the raw simulation CSV.
output_path = Path("..").joinpath("data", "ml_dataset.csv")

# Write without the row index so the file contains only the selected columns.
ml_df.to_csv(path_or_buf=output_path, index=False)

print("Saved ML dataset to:", output_path)

# (rows, columns) of the saved table.
ml_df.shape


In [None]:
# Heading / column pairs for the class-balance report, printed in order.
balance_reports = (
    ("\nLabel balance (1-step up/down):", "label_up_1"),
    ("\nLabel balance (5-step up/down):", "label_up_5"),
    ("\nRegime distribution:", "regime_id"),
)

for heading, column in balance_reports:
    print(heading)
    # Count of rows per distinct value, most frequent first.
    print(ml_df[column].value_counts())
