# Hidden Markov model


In [None]:
import datetime as dt

import polars as pl

from homelab_pipelines.utils.paths import Paths

In [None]:
class Config:
    """Tunable parameters for the HMM feature pipeline in this notebook."""

    rv_rolling_rows: int = 8
    """Rolling window length, in rows (bars), used to compute $RV_t$."""

    volsurp_window_days: int = 30
    """Look-back, in days, of the time-of-day volume mean behind $VolSurp_t$."""

    hmm_days_train: int = 7 * 26
    """Days of most recent history kept for the HMM (26 weeks, about 6 months)."""


## Loading data


In [None]:
# Load the BTCUSDT price data from the parquet file at <repo_root>/data/BTCUSDT.parquet.
prices = pl.read_parquet(Paths.repo_root / "data" / "BTCUSDT.parquet")
# Bare expression as the last line of the cell so the notebook displays the frame.
prices

In [None]:
# Display summary statistics of the loaded frame.
prices.describe()

## Preprocessing data

For the HMM, we generate certain features:

- $RV_t$: realized volatility of the last $m$ bars
- $|r_t|$, where $r_t$ is the intraday log-return. For assets that do not trade 24 hours a day, the first observation of each day should be omitted, since its return would span the overnight gap rather than a single bar. Crypto assets trade around the clock, so nothing needs to be omitted here.
- $\text{VolSurp}_t$: volume surprise relative to the time-of-day mean
- $\text{Range}_t$: intraday high-low range


In [None]:
# --- Per-bar expressions ------------------------------------------------------
# r_t = ln(close_t) - ln(close_{t-1}); the first row has no previous close.
log_return_expr = pl.col("close").log() - pl.col("close").shift(1).log()
# Range_t = ln(high_t) - ln(low_t)
log_range_expr = pl.col("high").log() - pl.col("low").log()
# RV_t = sqrt(sum of squared log-returns over the last `rv_rolling_rows` rows)
realized_vol_expr = (
    pl.col("log_returns").pow(2).rolling_sum(Config.rv_rolling_rows).sqrt()
)

# Sort chronologically first: the shift and rolling operations depend on row order.
X = (
    prices.sort("start_time_utc")
    .with_columns(log_returns=log_return_expr, log_range=log_range_expr)
    .with_columns(realized_volatility=realized_vol_expr)
)

# --- Time-of-day volume baseline ----------------------------------------------
# For every bar, the mean *volume* of the bars sharing its time of day within the
# trailing `volsurp_window_days` days.
# NOTE(review): with polars' default window closure the current bar appears to be
# part of its own baseline - confirm this is intended.
volume_baseline = (
    X.with_columns(pl.col("start_time_utc").dt.time().alias("time_of_day"))
    .rolling(
        index_column="start_time_utc",
        period=f"{Config.volsurp_window_days}d",
        group_by="time_of_day",
    )
    .agg(pl.col("volume").mean().alias("volume_mean_time_of_day_rolling"))
    .drop("time_of_day")
)

# The baseline of the first `volsurp_window_days` days rests on an incomplete
# window, so those days are dropped after attaching it.
warmup = dt.timedelta(days=Config.volsurp_window_days)

X = (
    X.join(volume_baseline, on="start_time_utc", how="inner")
    .filter(pl.col("start_time_utc") >= pl.col("start_time_utc").min() + warmup)
    # VolSurp_t = ln(volume_t) - ln(time-of-day mean volume)
    .with_columns(
        volsurp=pl.col("volume").log()
        - pl.col("volume_mean_time_of_day_rolling").log()
    )
)

X

In [None]:
# Keep only the most recent `hmm_days_train` days, measured back from the last bar.
hmm_lookback = dt.timedelta(days=Config.hmm_days_train)

# Timestamp plus the feature columns carried into the HMM frame.
# NOTE(review): the feature list in the markdown above mentions |r_t|, while the
# signed `log_returns` column is selected here - confirm which one is intended.
hmm_columns = [
    "start_time_utc",
    "log_returns",
    "log_range",
    "realized_volatility",
    "volsurp",
]

X_hmm = X.filter(
    pl.col("start_time_utc") >= pl.col("start_time_utc").max() - hmm_lookback
).select(hmm_columns)

X_hmm

## Fitting an HMM model
