# BTC 5-Min Data: Hourly Realized Standard Deviation of Log Returns

This section loads 5-minute BTC price data and computes the realized standard deviation (std) of intra-hour log returns.

Definition used:
- Log return r_t = ln(P_t / P_{t-1}) for each 5-minute bar.
- For each hour H, realized variance = sum_{t in H} (r_t - mean_H)^2 / (n_H - 1) (i.e. sample variance) and realized std = sqrt(variance).

Optionally we can exclude the first 5-minute return of each hour (as it spans the boundary from the prior hour). Both versions are produced.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Config
data_file_5min = Path('/Users/kate/projects/polymarket/data/btc_5min_data.json')
assert data_file_5min.exists(), f"Data file not found: {data_file_5min}"

# Load 5-min BTC data (expects JSON lines with timestamp & price)
df = pd.read_json(data_file_5min, lines=True)
# Ensure timestamp is datetime
if not np.issubdtype(df['timestamp'].dtype, np.datetime64):
    df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort just in case
df = df.sort_values('timestamp').reset_index(drop=True)

# Compute log returns on consecutive 5-min bars
df['prev_price'] = df['price'].shift(1)
df['log_return'] = np.log(df['price'] / df['prev_price'])

# Derive hour bucket (UTC)
df['hour'] = df['timestamp'].dt.floor('H')

# Helper: compute realized std within each hour including all intra-hour returns
# Note: The first return in an hour uses prev_price from prior hour boundary.
# We also create a variant excluding that boundary-spanning return.

def hourly_stats(group: pd.DataFrame):
    # All returns in this hour (drop NaN from first overall row)
    r_all = group['log_return'].dropna()
    n_all = len(r_all)
    std_all = r_all.std(ddof=1) if n_all > 1 else np.nan

    # Exclude the first row of the hour (boundary) for alt measure
    if n_all > 1:
        # Identify the first non-null return's index within group
        first_idx = r_all.index[0]
        r_ex = r_all.drop(first_idx)
    else:
        r_ex = pd.Series(dtype=float)
    n_ex = len(r_ex)
    std_ex = r_ex.std(ddof=1) if n_ex > 1 else np.nan

    return pd.Series({
        'n_returns_all': n_all,
        'realized_std_all': std_all,
        'n_returns_ex_boundary': n_ex,
        'realized_std_ex_boundary': std_ex,
        'mean_log_return_all': r_all.mean() if n_all else np.nan,
        'mean_log_return_ex_boundary': r_ex.mean() if n_ex else np.nan,
    })

hourly_realized = df.groupby('hour', as_index=False).apply(hourly_stats)

# Optional: annualize / scale variants (not required yet)
# realized_vol_per_hour = hourly_realized['realized_std_all']
# If needed to convert to per-day (assuming 12 five-minute intervals per hour => 12 returns) etc.

print("Computed hourly realized standard deviations (first 5 rows):")
hourly_realized.head()

TypeError: Cannot interpret 'datetime64[ns, UTC]' as a data type

In [None]:
# Descriptive statistics of realized hourly std
summary = hourly_realized[['realized_std_all','realized_std_ex_boundary']].describe().T
print(summary)
summary

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=hourly_realized['hour'], y=hourly_realized['realized_std_all'],
                         mode='lines', name='Realized Std (All Returns)', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=hourly_realized['hour'], y=hourly_realized['realized_std_ex_boundary'],
                         mode='lines', name='Realized Std (Ex Boundary)', line=dict(color='orange')))
fig.update_layout(title='BTC Hourly Realized Std of 5-Min Log Returns',
                  xaxis_title='Hour (UTC)', yaxis_title='Realized Std',
                  hovermode='x unified', height=450)
fig.show(renderer='browser')

In [None]:
# Aggregate realized std by clock hour of day (0-23)
hourly_realized['clock_hour'] = hourly_realized['hour'].dt.hour
clock_hour_stats = hourly_realized.groupby('clock_hour').agg(
    mean_realized_std_all=('realized_std_all','mean'),
    mean_realized_std_ex_boundary=('realized_std_ex_boundary','mean'),
    count_hours=('hour','count')
).reset_index()

clock_hour_stats

In [None]:
# Bar chart of average realized std by clock hour
fig = go.Figure()
fig.add_bar(x=clock_hour_stats['clock_hour'], y=clock_hour_stats['mean_realized_std_all'], name='All Returns')
fig.add_bar(x=clock_hour_stats['clock_hour'], y=clock_hour_stats['mean_realized_std_ex_boundary'], name='Ex Boundary')
fig.update_layout(barmode='group', title='Average Hourly Realized Std by Clock Hour (UTC)',
                  xaxis_title='Clock Hour (UTC)', yaxis_title='Average Realized Std')
fig.show(renderer='browser')