In [None]:
!uv pip install pyarrow kagglehub numpy pandas scipy scikit-learn pykalman matplotlib seaborn statsmodels

[2mAudited [1m9 packages[0m [2min 7ms[0m[0m


In [25]:
import kagglehub
from pathlib import Path
import shutil

# Pull the dataset into the local kagglehub cache; the call hands back the
# path of the folder holding the downloaded files.
path = kagglehub.dataset_download("iamspace/precog-quant-task-2026")

# Mirror every top-level entry of that folder into the notebook's working
# directory, replacing any same-named entry that is already there.
src = Path(path)
dest = Path.cwd()
for item in src.iterdir():
    target = dest / item.name
    # Directories are removed/copied as whole trees, files one at a time.
    if item.is_dir():
        remove_existing, copy_item = shutil.rmtree, shutil.copytree
    else:
        remove_existing, copy_item = Path.unlink, shutil.copy2
    if target.exists():
        remove_existing(target)
    copy_item(item, target)

In [26]:
import pandas as pd
import numpy as np
import glob
import os
from pykalman import KalmanFilter

def rogers_satchell(df, window=20):
    """Rolling Rogers-Satchell volatility estimate from OHLC prices.

    For every row the estimator term is
    ``ln(H/O) * ln(H/C) + ln(L/O) * ln(L/C)``; the function returns the
    square root of that term's rolling mean over ``window`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'Open', 'High', 'Low' and 'Close' columns.
    window : int, default 20
        Number of rows in the rolling window.

    Returns
    -------
    pandas.Series
        Rolling volatility, indexed like ``df``; rows that do not yet have a
        full window are NaN.
    """
    open_, high, low, close = df['Open'], df['High'], df['Low'], df['Close']

    upper_leg = np.log(high / open_) * np.log(high / close)
    lower_leg = np.log(low / open_) * np.log(low / close)
    per_bar = upper_leg + lower_leg

    rolling_mean = per_bar.rolling(window).mean()
    return np.sqrt(rolling_mean)

def kalman_series(col, Q=0.01, R=1.0, data=None):
    """Kalman-filter one column of a DataFrame and return the smoothed array.

    Parameters
    ----------
    col : str
        Name of the column to filter.
    Q : float, default 0.01
        Process-noise variance forwarded to ``kalman_1d``.
    R : float, default 1.0
        Measurement-noise variance forwarded to ``kalman_1d``.
    data : pandas.DataFrame, optional
        Frame to read ``col`` from. When omitted, the notebook-level ``df`` is
        used, which must already be defined (otherwise a NameError is raised).
        Passing the frame explicitly avoids depending on cell execution order.

    Returns
    -------
    numpy.ndarray
        Output of ``kalman_1d`` for the column, with NaNs replaced by 0 before
        filtering.
    """
    frame = df if data is None else data
    filled = frame[col].fillna(0).values
    return kalman_1d(filled, Q=Q, R=R)

def kalman_1d(z, Q=0.01, R=1.0):
    """Smooth a 1-D sequence with a scalar Kalman filter.

    The prediction for each step is simply the previous estimate; ``Q`` is
    added to the error variance on every step and ``R`` is the measurement
    variance used when computing the gain. The filter starts from the first
    observation with an error variance of 1.0.

    Parameters
    ----------
    z : pandas.Series or array-like
        Observations. A Series has its NaNs replaced by 0 before filtering;
        any other input is used as given, so a NaN in it propagates to all
        later estimates.
    Q : float, default 0.01
        Process-noise variance.
    R : float, default 1.0
        Measurement-noise variance.

    Returns
    -------
    numpy.ndarray
        Filtered estimates, one per observation. Integer or other non-float
        input is promoted to float64 so estimates are not truncated; an empty
        input yields an empty array.
    """
    # Convert to numpy array if it's a pandas Series
    if isinstance(z, pd.Series):
        z = z.fillna(0).values

    # The estimates are fractional even for integer observations, so the
    # output buffer must not inherit an integer (or object) dtype from z.
    z = np.asarray(z)
    if not np.issubdtype(z.dtype, np.inexact):
        z = z.astype(float)

    x_hat = np.zeros_like(z)
    if len(z) == 0:
        # Nothing to filter; avoid indexing into an empty array below.
        return x_hat

    P = 1.0
    x_hat[0] = z[0]

    for k in range(1, len(z)):
        # Predict: state carries over unchanged, uncertainty grows by Q.
        x_pred = x_hat[k-1]
        P_pred = P + Q

        # Update: blend prediction and observation using the Kalman gain.
        K = P_pred / (P_pred + R)
        x_hat[k] = x_pred + K * (z[k] - x_pred)
        P = (1 - K) * P_pred

    return x_hat

def compute_rsi(series, window=14):
    """Relative Strength Index built from simple rolling averages.

    Parameters
    ----------
    series : pandas.Series
        Values whose row-to-row changes are measured.
    window : int, default 14
        Number of rows averaged for the mean gain and mean loss.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + mean_gain / mean_loss)``; rows without a full
        window of changes are NaN.
    """
    change = series.diff()

    # Split each move into its upward and downward magnitude.
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)

    mean_up = ups.rolling(window).mean()
    mean_down = downs.rolling(window).mean()

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

def compute_features(group):
    """Append momentum, RSI, volatility and volume features to one asset's rows.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for a single asset with 'Open', 'High', 'Low', 'Close' and
        'Volume' columns. The diff/rolling features follow row order, so the
        rows are expected to be in chronological order.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with these columns added: 'ret_14d', 'RSI_14',
        'RSI_kalman', 'ret_14d_kalman', 'RS_vol', 'RS_vol_kalman',
        'RSI_slope', 'RSI_accel', 'risk_adj_mom', 'vol_z_14'. Every input
        column (including 'Asset' when present) is carried through unchanged,
        and the input frame itself is not modified.
    """
    # Work on a copy: the frame handed over by groupby.apply must not be
    # mutated, and callers keep their original data untouched.
    group = group.copy()

    # 14-row return and RSI on the close price.
    group['ret_14d'] = group['Close'].pct_change(14)
    group['RSI_14'] = compute_rsi(group['Close'], 14)

    # Kalman-smoothed versions. kalman_1d replaces a Series' NaNs with 0, so
    # the warm-up rows enter the filter as zero observations.
    group['RSI_kalman'] = kalman_1d(group['RSI_14'])
    group['ret_14d_kalman'] = kalman_1d(group['ret_14d'])

    # Rogers-Satchell volatility over 14 rows, raw and smoothed.
    group['RS_vol'] = rogers_satchell(group, window=14)
    group['RS_vol_kalman'] = kalman_1d(group['RS_vol'])

    # First and second differences of the smoothed RSI.
    group['RSI_slope'] = group['RSI_kalman'].diff()
    group['RSI_accel'] = group['RSI_slope'].diff()

    # Smoothed return per unit of smoothed volatility.
    group['risk_adj_mom'] = group['ret_14d_kalman'] / group['RS_vol_kalman']

    # Volume z-score against its own 14-row rolling mean and std.
    volume_window = group['Volume'].rolling(14)
    group['vol_z_14'] = (
        group['Volume'] - volume_window.mean()
    ) / volume_window.std()

    return group

In [27]:
files = glob.glob("anonymized_data/*.csv")

# Load every per-asset CSV, parse its dates and tag the rows with the asset
# name taken from the file name.
all_dfs = []
for f in files:
    asset_name = os.path.basename(f).replace(".csv", "")
    df_temp = pd.read_csv(f).assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Asset=asset_name,
    )
    all_dfs.append(df_temp)

# One long table, ordered by asset and then chronologically within each asset.
df = (
    pd.concat(all_dfs, ignore_index=True)
    .sort_values(['Asset', 'Date'])
    .reset_index(drop=True)
)

# Keep the asset labels aside: depending on the pandas version the grouped
# apply below may not hand the grouping column back.
asset_col = df['Asset'].copy()

# Per-asset feature computation; group_keys=False keeps the original row index.
df = df.groupby('Asset', group_keys=False).apply(compute_features)

# Put the asset labels back (aligned on the preserved row index).
df['Asset'] = asset_col

# Persist the full feature table.
df.to_parquet("features_all_assets.parquet")

In [28]:
# Preview the first 50 rows. A bare last expression lets the notebook render
# the frame with its rich table display instead of wrapped plain text.
df.head(50)

         Date       Open       High        Low      Close     Volume  \
0  2016-01-25  29.178415  29.181290  28.514486  28.580592  249449990   
1  2016-01-26  28.721415  28.994458  28.186822  28.738659  361581962   
2  2016-01-27  27.603374  27.772948  26.827351  26.850345  642328247   
3  2016-01-28  26.956690  27.166502  26.554308  27.042913  268157355   
4  2016-01-29  27.244108  27.977016  27.117645  27.977016  310239413   
5  2016-02-01  27.726962  27.795941  27.419428  27.715465  197189966   
6  2016-02-02  27.425174  27.603372  27.097520  27.155005  179917813   
7  2016-02-03  27.304454  27.833297  27.040032  27.692464  221370883   
8  2016-02-04  27.701143  28.125937  27.507531  27.914984  223814596   
9  2016-02-05  27.891858  28.007448  27.074060  27.169420  223556451   
10 2016-02-08  26.912225  27.654890  26.886218  27.455499  260175071   
11 2016-02-09  27.247455  27.724264  27.143424  27.449737  213505631   
12 2016-02-10  27.718473  27.842732  27.192539  27.241664  20393