In [17]:
# Install third-party packages into the active environment through uv's pip interface.
# NOTE(review): no versions are pinned here, so the resolved versions can differ
# between machines and over time — consider pinning for reproducibility.
!uv pip install xgboost pyarrow kagglehub numpy pandas scipy scikit-learn pykalman matplotlib seaborn pandas_ta

Audited 11 packages in 9ms


In [18]:

import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import joblib
import os

In [19]:


# ==========================================
# 1. Load Data
# ==========================================
# Reads the feature table from parquet into `df` and orders it by asset and date.
# Make sure to point this to your actual input file
input_path = "features_all_assets.parquet"

# Fail fast when the input is missing. Only printing a message would let the
# cell keep running and either crash with a NameError on `df` or, worse,
# silently reuse a stale `df` still living in the kernel from an earlier run.
if not os.path.exists(input_path):
    raise FileNotFoundError(f"Input file not found: {input_path!r}. Check path!")

df = pd.read_parquet(input_path)

# Ensure sorted for time-series operations (the per-asset shift/rolling steps
# that follow rely on rows being in date order within each asset).
df = df.sort_values(['Asset', 'Date']).reset_index(drop=True)

# ==========================================
# 2. Define Targets (14-Day Horizon)
# ==========================================
# Both targets are computed per asset from the 'Close' column and look
# LOOKAHEAD rows into the future, so the last LOOKAHEAD rows of every asset
# end up NaN and are removed by the dropna at the bottom of this section.
LOOKAHEAD = 14


def _forward_return(close):
    """Relative change from each row's close to the close LOOKAHEAD rows later."""
    return close.shift(-LOOKAHEAD) / close - 1


def _forward_volatility(close):
    """Standard deviation of the LOOKAHEAD one-row returns that follow each row."""
    step_returns = close.pct_change()
    window_std = step_returns.rolling(window=LOOKAHEAD).std()
    # The rolling window ends at the current row; shifting it back by
    # LOOKAHEAD turns it into a window over the next LOOKAHEAD returns.
    return window_std.shift(-LOOKAHEAD)


# Target A: Return (Magnitude)
df['target_return'] = df.groupby('Asset')['Close'].transform(_forward_return)

# Target B: Risk (Volatility)
df['target_risk'] = df.groupby('Asset')['Close'].transform(_forward_volatility)

# Features (Must match your data cleaning)
features = [
    'RSI_kalman',
    'RSI_slope',
    'RSI_accel',
    'ret_14d_kalman',
    'RS_vol_kalman',
    'vol_z_14',
    'risk_adj_mom',
]

# Keep only rows where both targets and every feature are present.
required_columns = ['target_return', 'target_risk'] + features
df_model = df.dropna(subset=required_columns).copy()

# ==========================================
# 3. Walk-Forward Training Loop
# ==========================================
# For every asset with enough rows, fit one return model and one risk model on
# expanding time-ordered training windows and keep only the out-of-sample
# predictions. The result is `predictions`: a list of per-asset frames with
# columns Date, Asset, pred_return, pred_risk.
print("Starting Walk-Forward Training...")
predictions = []

MIN_ROWS_PER_ASSET = 300  # assets with fewer usable rows are skipped
N_SPLITS = 5

# Robust Parameters (Prevent Overfitting)
# random_state makes the row/column subsampling repeatable between runs.
params = {
    'n_estimators': 500,
    'learning_rate': 0.01,
    'max_depth': 3,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:squarederror',
    'n_jobs': -1,
    'random_state': 42,
}

# Both targets are built from prices up to LOOKAHEAD rows in the future.
# Without a gap, the last LOOKAHEAD training rows of each fold carry labels
# computed from prices inside the test window, leaking future information
# into the fit. gap=LOOKAHEAD drops those rows from every training set.
tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=LOOKAHEAD)

for asset, group in df_model.groupby('Asset'):

    if len(group) < MIN_ROWS_PER_ASSET:
        continue

    X = group[features]
    y_ret = group['target_return']
    y_risk = group['target_risk']

    model_alpha = XGBRegressor(**params)
    model_risk = XGBRegressor(**params)

    # Arrays for OOS predictions; rows never used as a test fold (the first
    # training block) stay NaN and are dropped below.
    group_pred_ret = np.full(len(group), np.nan)
    group_pred_risk = np.full(len(group), np.nan)

    for train_idx, test_idx in tscv.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

        # Train & Predict Alpha
        model_alpha.fit(X_train, y_ret.iloc[train_idx])
        group_pred_ret[test_idx] = model_alpha.predict(X_test)

        # Train & Predict Risk
        model_risk.fit(X_train, y_risk.iloc[train_idx])
        group_pred_risk[test_idx] = model_risk.predict(X_test)

    # Build a fresh frame instead of assigning columns onto the groupby
    # slice, which would trigger pandas' SettingWithCopyWarning. The original
    # row index is preserved.
    oos_preds = group[['Date', 'Asset']].assign(
        pred_return=group_pred_ret,
        pred_risk=group_pred_risk,
    )

    # Save valid predictions
    predictions.append(oos_preds.dropna(subset=['pred_return', 'pred_risk']))

# ==========================================
# 4. Signal Engineering (Smoothing & Weighting)
# ==========================================
# Combines the per-asset prediction frames into `pred_df` and derives three
# columns: trade_score, trade_score_smooth and inv_vol_weight.
if not predictions:
    # pd.concat on an empty list raises a terse ValueError; say what happened.
    raise ValueError(
        "No predictions to combine: `predictions` is empty "
        "(were all assets skipped by the training loop?)"
    )

pred_df = pd.concat(predictions)
pred_df = pred_df.sort_values(['Asset', 'Date'])

# A. Raw Ratio
# The risk model is an unconstrained regressor, so pred_risk can come out
# negative or ~0. Flooring the denominator keeps the score's sign tied to
# pred_return and avoids division by (almost) zero. For pred_risk >= 0 this
# is numerically identical to dividing by pred_risk + 1e-6.
score_denominator = (pred_df['pred_risk'] + 1e-6).clip(lower=1e-6)
pred_df['trade_score'] = pred_df['pred_return'] / score_denominator

# B. SIGNAL SMOOTHING (Baked In)
# 3-row rolling mean within each asset; the first two rows of each asset are NaN.
print("Applying 3-Day Signal Smoothing...")
pred_df['trade_score_smooth'] = pred_df.groupby('Asset')['trade_score'].transform(
    lambda x: x.rolling(3).mean()
)

# C. INVERSE VOLATILITY WEIGHT (Baked In)
# Calculate a raw weight factor. The backtester will normalize this to sum to 1.0.
# We cap the risk denominator to avoid massive weights on zero-risk artifacts:
# the floor guarantees weights stay positive and bounded by 1 / 1e-4.
weight_denominator = (pred_df['pred_risk'] + 1e-4).clip(lower=1e-4)
pred_df['inv_vol_weight'] = 1.0 / weight_denominator

# ==========================================
# 5. Export
# ==========================================
# Writes the final signal table to both parquet and CSV.
# Drop rows where smoothing created NaNs (first 2 days)
final_df = pred_df.dropna(subset=['trade_score_smooth'])

output_parquet = "/models/alpha_risk_predictions.parquet"
output_csv = "/outputs/alpha_risk_predictions.csv"

# Create the destination folders if they are missing; otherwise the writers
# fail only after the whole training run has already completed.
for target_path in (output_parquet, output_csv):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

# The parquet writer is called with its default index handling, while the
# CSV is written with index=False (no index column).
final_df.to_parquet(output_parquet)
final_df.to_csv(output_csv, index=False)

print(f"Success! Processed {len(final_df)} rows.")
print(f"Files saved:\n- {output_parquet}\n- {output_csv}")

Starting Walk-Forward Training...


Starting Walk-Forward Training...


KeyboardInterrupt: 