In [None]:
# Imports and environment detection
import os
import importlib.util
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for cleaner notebook output.
# NOTE(review): this is a blanket filter, so deprecation/numerical warnings are hidden too.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# ============================================================================
# GLOBAL CONFIGURATION CONSTANTS
# ============================================================================
MISSING_DATA_THRESHOLD = 0.10  # Drop features with >10% missing values
# ============================================================================

# Environment probes: Kaggle is detected by the presence of /kaggle/input, and the
# optional libraries are detected without importing them (find_spec only locates them).
IN_KAGGLE = Path('/kaggle/input').exists()
LGBM_AVAILABLE = importlib.util.find_spec('lightgbm') is not None
TORCH_AVAILABLE = importlib.util.find_spec('torch') is not None

# Optional dependency: only imported when installed; later cells guard on LGBM_AVAILABLE.
if LGBM_AVAILABLE:
    import lightgbm as lgb

# Optional dependency: only imported when installed; later cells guard on TORCH_AVAILABLE.
if TORCH_AVAILABLE:
    import torch
    import torch.nn as nn
    from torch.utils.data import TensorDataset, DataLoader
    # Seed the CPU generator, and every CUDA device when one is available.
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

# Seed NumPy's legacy global generator with the same value used for torch above.
np.random.seed(42)
print(f'IN_KAGGLE={IN_KAGGLE}, LGBM_AVAILABLE={LGBM_AVAILABLE}, TORCH_AVAILABLE={TORCH_AVAILABLE}')
print(f'MISSING_DATA_THRESHOLD={MISSING_DATA_THRESHOLD:.0%}')

In [None]:
# Configuration and paths
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from a sub-folder of the project — confirm.
NOTEBOOK_DIR = Path.cwd()
PROJECT_DIR = NOTEBOOK_DIR.parent
# Recomputed here (same expression as in the imports cell) so this cell does not depend on that value.
IN_KAGGLE = Path('/kaggle/input').exists()

def resolve_kaggle_path(preferred: Path, default_filename: str) -> Path:
    """Resolve the location of a data file, searching /kaggle/input when on Kaggle.

    Resolution order (only applied when ``IN_KAGGLE`` is true; otherwise
    ``preferred`` is returned unchanged):

    1. The environment variable ``<FILENAME>_PATH`` (e.g. ``TRAIN_PATH`` for
       ``train.csv``) if it points at an existing path.
    2. The first file named ``default_filename`` anywhere under /kaggle/input.
    3. The first ``*.csv`` of any name under /kaggle/input (with a warning).
    4. ``preferred`` (with a warning).

    Matches are sorted before the first one is taken, so the choice is
    deterministic even when several datasets contain the same filename
    (``Path.glob`` itself yields entries in filesystem order).

    Args:
        preferred: Path to use off-Kaggle, and as the last resort on Kaggle.
        default_filename: File name to search for, e.g. ``'train.csv'``.

    Returns:
        The resolved path; it is not guaranteed to exist.
    """
    if not IN_KAGGLE:
        return preferred
    env_key = default_filename.upper().replace('.CSV', '') + '_PATH'
    env_val = os.environ.get(env_key)
    if env_val and Path(env_val).exists():
        return Path(env_val)
    input_root = Path('/kaggle/input')
    candidates = sorted(input_root.glob(f'**/{default_filename}'))
    if candidates:
        if len(candidates) > 1:
            # Surface the ambiguity instead of silently picking one of several datasets.
            print(f'Warning: {len(candidates)} files named {default_filename} found; using {candidates[0]}')
        return candidates[0]
    any_csv = sorted(input_root.glob('**/*.csv'))
    if any_csv:
        print(f'Warning: Could not find {default_filename}; using first CSV found: {any_csv[0]}')
        return any_csv[0]
    print(f'Warning: No CSV found under /kaggle/input; attempting preferred path: {preferred}')
    return preferred

# Output directories: under /kaggle/working on Kaggle, under the project root otherwise.
_OUTPUT_ROOT = Path('/kaggle/working') if IN_KAGGLE else PROJECT_DIR
ARTIFACTS_DIR = _OUTPUT_ROOT / 'artifacts'
RESULTS_DIR = _OUTPUT_ROOT / 'results'

# Single source of truth for paths and modelling settings used by the later cells.
CONFIG = dict(
    train_data_path=resolve_kaggle_path(PROJECT_DIR / 'data' / 'train.csv', 'train.csv'),
    test_data_path=resolve_kaggle_path(PROJECT_DIR / 'data' / 'test.csv', 'test.csv'),
    artifacts_dir=ARTIFACTS_DIR,
    results_dir=RESULTS_DIR,
    target_column='forward_returns',
    prediction_bounds=(0.0, 2.0),
    max_volatility_ratio=1.2,
    n_splits=5,
    missing_threshold=MISSING_DATA_THRESHOLD,  # mirrors the global constant
)

# Make sure both output folders exist before any cell tries to write into them.
for _dir_key in ('results_dir', 'artifacts_dir'):
    CONFIG[_dir_key].mkdir(parents=True, exist_ok=True)

# Placeholder only: the data loading cell replaces this with the inferred feature list.
FEATURE_COLS = []

In [None]:
# Utilities: volatility enforcement
import numpy as _np

def enforce_volatility(preds, target_std, max_ratio=1.2, clip_bounds=(0.0, 2.0)):
    """Shrink predictions toward their mean so their std does not exceed a cap.

    The cap is ``max_ratio * target_std``. Predictions are only ever shrunk
    (the scale factor is at most 1.0), and only when both the prediction std
    and ``target_std`` are strictly positive.

    Args:
        preds: Array-like of predictions; flattened to 1-D floats.
        target_std: Reference standard deviation.
        max_ratio: Maximum allowed ratio of prediction std to ``target_std``.
        clip_bounds: ``(low, high)`` applied after scaling, or ``None`` to skip clipping.

    Returns:
        A 1-D numpy float array.
    """
    values = _np.asarray(preds, dtype=float).reshape(-1)
    spread = float(values.std())
    reference = float(target_std)
    if spread > 0 and reference > 0:
        # The small epsilon keeps the division finite for vanishing spreads.
        shrink = min(1.0, (max_ratio * reference) / (spread + 1e-12))
        center = float(values.mean())
        values = center + shrink * (values - center)
    if clip_bounds is None:
        return values
    lower, upper = clip_bounds[0], clip_bounds[1]
    return _np.clip(values, lower, upper)

In [None]:
# Load and Prepare Data
train_df = pd.read_csv(CONFIG['train_data_path'])
print(f"Loaded training data: {train_df.shape} from {CONFIG['train_data_path']}")

# Fail fast with a readable message instead of a bare KeyError further down.
if CONFIG['target_column'] not in train_df.columns:
    raise KeyError(f"Target column '{CONFIG['target_column']}' not found in {CONFIG['train_data_path']}")

# Optional date range info (the column is parsed once and reused)
_date_col = next((c for c in ['date','timestamp','datetime','time','Date','DATE'] if c in train_df.columns), None)
if _date_col:
    _dates = pd.to_datetime(train_df[_date_col], errors='coerce')
    _dmn, _dmx = _dates.min(), _dates.max()
    if pd.notna(_dmn) and pd.notna(_dmx):
        print(f"Date range ({_date_col}): {_dmn} to {_dmx}")

# Try to load public test to ensure feature intersection (avoids train-only/leaky columns).
# This stays best-effort, but a failure is reported: when it fails, the leak guard is disabled.
_test_cols = None
try:
    if Path(CONFIG['test_data_path']).exists():
        _test_df_head = pd.read_csv(CONFIG['test_data_path'], nrows=5)
        _test_cols = set(_test_df_head.columns)
except Exception as _exc:
    print(f"Warning: could not read test columns from {CONFIG['test_data_path']} ({_exc}); "
          "features will NOT be restricted to columns present in test.")
    _test_cols = None

# Infer features by missingness (drop columns with > missing_threshold) and by presence in test.
# The fallback is the global constant so both code paths agree on the threshold.
missing_thr = CONFIG.get('missing_threshold', MISSING_DATA_THRESHOLD)
# Candidate features: numeric columns excluding the target and obvious meta columns
meta_cols = {CONFIG['target_column']}
meta_cols |= {c for c in ['id','ID','ticker','symbol', 'row','index'] if c in train_df.columns}
meta_cols |= {c for c in ['date','timestamp','datetime','time','Date','DATE'] if c in train_df.columns}

train_numeric_cols = [c for c in train_df.select_dtypes(include=['number']).columns if c not in meta_cols]
if _test_cols is not None:
    # Keep only columns that will exist at prediction time
    candidate_cols = [c for c in train_numeric_cols if c in _test_cols]
else:
    candidate_cols = train_numeric_cols

missing_frac = train_df[candidate_cols].isna().mean()
FEATURE_COLS = [c for c in candidate_cols if missing_frac[c] <= missing_thr]

# Logging
_dropped_missing = len(candidate_cols) - len(FEATURE_COLS)
_dropped_absent = len(train_numeric_cols) - len(candidate_cols)
print(f"Selected {len(FEATURE_COLS)} features (missing ≤ {missing_thr:.0%}). Dropped { _dropped_missing } by missingness and { _dropped_absent } absent-in-test of {len(train_numeric_cols)} train numeric columns.")

# Every later cell needs at least one feature; stop here with a clear explanation.
if not FEATURE_COLS:
    raise ValueError(
        f"No usable features: {len(train_numeric_cols)} numeric train columns, "
        f"{len(candidate_cols)} also present in test, none with missing fraction ≤ {missing_thr:.0%}."
    )

# Build features/target
X = train_df[FEATURE_COLS].copy()
y = train_df[CONFIG['target_column']].copy()

# Missing handling and consistent row filtering
# NOTE(review): bfill copies later observations into earlier rows; with time-ordered
# data this is a form of look-ahead — confirm it is acceptable.
X = X.ffill().bfill()
valid_idx = ~(X.isnull().any(axis=1) | y.isnull())
X = X[valid_idx]
y = y[valid_idx]

# Save training feature means for test-time imputation
TRAIN_FEATURE_MEANS = X.mean(numeric_only=True)

print(f"Data cleaned: {len(X):,} samples; final feature count: {X.shape[1]}")

In [None]:
# Time-Series Cross-Validation Setup
# One splitter instance is shared by all model cells so every model is scored on the same folds.
# NOTE(review): TimeSeriesSplit splits by row position, so this assumes X is already in
# chronological order — confirm against the data source.
print("\n" + "="*70)
print("TIME-SERIES CROSS-VALIDATION SETUP")
print("="*70)
tscv = TimeSeriesSplit(n_splits=CONFIG['n_splits'])
print(f"n_splits = {CONFIG['n_splits']}")

In [None]:
# Model 1: Random Forest Regressor (with volatility enforcement)
print("\n" + "="*70)
print("MODEL 1: RANDOM FOREST")
print("="*70)

# Hyper-parameters shared by the forest fitted in every fold.
_rf_cv_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)

rf_results = []
for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
    print(f"\nFold {fold}/{CONFIG['n_splits']}")
    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X.iloc[val_idx], y.iloc[val_idx]

    # The scaler only ever sees the training part of the fold.
    scaler = StandardScaler().fit(X_train_fold)
    X_train_scaled = scaler.transform(X_train_fold)
    X_val_scaled = scaler.transform(X_val_fold)

    rf_model = RandomForestRegressor(**_rf_cv_params)
    rf_model.fit(X_train_scaled, y_train_fold)

    # Post-process: clip to the allowed range, then cap the spread relative to
    # the std of this fold's validation target.
    y_pred = np.clip(rf_model.predict(X_val_scaled), *CONFIG['prediction_bounds'])
    y_pred = enforce_volatility(
        y_pred,
        y_val_fold.std(),
        max_ratio=CONFIG['max_volatility_ratio'],
        clip_bounds=CONFIG['prediction_bounds'],
    )

    mse = mean_squared_error(y_val_fold, y_pred)
    mae = mean_absolute_error(y_val_fold, y_pred)
    r2 = r2_score(y_val_fold, y_pred)
    rf_results.append({'fold': fold, 'mse': mse, 'mae': mae, 'r2': r2})

    print(f"  MSE: {mse:.6f}")
    print(f"  MAE: {mae:.6f}")
    print(f"  R²:  {r2:.6f}")

# Aggregate the per-fold metrics (mean ± std across folds).
rf_results_df = pd.DataFrame(rf_results)
print("\n" + "="*50)
print("RANDOM FOREST - AVERAGE RESULTS")
print("="*50)
print(f"MSE: {rf_results_df['mse'].mean():.6f} ± {rf_results_df['mse'].std():.6f}")
print(f"MAE: {rf_results_df['mae'].mean():.6f} ± {rf_results_df['mae'].std():.6f}")
print(f"R²:  {rf_results_df['r2'].mean():.6f} ± {rf_results_df['r2'].std():.6f}")

In [None]:
# Train final Random Forest on all data
print("\nTraining final Random Forest on all data...")

# Fit the scaler on the full cleaned training set, then scale that same data.
scaler_rf = StandardScaler().fit(X)
X_scaled_all = scaler_rf.transform(X)

# Same hyper-parameters as the cross-validated forest.
rf_final = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
    n_jobs=-1,
)
rf_final.fit(X_scaled_all, y)

# Persist the model together with the scaler it was trained behind.
for _artifact, _filename in ((rf_final, 'rf_model.joblib'), (scaler_rf, 'rf_scaler.joblib')):
    joblib.dump(_artifact, CONFIG['artifacts_dir'] / _filename)
print("Random Forest model saved")

In [None]:
# Model 2: LightGBM (with volatility enforcement)
# Runs only when lightgbm is installed; uses the same folds and the same
# clip + volatility post-processing as the Random Forest cell. Features are
# passed unscaled.
if LGBM_AVAILABLE:
    print("\n" + "="*70)
    print("MODEL 2: LIGHTGBM")
    print("="*70)
    lgbm_results = []
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
        print(f"\nFold {fold}/{CONFIG['n_splits']}")
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        lgbm_model = lgb.LGBMRegressor(
            n_estimators=100, learning_rate=0.05, max_depth=5, num_leaves=31,
            min_child_samples=20, subsample=0.8, colsample_bytree=0.8,
            random_state=42, verbose=-1
        )
        # NOTE(review): early stopping monitors the same validation fold that is scored
        # below, so the reported metrics may be optimistic; the final model in the next
        # cell is trained without early stopping.
        lgbm_model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)]
        )
        y_pred = lgbm_model.predict(X_val_fold)
        # Clip to the configured bounds, then cap the spread relative to the
        # validation target's std for this fold.
        y_pred = np.clip(y_pred, *CONFIG['prediction_bounds'])
        y_pred = enforce_volatility(y_pred, y_val_fold.std(), max_ratio=CONFIG['max_volatility_ratio'], clip_bounds=CONFIG['prediction_bounds'])
        mse = mean_squared_error(y_val_fold, y_pred)
        mae = mean_absolute_error(y_val_fold, y_pred)
        r2 = r2_score(y_val_fold, y_pred)
        lgbm_results.append({'fold': fold, 'mse': mse, 'mae': mae, 'r2': r2})
        print(f"  MSE: {mse:.6f}")
        print(f"  MAE: {mae:.6f}")
        print(f"  R²:  {r2:.6f}")
    # Mean ± std of each metric across folds.
    lgbm_results_df = pd.DataFrame(lgbm_results)
    print("\n" + "="*50)
    print("LIGHTGBM - AVERAGE RESULTS")
    print("="*50)
    print(f"MSE: {lgbm_results_df['mse'].mean():.6f} ± {lgbm_results_df['mse'].std():.6f}")
    print(f"MAE: {lgbm_results_df['mae'].mean():.6f} ± {lgbm_results_df['mae'].std():.6f}")
    print(f"R²:  {lgbm_results_df['r2'].mean():.6f} ± {lgbm_results_df['r2'].std():.6f}")
else:
    # lgbm_results_df is not defined in this branch; later cells check for it before use.
    print("Skipping LightGBM (not installed)")

In [None]:
# Train final LightGBM on all data
if LGBM_AVAILABLE:
    print("\nTraining final LightGBM on all data...")
    # Same hyper-parameters as the cross-validated model; no eval set, so no early stopping.
    _lgbm_final_params = dict(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        num_leaves=31,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1,
    )
    lgbm_final = lgb.LGBMRegressor(**_lgbm_final_params)
    lgbm_final.fit(X, y)
    joblib.dump(lgbm_final, CONFIG['artifacts_dir'] / 'lgbm_model.joblib')
    print("LightGBM model saved")

In [None]:
# Model 3: PyTorch LSTM
# torch_results is created unconditionally so later cells can test len(torch_results)
# even when torch is not installed.
torch_results = []
if TORCH_AVAILABLE:
    print("\n" + "="*70)
    print("MODEL 3: PYTORCH LSTM")
    print("="*70)
    # Use the GPU when one is visible, otherwise fall back to CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    class LSTMRegressor(nn.Module):
        """Two stacked LSTM layers (50 then 25 hidden units) followed by a linear head.

        Input is batch-first: ``(batch, seq_len, input_size)``. The output is one
        value per batch element, taken from the last time step.
        """

        def __init__(self, input_size):
            super().__init__()
            # Attribute names double as state_dict keys, and creation order fixes the
            # order in which parameters are initialised — keep both stable.
            self.lstm1 = nn.LSTM(input_size=input_size, hidden_size=50, num_layers=1, batch_first=True)
            self.dropout1 = nn.Dropout(p=0.2)
            self.lstm2 = nn.LSTM(input_size=50, hidden_size=25, num_layers=1, batch_first=True)
            self.dropout2 = nn.Dropout(p=0.2)
            self.fc = nn.Linear(in_features=25, out_features=1)

        def forward(self, x):
            first_seq, _ = self.lstm1(x)
            second_seq, _ = self.lstm2(self.dropout1(first_seq))
            # Dropout is applied to the whole sequence before the last step is selected.
            last_step = self.dropout2(second_seq)[:, -1, :]
            return self.fc(last_step).squeeze(-1)

    class EarlyStopping:
        """Track the best validation loss and signal when training should stop.

        A CPU copy of the model's state_dict is kept for the best loss seen so far
        in ``best_state``. ``step`` returns True once ``patience`` consecutive calls
        have failed to improve on the best loss.
        """

        def __init__(self, patience=10):
            self.best_state = None
            self.best_loss = float('inf')
            self.counter = 0
            self.patience = patience

        def step(self, val_loss, model):
            # An improvement must beat the best loss by more than 1e-12.
            if val_loss < self.best_loss - 1e-12:
                self.best_loss = val_loss
                self.counter = 0
                snapshot = {}
                for name, tensor in model.state_dict().items():
                    snapshot[name] = tensor.detach().cpu().clone()
                self.best_state = snapshot
                return self.counter >= self.patience
            self.counter += 1
            return self.counter >= self.patience

    def train_one_fold(X_train, y_train, X_val, y_val, epochs=100, batch_size=32):
        """Train an LSTMRegressor on one CV fold and score it on the validation part.

        Args:
            X_train, X_val: Feature frames for the training / validation part of the fold.
            y_train, y_val: Matching target series (``.values`` and ``.std()`` are used).
            epochs: Maximum number of epochs.
            batch_size: Mini-batch size for both loaders.

        Returns:
            Tuple ``(scaler, model, val_preds, mse, mae, r2, epochs_ran)`` where
            ``val_preds`` are the clipped, volatility-capped validation predictions.
        """
        # Fit the scaler on the training part only, then apply it to both parts.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        # unsqueeze(1) inserts a length-1 axis at position 1, so each row is fed to the
        # batch-first LSTM as a sequence of one step.
        X_train_t = torch.from_numpy(X_train_scaled.astype('float32')).unsqueeze(1)
        X_val_t = torch.from_numpy(X_val_scaled.astype('float32')).unsqueeze(1)
        y_train_t = torch.from_numpy(y_train.values.astype('float32'))
        y_val_t = torch.from_numpy(y_val.values.astype('float32'))
        train_ds = TensorDataset(X_train_t, y_train_t)
        val_ds = TensorDataset(X_val_t, y_val_t)
        # shuffle=False keeps the rows in their original order within each epoch.
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=False)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

        model = LSTMRegressor(input_size=X_train.shape[1]).to(device)
        opt = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss_fn = nn.MSELoss()
        early = EarlyStopping(patience=10)
        epochs_ran = 0

        for epoch in range(epochs):
            # ---- training pass ----
            model.train()
            tr_loss = 0.0
            for xb, yb in train_loader:
                xb = xb.to(device)
                yb = yb.to(device)
                opt.zero_grad()
                preds = model(xb)
                loss = loss_fn(preds, yb)
                loss.backward()
                opt.step()
                # Weight by batch size so the epoch loss is a per-sample mean.
                tr_loss += loss.item() * xb.size(0)
            tr_loss /= len(train_loader.dataset)

            # ---- validation pass (no gradients, dropout disabled by eval()) ----
            model.eval()
            va_loss = 0.0
            with torch.no_grad():
                for xb, yb in val_loader:
                    xb = xb.to(device)
                    yb = yb.to(device)
                    preds = model(xb)
                    loss = loss_fn(preds, yb)
                    va_loss += loss.item() * xb.size(0)
            va_loss /= len(val_loader.dataset)
            epochs_ran = epoch + 1
            # Log the first epoch and every tenth epoch.
            if (epoch + 1) % 10 == 0 or epoch == 0:
                print(f"Epoch {epoch+1:3d} - train_loss={tr_loss:.6f} val_loss={va_loss:.6f}")
            # NOTE(review): early stopping is driven by the same validation data that is
            # scored below, so the reported metrics may be optimistic.
            if early.step(va_loss, model):
                print(f"Early stopping at epoch {epoch+1}; best val_loss={early.best_loss:.6f}")
                break
        # Restore the weights from the epoch with the best validation loss.
        if early.best_state is not None:
            model.load_state_dict(early.best_state)
        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_t.to(device)).cpu().numpy()
        # clip then enforce volatility w.r.t. validation target std
        val_preds = np.clip(val_preds, *CONFIG['prediction_bounds'])
        val_preds = enforce_volatility(val_preds, y_val.std(), max_ratio=CONFIG['max_volatility_ratio'], clip_bounds=CONFIG['prediction_bounds'])
        mse = mean_squared_error(y_val, val_preds)
        mae = mean_absolute_error(y_val, val_preds)
        r2 = r2_score(y_val, val_preds)
        return scaler, model, val_preds, mse, mae, r2, epochs_ran

    # Run train_one_fold over the shared TimeSeriesSplit folds, keeping each fold's
    # fitted scaler and model alongside its metrics.
    torch_cv_scalers, torch_cv_models = [], []
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X), 1):
        print(f"\nFold {fold}/{CONFIG['n_splits']}")
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        scaler_t, model_t, y_pred, mse, mae, r2, epochs_ran = train_one_fold(X_train_fold, y_train_fold, X_val_fold, y_val_fold)
        torch_results.append({'fold': fold, 'mse': mse, 'mae': mae, 'r2': r2, 'epochs': epochs_ran})
        torch_cv_scalers.append(scaler_t)
        torch_cv_models.append(model_t)
        print(f"  MSE: {mse:.6f}")
        print(f"  MAE: {mae:.6f}")
        print(f"  R²:  {r2:.6f}")
        print(f"  Epochs trained: {epochs_ran}")

    # Mean ± std of each metric across folds.
    torch_results_df = pd.DataFrame(torch_results)
    print("\n" + "="*50)
    print("PYTORCH LSTM - AVERAGE RESULTS")
    print("="*50)
    print(f"MSE: {torch_results_df['mse'].mean():.6f} ± {torch_results_df['mse'].std():.6f}")
    print(f"MAE: {torch_results_df['mae'].mean():.6f} ± {torch_results_df['mae'].std():.6f}")
    print(f"R²:  {torch_results_df['r2'].mean():.6f} ± {torch_results_df['r2'].std():.6f}")
else:
    # torch_results stays empty and torch_results_df is not defined in this branch.
    print("Skipping PyTorch LSTM (torch not installed)")

In [None]:
# Train final PyTorch LSTM on all data
# Both names are defined up front so later cells can test them even without torch.
torch_final = None
scaler_torch_final = None
if TORCH_AVAILABLE:
    print("\nTraining final PyTorch LSTM on all data...")
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    scaler_torch_final = StandardScaler()
    X_scaled_full = scaler_torch_final.fit_transform(X)
    # unsqueeze(1) adds a length-1 sequence axis for the batch-first LSTM.
    X_full_t = torch.from_numpy(X_scaled_full.astype('float32')).unsqueeze(1).to(device)
    y_full_t = torch.from_numpy(y.values.astype('float32')).to(device)
    ds_full = TensorDataset(X_full_t, y_full_t)
    dl_full = DataLoader(ds_full, batch_size=32, shuffle=False)
    model = LSTMRegressor(input_size=X.shape[1]).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    class _Early:
        """Early stopping on the monitored loss; keeps a CPU copy of the best weights."""

        def __init__(self, patience=10):
            self.patience = patience
            self.c = 0
            self.best = float('inf')
            self.state = None

        def step(self, loss, model):
            if loss < self.best - 1e-12:
                self.best = loss
                self.c = 0
                self.state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            else:
                self.c += 1
            return self.c >= self.patience

    early = _Early(patience=10)
    for epoch in range(100):
        model.train()
        tr = 0.0
        for xb, yb in dl_full:
            opt.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            opt.step()
            tr += loss.item() * xb.size(0)
        tr /= len(dl_full.dataset)
        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1:3d} - loss={tr:.6f}")
        # There is no hold-out set here, so early stopping monitors the training loss.
        if early.step(tr, model):
            print(f"Early stopping at epoch {epoch+1}; best loss={early.best:.6f}")
            break
    if early.state is not None:
        model.load_state_dict(early.state)
    torch_final = model
    # Switch to eval mode before exporting so the scripted artifact does not carry
    # the training flag (which would leave dropout active at inference).
    torch_final.eval()
    torch.save(torch_final.state_dict(), CONFIG['artifacts_dir'] / 'torch_lstm_model.pt')
    joblib.dump(scaler_torch_final, CONFIG['artifacts_dir'] / 'torch_lstm_scaler.joblib')
    try:
        # .cpu() moves the module in place; scripting is best-effort.
        scripted = torch.jit.script(torch_final.cpu())
        scripted.save(str(CONFIG['artifacts_dir'] / 'torch_lstm_model_scripted.pt'))
    except Exception as e:
        print(f"Could not save scripted model: {e}")
    finally:
        # Always move the model back: later cells send inputs to `device`, and a failed
        # export previously left the model stranded on the CPU.
        torch_final.to(device)
    print("PyTorch LSTM model saved")

## CHOOSING BEST MODEL FOR KAGGLE SUBMISSION

In [None]:
# Model Comparison
print("\n" + "="*70)
print("MODEL COMPARISON SUMMARY")
print("="*70)

# Collect (label, per-fold results) pairs for every model that was actually evaluated.
_cv_tables = [('Random Forest', rf_results_df)]
if LGBM_AVAILABLE:
    _cv_tables.append(('LightGBM', lgbm_results_df))
if TORCH_AVAILABLE and len(torch_results) > 0:
    _cv_tables.append(('PyTorch LSTM', torch_results_df))

# One row per model, each metric rendered as "mean ± std" across folds.
comparison = [
    {
        'Model': _label,
        'MSE': f"{_folds['mse'].mean():.6f} ± {_folds['mse'].std():.6f}",
        'MAE': f"{_folds['mae'].mean():.6f} ± {_folds['mae'].std():.6f}",
        'R²':  f"{_folds['r2'].mean():.6f} ± {_folds['r2'].std():.6f}",
    }
    for _label, _folds in _cv_tables
]
comparison_df = pd.DataFrame(comparison)
print(comparison_df.to_string(index=False))
comparison_df.to_csv(CONFIG['results_dir'] / 'model_comparison.csv', index=False)

In [None]:
# Volatility Analysis (post-hoc)
# Compares the spread of in-sample predictions before and after enforce_volatility
# against the std of the training target. Only the Random Forest and (if trained)
# the torch model are analysed here.
print("\n" + "="*70)
print("VOLATILITY ANALYSIS")
print("="*70)

# RF
X_scaled_all_rf = scaler_rf.transform(X)
pred_rf_raw = rf_final.predict(X_scaled_all_rf)
pred_rf_raw = np.clip(pred_rf_raw, *CONFIG['prediction_bounds'])
# Reference volatility: std of the full training target.
benchmark_std = y.std()
pred_rf_enf = enforce_volatility(pred_rf_raw, benchmark_std, max_ratio=CONFIG['max_volatility_ratio'], clip_bounds=CONFIG['prediction_bounds'])
print(f"RF - target std: {benchmark_std:.6f}, raw std: {pred_rf_raw.std():.6f}, enforced std: {np.asarray(pred_rf_enf).std():.6f}")

# Torch (if trained)
if TORCH_AVAILABLE and 'torch_final' in globals() and torch_final is not None:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_scaled_all_t = scaler_torch_final.transform(X)
    # Inputs are moved to `device`; the model is expected to live on the same device.
    X_all_t = torch.from_numpy(X_scaled_all_t.astype('float32')).unsqueeze(1).to(device)
    torch_final.eval()
    with torch.no_grad():
        pred_t_raw = torch_final(X_all_t).detach().cpu().numpy()
    pred_t_raw = np.clip(pred_t_raw, *CONFIG['prediction_bounds'])
    pred_t_enf = enforce_volatility(pred_t_raw, benchmark_std, max_ratio=CONFIG['max_volatility_ratio'], clip_bounds=CONFIG['prediction_bounds'])
    print(f"Torch LSTM - raw std: {np.asarray(pred_t_raw).std():.6f}, enforced std: {np.asarray(pred_t_enf).std():.6f}")

# Printed as a percentage, e.g. a ratio of 1.2 is shown as 120%.
print(f"Max allowed ratio: {CONFIG['max_volatility_ratio']:.0%}")

In [None]:
# Feature Importance (Random Forest)
# Pair every feature name with the final forest's importance, most important first.
feature_importance = (
    pd.DataFrame({'feature': FEATURE_COLS, 'importance': rf_final.feature_importances_})
    .sort_values('importance', ascending=False)
)
_top_features = feature_importance.head(10)

print("\n" + "="*70)
print("TOP 10 MOST IMPORTANT FEATURES (Random Forest)")
print("="*70)
print(_top_features.to_string(index=False))

# Horizontal bar chart with the most important feature at the top.
plt.figure(figsize=(10, 6))
plt.barh(_top_features['feature'], _top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 10 Feature Importance (Random Forest)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig(CONFIG['results_dir'] / 'feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Generate submission using the best model (lowest mean CV MSE)
from pathlib import Path as _Path

IS_KAGGLE = _Path('/kaggle/input').exists()
submission_path = _Path('/kaggle/working/submission.csv') if IS_KAGGLE else CONFIG['results_dir'] / 'submission.csv'

test_df = pd.read_csv(CONFIG['test_data_path'])
# Align columns with training features (add missing columns, keep order)
X_test = test_df.reindex(columns=FEATURE_COLS, copy=True)
# Fill missing values down/up each column (ffill then bfill across rows),
# then fall back to the training feature means
X_test = X_test.ffill().bfill()
if 'TRAIN_FEATURE_MEANS' in globals():
    X_test = X_test.fillna(TRAIN_FEATURE_MEANS)
# Final fallback for any remaining NaNs
X_test = X_test.fillna(0.0)

# Determine best model by lowest mean CV MSE
model_scores = {'rf': float(rf_results_df['mse'].mean())}
if 'lgbm_results_df' in globals():
    model_scores['lgbm'] = float(lgbm_results_df['mse'].mean())
if 'torch_results_df' in globals() and len(torch_results) > 0:
    model_scores['torch'] = float(torch_results_df['mse'].mean())

best_model = min(model_scores, key=model_scores.get)

# The torch path needs both the trained network and its scaler.
_torch_ready = (
    TORCH_AVAILABLE
    and 'torch_final' in globals() and torch_final is not None
    and 'scaler_torch_final' in globals() and scaler_torch_final is not None
)

# Predict with the chosen model, remembering which model actually produced the
# predictions: the RF branch is also the fallback when the chosen one is unavailable.
if best_model == 'torch' and _torch_ready:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    X_test_scaled = scaler_torch_final.transform(X_test)
    X_test_t = torch.from_numpy(X_test_scaled.astype('float32')).unsqueeze(1).to(device)
    torch_final.eval()
    with torch.no_grad():
        preds = torch_final(X_test_t).detach().cpu().numpy().reshape(-1)
    used_model = 'torch'
elif best_model == 'lgbm' and 'lgbm_final' in globals():
    preds = lgbm_final.predict(X_test)
    used_model = 'lgbm'
else:
    X_test_scaled = scaler_rf.transform(X_test)
    preds = rf_final.predict(X_test_scaled)
    used_model = 'rf'

if used_model != best_model:
    print(f"Warning: best CV model '{best_model}' is not available; fell back to '{used_model}'")

# Clip and enforce volatility (reference: std of the training target)
preds = np.clip(preds, *CONFIG['prediction_bounds'])
preds = enforce_volatility(preds, y.std(), max_ratio=CONFIG['max_volatility_ratio'], clip_bounds=CONFIG['prediction_bounds'])

# Build submission: prefer the id column named by the sample submission, then a
# literal 'id' column, and finally a positional 'row' counter.
sample_sub_path = _Path('/kaggle/input/hull-tactical-market-prediction/sample_submission.csv')
if sample_sub_path.exists():
    sample_sub = pd.read_csv(sample_sub_path)
    id_col = sample_sub.columns[0]
    if id_col in test_df.columns:
        submission_df = pd.DataFrame({id_col: test_df[id_col], 'prediction': preds})
    else:
        submission_df = pd.DataFrame({id_col: range(len(test_df)), 'prediction': preds})
elif 'id' in test_df.columns:
    submission_df = pd.DataFrame({'id': test_df['id'], 'prediction': preds})
else:
    submission_df = pd.DataFrame({'row': np.arange(len(test_df)), 'prediction': preds})

submission_df.to_csv(submission_path, index=False)
# Report the model that actually generated the predictions, not merely the CV winner.
print(f"Saved submission ({used_model}) -> {submission_path}")
print(submission_df.head())

In [None]:
# Final confirmation and Kaggle output declaration
# Locates the submission file written by the previous cell, copies it into
# /kaggle/working when that directory exists but the file is only in the local
# results folder, and runs basic sanity checks (non-empty, no nulls).
print("\n" + "="*70)
print("NOTEBOOK EXECUTION COMPLETE")
print("="*70)

kaggle_submission = Path('/kaggle/working/submission.csv')
local_submission = CONFIG['results_dir'] / 'submission.csv'

if kaggle_submission.exists():
    submission_file = kaggle_submission
    print(f"✓ Submission file found at: {submission_file}")
elif local_submission.exists():
    submission_file = local_submission
    # NOTE(review): this warning is also printed on a purely local run, where the
    # results folder is the intended location.
    print(f"WARNING: Submission file at wrong location: {submission_file}")
    if Path('/kaggle/working').exists():
        import shutil
        shutil.copy(submission_file, kaggle_submission)
        submission_file = kaggle_submission
        print(f"✓ Copied to correct location: {submission_file}")
else:
    print(f"✗ ERROR: Submission file not found at either location:")
    print(f"  - {kaggle_submission}")
    print(f"  - {local_submission}")
    raise FileNotFoundError("Submission file not created!")

# Re-read the file from disk so the checks apply to what was actually written.
print(f"Size: {submission_file.stat().st_size:,} bytes")
sub_df = pd.read_csv(submission_file)
print(f"Shape: {sub_df.shape}")
print(f"Columns: {list(sub_df.columns)}")
print("\nFirst 5 rows:")
print(sub_df.head())
if sub_df.shape[0] == 0:
    raise ValueError("Submission file is empty!")
if sub_df.isnull().any().any():
    raise ValueError("Submission contains null values!")
print("\n✓ Submission file validated successfully")
print("✓ Ready for Kaggle competition submission")
print(f"\n** SUBMISSION FILE LOCATION: {submission_file.absolute()} **")

## INFERENCE SERVER SETUP (REQUIRED FOR KAGGLE SUBMISSION)

In [None]:
# Inference server: predict() and gateway wiring
import os
import importlib.util
from pathlib import Path

import numpy as np
import pandas as pd

# Polars may be provided by the competition environment
# Any import failure is tolerated; _POLARS_AVAILABLE records the outcome and `pl`
# is set to None so the name always exists.
try:
    import polars as pl  # type: ignore
    _POLARS_AVAILABLE = True
except Exception:
    pl = None  # type: ignore
    _POLARS_AVAILABLE = False

# Reuse availability flags from earlier if present; else recompute minimally
# (the bare name lookup raises NameError when the earlier cell was not run).
try:
    LGBM_AVAILABLE  # type: ignore[name-defined]
except NameError:
    LGBM_AVAILABLE = importlib.util.find_spec("lightgbm") is not None
if LGBM_AVAILABLE:
    import lightgbm as lgb  # type: ignore

# Reuse helper from earlier cells if present; else define a minimal one
if "enforce_volatility" not in globals():
    def enforce_volatility(preds, target_std, max_ratio=1.2, clip_bounds=(0.0, 2.0)):
        # Shrink predictions toward their mean so std(preds) <= max_ratio * target_std,
        # then optionally clip; same logic as the utility defined in the earlier cell.
        arr = np.asarray(preds, dtype=float).reshape(-1)
        pred_std = float(arr.std())
        tgt_std = float(target_std)
        if pred_std > 0 and tgt_std > 0:
            scale = min(1.0, (max_ratio * tgt_std) / (pred_std + 1e-12))
            mean = float(arr.mean())
            arr = mean + scale * (arr - mean)
        if clip_bounds is not None:
            arr = np.clip(arr, clip_bounds[0], clip_bounds[1])
        return arr

# Threshold from CONFIG if present, else use global constant
# (a NameError for either name is caught by the broad except below).
try:
    _MISSING_THR = float(CONFIG.get('missing_threshold', MISSING_DATA_THRESHOLD))  # type: ignore[name-defined]
except Exception:
    _MISSING_THR = 0.10  # Default to 10% if neither CONFIG nor constant available

# Competition paths
COMP_ROOT = Path("/kaggle/input/hull-tactical-market-prediction")
TRAIN_CSV = COMP_ROOT / "train.csv"  # published train data

# Globals managed by inference (populated by _init_inference / _safe_prepare_frame)
_INF_READY = False
_INF_MODEL = None
_INF_BENCH_STD = None  # std of training targets
_INF_FEATURE_COLS = None  # feature list determined from training data by missingness
_INF_TRAIN_MEANS = None   # per-feature means from training data


def _infer_feature_cols(df: pd.DataFrame, target_col: str, missing_thr: float) -> list[str]:
    meta_cols = set([target_col])
    meta_cols |= set([c for c in ['id','ID','ticker','symbol','row','index'] if c in df.columns])
    meta_cols |= set([c for c in ['date','timestamp','datetime','time','Date','DATE'] if c in df.columns])
    numeric_cols = [c for c in df.select_dtypes(include=['number']).columns if c not in meta_cols]
    miss = df[numeric_cols].isna().mean()
    return [c for c in numeric_cols if miss[c] <= missing_thr]


def _safe_prepare_frame(df_in: pd.DataFrame) -> pd.DataFrame:
    """Build the model input frame for a batch.

    Columns are aligned to the inference feature list (absent columns become NaN).
    Gaps are then filled in three passes: forward/backward fill along the rows,
    training feature means when available, and finally zeros.

    If the feature list has not been set yet, it is taken from the notebook-level
    ``FEATURE_COLS`` when that is a non-empty list/tuple, otherwise inferred from
    ``df_in`` itself.
    """
    global _INF_FEATURE_COLS, _INF_TRAIN_MEANS
    if _INF_FEATURE_COLS is None:
        notebook_features = globals().get('FEATURE_COLS')
        if isinstance(notebook_features, (list, tuple)) and len(notebook_features) > 0:
            _INF_FEATURE_COLS = list(notebook_features)
        else:
            _INF_FEATURE_COLS = _infer_feature_cols(df_in, 'forward_returns', _MISSING_THR)
    prepared = df_in.reindex(columns=_INF_FEATURE_COLS, copy=True).ffill().bfill()
    if _INF_TRAIN_MEANS is not None:
        prepared = prepared.fillna(_INF_TRAIN_MEANS)
    return prepared.fillna(0.0)


def _init_inference():
    """Train a fast model on the published training data on first call.

    Populates the inference globals (model, feature list, training means and
    target std) and marks inference as ready. Subsequent calls return immediately.

    Features are the numeric, low-missingness columns of train.csv, restricted to
    columns that also appear in the competition's test.csv when that file can be
    read. Without that restriction, columns that exist only in the training file
    would be learned from and then be absent (mean-imputed) at prediction time.
    Rows whose target is NaN are dropped before fitting.

    Raises:
        FileNotFoundError: If the training CSV does not exist.
        ValueError: If the target column is missing or no labelled rows remain.
    """
    global _INF_READY, _INF_MODEL, _INF_BENCH_STD, _INF_FEATURE_COLS, _INF_TRAIN_MEANS

    if _INF_READY:
        return

    if not TRAIN_CSV.exists():
        raise FileNotFoundError(f"Training data not found at {TRAIN_CSV}")

    train_df = pd.read_csv(TRAIN_CSV)
    target_col = 'forward_returns'
    if target_col not in train_df.columns:
        raise ValueError("Target column 'forward_returns' not found in training data.")

    # Determine features once using missingness
    feature_cols = _infer_feature_cols(train_df, target_col, _MISSING_THR)

    # Keep only columns that will exist at prediction time (best-effort: if the test
    # header cannot be read, or nothing overlaps, the train-derived list is kept).
    test_csv = TRAIN_CSV.with_name('test.csv')
    if test_csv.exists():
        try:
            test_cols = set(pd.read_csv(test_csv, nrows=5).columns)
        except Exception as exc:
            print(f"Warning: could not read test columns from {test_csv}: {exc}")
        else:
            shared = [c for c in feature_cols if c in test_cols]
            if shared:
                feature_cols = shared
            else:
                print("Warning: no training features found in test.csv; keeping train-derived features")

    y = train_df[target_col]
    X = train_df.reindex(columns=feature_cols, copy=True)
    X = X.ffill().bfill()

    # Drop rows without a target; the estimators cannot be fitted on NaN labels.
    labelled = y.notna()
    X = X.loc[labelled]
    y = y.loc[labelled]
    if len(y) == 0:
        raise ValueError("Training data contains no rows with a non-null target.")

    # Compute and store train means for robust imputation later
    train_means = X.mean(numeric_only=True)
    X = X.fillna(train_means).fillna(0.0)

    # Prefer LightGBM for speed
    if LGBM_AVAILABLE:
        model = lgb.LGBMRegressor(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=5,
            num_leaves=31,
            min_child_samples=20,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            verbose=-1,
        )
    else:
        from sklearn.ensemble import RandomForestRegressor
        model = RandomForestRegressor(
            n_estimators=200,
            max_depth=12,
            min_samples_split=20,
            min_samples_leaf=10,
            random_state=42,
            n_jobs=-1,
        )
    model.fit(X, y)

    # Publish all state together, only after training succeeded.
    _INF_FEATURE_COLS = feature_cols
    _INF_TRAIN_MEANS = train_means
    _INF_BENCH_STD = float(pd.Series(y).std())
    _INF_MODEL = model
    _INF_READY = True


def _to_pandas(df_any) -> pd.DataFrame:
    """Return ``df_any`` as a pandas DataFrame.

    pandas frames are returned as-is, polars frames are converted with
    ``to_pandas()``, and anything else is passed to the ``pd.DataFrame`` constructor.
    """
    if isinstance(df_any, pd.DataFrame):
        return df_any
    if _POLARS_AVAILABLE and isinstance(df_any, pl.DataFrame):  # type: ignore
        return df_any.to_pandas()
    return pd.DataFrame(df_any)


def _predict_batch(df_any) -> np.ndarray:
    """Predict for a batch and return a flat float array.

    Lazily initialises inference on first use, prepares the feature frame, then
    clips the model output to [0.0, 2.0] and caps its spread at 1.2x the stored
    training-target std.
    """
    if not _INF_READY:
        _init_inference()
    features = _safe_prepare_frame(_to_pandas(df_any))
    raw = _INF_MODEL.predict(features)  # type: ignore[union-attr]
    bounded = np.clip(raw, 0.0, 2.0)
    limited = enforce_volatility(bounded, _INF_BENCH_STD, max_ratio=1.2, clip_bounds=(0.0, 2.0))
    return np.asarray(limited, dtype=float).reshape(-1)


def predict(test) -> "float | pd.DataFrame | pl.DataFrame":
    """Entry point handed to the inference server.

    Args:
        test: Batch of rows to score; converted to pandas inside ``_predict_batch``.

    Returns:
        A plain ``float`` when the batch yields exactly one prediction. Otherwise a
        single-column ``"prediction"`` frame: polars when polars is available,
        pandas if not.
    """
    preds = _predict_batch(test)
    # Single-row batch: return the scalar itself rather than a one-row frame.
    if preds.shape[0] == 1:
        return float(preds[0])
    if _POLARS_AVAILABLE:
        return pl.DataFrame({"prediction": preds})  # type: ignore
    return pd.DataFrame({"prediction": preds})

# Start the evaluation server (remote) or a local gateway (for local testing)
import kaggle_evaluation.default_inference_server as _kserv
# Wrap predict() in the default inference server provided by kaggle_evaluation.
_inference_server = _kserv.DefaultInferenceServer(predict)

# KAGGLE_IS_COMPETITION_RERUN being set selects serve(); otherwise a local gateway
# is run against the competition input directory.
if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    _inference_server.serve()
else:
    _inference_server.run_local_gateway((str(COMP_ROOT),))