In [None]:
import pandas as pd
import numpy as np
import logging
from pathlib import Path
from src.io.saving import load_long_parquet
from src.features.target_generation import generate_triple_barrier_targets, get_target_summary

# Configure the root logger at INFO level (optional; mainly useful when debugging).
logging.basicConfig(level=logging.INFO)

In [None]:
# Load the feature table from the parquet artifact; the path is relative to the
# current working directory.
df_long = load_long_parquet(Path("artifacts/features_long.parquet"))

In [None]:
# Sanity check: rows whose close exceeds the same row's high by more than 1e-6.
bad_rows = df_long.loc[df_long['close'].gt(df_long['high'] + 1e-6)]
bad_rows

In [None]:
# Move index levels back into columns. Guarded so that re-running this cell does
# not keep adding 'index' / 'level_0' columns.
# NOTE(review): presumably the loader returns symbol/date as index levels - confirm.
if not {'symbol', 'date'}.issubset(df_long.columns):
    df_long = df_long.reset_index()

# Validate the source columns BEFORE reading them, so a missing column produces
# the explicit error below instead of a bare KeyError from one of the assignments.
source_cols = ['symbol', 'date', 'close', 'high', 'low', 'atr14']
missing = [col for col in source_cols if col not in df_long.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

df_long['symbol'] = df_long['symbol'].astype(str)
df_long['date'] = pd.to_datetime(df_long['date'], errors='coerce')

# errors='coerce' turns unparseable dates into NaT; surface that instead of hiding it.
n_bad_dates = int(df_long['date'].isna().sum())
if n_bad_dates:
    logging.warning("%d rows have a missing/unparseable date (NaT)", n_bad_dates)

df_long['atr'] = df_long['atr14']  # Triple barrier logic expects this column

# Columns expected by the target-generation step (all present after the alias above).
required_cols = ['symbol', 'date', 'close', 'high', 'low', 'atr']

In [None]:
from joblib import Parallel, delayed
import pandas as pd
from typing import Dict, List, Optional
import numpy as np

def _process_symbol_group(df: pd.DataFrame, config: Dict, symbols: List[str]) -> pd.DataFrame:
    """Build triple-barrier targets for one group of symbols.

    Args:
        df: Frame containing a 'symbol' column.
        config: Configuration forwarded unchanged to ``generate_triple_barrier_targets``.
        symbols: Symbols whose rows are kept before target generation.

    Returns:
        Whatever ``generate_triple_barrier_targets`` returns for the filtered rows.
    """
    in_group = df['symbol'].isin(symbols)
    # Boolean-mask selection only; no explicit copy is taken before the call.
    return generate_triple_barrier_targets(df.loc[in_group], config)


def generate_targets_parallel(
    df: pd.DataFrame,
    config: Dict,
    n_jobs: int = -1,
    chunk_size: int = 25
) -> pd.DataFrame:
    """
    Chunk long-format data by symbol and run target generation in parallel.

    Symbols are taken in first-appearance order (nulls dropped), split into
    groups of ``chunk_size``, and each group's rows are sent to one worker task.

    Args:
        df: Long-format DataFrame with columns ['symbol', 'date', 'close', 'high', 'low', 'atr']
        config: Triple barrier config dictionary
        n_jobs: Number of parallel workers (-1 = all cores)
        chunk_size: Number of symbols per parallel chunk (must be >= 1)

    Returns:
        Combined DataFrame with triple barrier targets for all symbols, or an
        empty DataFrame when ``df`` contains no symbols.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # A non-positive step would either raise an opaque range() error (0) or
    # silently yield no chunks at all (negative), so reject it up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    unique_symbols = df['symbol'].dropna().unique().tolist()
    symbol_chunks = [
        unique_symbols[i:i + chunk_size]
        for i in range(0, len(unique_symbols), chunk_size)
    ]
    if not symbol_chunks:
        return pd.DataFrame()

    # Each task receives only its own rows, so the full frame is never pickled
    # once per worker task.
    results = Parallel(n_jobs=n_jobs, backend='loky', verbose=1)(
        delayed(_process_symbol_group)(df[df['symbol'].isin(chunk)], config, chunk)
        for chunk in symbol_chunks
    )

    if not results:
        return pd.DataFrame()

    return pd.concat(results, ignore_index=True)


# Settings dictionary handed to the target generator for every symbol chunk.
config = dict(
    up_mult=3.0,
    dn_mult=2.5,
    max_horizon=20,
    start_every=3,
)

# Run target generation across all cores, 32 symbols per task.
targets = generate_targets_parallel(df=df_long, config=config, n_jobs=-1, chunk_size=32)

In [None]:
# Both join keys must be datetimes for the date == t0 match below.
df_long['date'] = pd.to_datetime(df_long['date'])
targets['t0'] = pd.to_datetime(targets['t0'])

# OHLC columns already live in df_long; remove them from targets when present.
drop_cols = ['open', 'high', 'low', 'close']  # Add any other overlapping OHLC names if needed
targets_clean = targets.drop(columns=drop_cols, errors='ignore')

# Keep only the feature rows that start a trajectory (symbol + date == symbol + t0).
# Any remaining name clash keeps the df_long column as-is and suffixes the target one.
df_merged = pd.merge(
    df_long,
    targets_clean,
    how='inner',
    left_on=['symbol', 'date'],
    right_on=['symbol', 't0'],
    suffixes=('', '_target'),
)


In [None]:
import matplotlib.pyplot as plt
import mplfinance as mpf
import pandas as pd
import numpy as np

# === USER CONFIGURABLE ===
TICKER   = "AAPL"                      # Example ticker
START_DT = pd.Timestamp("2020-09-03")  # Entry point to visualize
CONTEXT_BARS = 10                      # Bars of context shown before and after the trajectory
# Column names
DATE_COL = 'date'
PRICE_COL = 'close'
ATR_COL = 'atr14'
HIGH_COL = 'high'
LOW_COL = 'low'
OPEN_COL = 'open'
SYMBOL_COL = 'symbol'

# --- Get the relevant trajectory row
# Select explicitly instead of .squeeze(): squeeze() returns a DataFrame when more
# than one row matches, which would break every scalar access below.
traj_rows = df_merged[(df_merged[SYMBOL_COL] == TICKER) & (df_merged["t0"] == START_DT)]
if traj_rows.empty:
    raise ValueError("No trajectory found for given ticker and date")
if len(traj_rows) > 1:
    logging.warning(
        "%d trajectories match %s at %s; plotting the first one",
        len(traj_rows), TICKER, START_DT,
    )
row = traj_rows.iloc[0]

# --- Extract key trajectory values
entry_dt = pd.to_datetime(row['t0'])
hit_dt   = pd.to_datetime(row['t_hit'])
horizon  = row['h_used']
entry_px = row['entry_px']
top_px   = row['top']
bot_px   = row['bot']
hit      = row['hit']
horizon_bars = int(horizon)

# === Price window: CONTEXT_BARS before entry, the trajectory itself, CONTEXT_BARS after
# h_used is reported in bars (see the "bars" printout below), so the window is
# sliced by bar position rather than by calendar days.
ticker_px = (
    df_long[df_long[SYMBOL_COL] == TICKER]
    .dropna(subset=[DATE_COL])      # NaT dates cannot be positioned on the time axis
    .sort_values(DATE_COL)
    .set_index(DATE_COL)
)
entry_pos = ticker_px.index.searchsorted(entry_dt, side='left')  # first bar at/after entry
window_start = max(entry_pos - CONTEXT_BARS, 0)
window_stop = entry_pos + horizon_bars + 1 + CONTEXT_BARS
px_window = ticker_px.iloc[window_start:window_stop]
if px_window.empty:
    raise ValueError(f"No price data available for {TICKER} around {entry_dt.date()}")

# Required OHLC format for mplfinance
ohlc = px_window[[OPEN_COL, HIGH_COL, LOW_COL, PRICE_COL]].copy()
ohlc.columns = ['Open', 'High', 'Low', 'Close']

# --- Count trajectories of this ticker that are open at entry_dt
active_mask = (
    (targets[SYMBOL_COL] == TICKER)
    & (targets['t0'] <= entry_dt)
    & ((pd.to_datetime(targets['t_hit']) >= entry_dt) | targets['hit'].isna())
)
overlap_count = active_mask.sum()


def _marker_at(when, price):
    """Return an array that is NaN except for `price` on bars dated `when`, or None if no bar matches."""
    on_bar = ohlc.index == when
    return np.where(on_bar, price, np.nan) if on_bar.any() else None


# --- Custom lines (barriers) and markers (entry / hit)
add_lines = [
    mpf.make_addplot([top_px] * len(ohlc), color='green', linestyle='dotted', width=0.8),
    mpf.make_addplot([bot_px] * len(ohlc), color='red', linestyle='dotted', width=0.8)
]

entry_marker = _marker_at(entry_dt, entry_px)
if entry_marker is not None:
    add_lines.append(
        mpf.make_addplot(entry_marker, type='scatter', markersize=70, marker='o', color='black')
    )

hit_marker = _marker_at(hit_dt, row['price_hit'])
if hit_marker is not None:
    add_lines.append(
        mpf.make_addplot(hit_marker, type='scatter', markersize=70, marker='x', color='purple')
    )

# --- Plot the candlestick chart
mpf.plot(
    ohlc,
    type='candle',
    style='yahoo',
    addplot=add_lines,
    title=f"{TICKER} — Triple Barrier Trajectory",
    ylabel='Price',
    datetime_format='%Y-%m-%d',
    xrotation=15,
    tight_layout=True,
    volume=False,
    # Vertical dotted line marking the entry date
    alines=dict(
        alines=[[(entry_dt, ohlc['Low'].min()), (entry_dt, ohlc['High'].max())]],
        colors=['gray'], linestyle=':', linewidths=1
    )
)

# --- Metadata display below the chart
print(f"\nMetadata for {TICKER} on {entry_dt.date()}")
print("-" * 40)
print(f"Entry Date        : {entry_dt.date()}")
print(f"Hit Date          : {hit_dt.date()}")
print(f"Hit Type          : {hit} ({'Top' if hit==1 else 'Bottom' if hit==-1 else 'Horizon'})")
print(f"Entry Price       : {entry_px:.2f}")
print(f"Top Barrier       : {top_px:.2f}")
print(f"Bottom Barrier    : {bot_px:.2f}")
print(f"Overlapping Count : {overlap_count}")
print(f"Horizon Used      : {horizon} bars")

In [None]:
# Trajectory start dates available for the selected ticker.
df_merged.loc[df_merged['symbol'] == TICKER, 't0']

In [None]:
# The sample-weight column is named 'weight' (singular) everywhere else in this
# notebook; `.weights` is not a column and raises AttributeError.
df_merged['weight']

In [None]:
selected_features = [
    # Moving averages and slopes
    "ma_10", "ma_20", "ma_50", "ma_100", "ma_200",
    "pct_slope_ma_10", "pct_slope_ma_20", "pct_slope_ma_50", "pct_slope_ma_100", "pct_slope_ma_200",
    "pct_dist_ma_20", "pct_dist_ma_50", "pct_dist_ma_100", "pct_dist_ma_200",
    "pct_dist_ma_20_z", "pct_dist_ma_50_z", "pct_dist_ma_100_z", "pct_dist_ma_200_z",
    "min_pct_dist_ma",
    "relative_dist_20_50", "relative_dist_20_50_z",
    
    # Trend indicators
    "trend_score_granular", "trend_score_slope", "trend_alignment", "trend_persist_ema",
    "quiet_trend", "ret",
    
    # Volatility indicators
    "rv_10", "rv_20", "rv_60", "rv_100",
    "rv_ratio_10_60", "rv_ratio_20_100",
    "rv_z_60", "rv60_slope_norm", "rv100_slope_norm",
    "vol_of_vol_20d", "volshock_z", "volshock_dir",
    "atr14", "atr_percent", "gap_atr_ratio",
    
    # Range position and breakout
    "pos_in_5d_range", "pos_in_10d_range", "pos_in_20d_range",
    "breakout_up_5d", "breakout_up_10d", "breakout_up_20d",
    "breakout_dn_5d", "breakout_dn_10d", "breakout_dn_20d",
    "range_z_5d", "range_z_10d", "range_z_20d",
    
    # Volume and relative volume
    "vol_z_20", "vol_z_60", "rvol_20", "rvol_50",
    "dollar_vol_ma_20", "rdollar_vol_20",
    
    # OBV
    "obv", "obv_z_60",
    
    # Hurst exponent
    "hurst_ret_64", "hurst_ret_64_emaHL5", "hurst_ret_128",
    
    # Alpha vs benchmark
    "alpha_resid_spy", "alpha_mom_spy_20_ema10", "alpha_mom_spy_60_ema10",
    "alpha_resid_sector", "alpha_mom_sector_20_ema10", "alpha_mom_sector_60_ema10",
    "alpha_mom_combo_20_ema10", "alpha_mom_combo_60_ema10",
    
    # Relative strength
    "rel_strength_spy", "rel_strength_spy_norm", "rel_strength_spy_slope20",
    "rel_strength_sector", "rel_strength_sector_norm", "rel_strength_sector_slope20",
    
    # Cross-sectional momentum
    "xsec_mom_5d_z", "xsec_mom_20d_z", "xsec_mom_60d_z",
    "xsec_mom_5d_sect_neutral_z", "xsec_mom_20d_sect_neutral_z", "xsec_mom_60d_sect_neutral_z",
    
    # Breadth
    "pct_universe_above_ma20", "pct_universe_above_ma50", "pct_universe_above_ma200",
    "ad_line_universe"
]

# Modelling frame: date, the selected features, then the label and sample weight.
total_df = df_merged.loc[:, ['date', *selected_features, 'hit', 'weight']]


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.utils.class_weight import compute_sample_weight

# --- Sort by date and define train/test split
total_df = total_df.sort_values("date")
split_date = total_df["date"].quantile(0.8)  # 80% train, 20% test

# --- Create train/test sets
# NOTE(review): no purge/embargo is applied around split_date, so labels of the
# last training rows may be resolved using prices from the test period - confirm
# whether that is acceptable for this evaluation.
train_df = total_df[total_df["date"] <= split_date].copy()
test_df  = total_df[total_df["date"] > split_date].copy()

# --- Feature matrices (selected_features must exclude 'hit', 'target', 'date', 'symbol')
X_train = train_df[selected_features]
X_test  = test_df[selected_features]

# --- Encode class labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['hit'])  # e.g. [-1, 0, 1] -> [0, 1, 2]
y_test  = label_encoder.transform(test_df['hit'])
class_labels = label_encoder.classes_

# --- Compute balanced sample weights
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# --- Train XGBoost model
model = xgb.XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=3,
    random_state=42,
    n_jobs=-1,
    # Regularization
    max_depth=2,                 # Shallower trees
    min_child_weight=5,          # Require more samples per leaf
    gamma=2.0,                   # Higher threshold for node splits
    subsample=0.7,               # Lower row sampling
    colsample_bytree=0.7,        # Lower feature sampling
    reg_alpha=2.0,               # Stronger L1 regularization
    reg_lambda=3.0,              # Stronger L2 regularization

    # Learning rate and tree count
    learning_rate=0.03,          # Lower LR for stability
    n_estimators=700,            # More trees to compensate
)
model.fit(X_train, y_train, sample_weight=sample_weights)

# --- Predict
y_pred_proba_test = model.predict_proba(X_test)
y_pred_proba_train = model.predict_proba(X_train)


def _report_auc(split_name, icon, y_true, y_proba, labels):
    """Print macro one-vs-rest AUC plus one AUC per class, and return the macro AUC.

    Args:
        split_name: Name shown in the printout ("Test" / "Train").
        icon: Prefix printed before the macro-AUC line.
        y_true: Encoded integer labels.
        y_proba: Predicted class probabilities, one column per encoded class.
        labels: Original class labels, in encoded order, used in the per-class lines.
    """
    macro_auc = roc_auc_score(y_true, y_proba, multi_class='ovr', average='macro')
    print(f"\n{icon} {split_name} AUC (macro-averaged OVR): {macro_auc:.4f}")

    y_bin = label_binarize(y_true, classes=[0, 1, 2])
    print(f"\nPer-Class AUC ({split_name}):")
    for i, label in enumerate(labels):
        auc = roc_auc_score(y_bin[:, i], y_proba[:, i])
        print(f"  Class {label}: AUC = {auc:.4f}")
    return macro_auc


# --- Evaluate (same report for both splits; test first, then train)
macro_auc_test = _report_auc("Test", "🧪", y_test, y_pred_proba_test, class_labels)
macro_auc_train = _report_auc("Train", "🧠", y_train, y_pred_proba_train, class_labels)

In [None]:
# `y_pred_proba` is never defined (NameError on a fresh kernel); the training
# cell produces `y_pred_proba_test`. Show the first few rows only.
y_pred_proba_test[:5]

In [None]:
# Column labels of the test feature matrix (X_test is test_df[selected_features]).
X_test.columns

In [None]:
# Sample-weight column of the modelling frame.
total_df['weight']

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import json
import time
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# --- Integer-encode the labels in 'hit' (e.g. [-1, 0, 1] -> [0, 1, 2])
label_encoder = LabelEncoder()
total_df['target_encoded'] = label_encoder.fit_transform(total_df['hit'])
class_labels = label_encoder.classes_

# --- Time-ordered folds: contiguous row ranges of the date-sorted frame
total_df = total_df.sort_values("date").reset_index(drop=True)
n_folds = 10
fold_size = len(total_df) // n_folds

# Each entry is a half-open (start, end) row range; the final fold absorbs the
# remainder rows left over by the integer division.
fold_indices = [
    (k * fold_size, (k + 1) * fold_size if k < n_folds - 1 else len(total_df))
    for k in range(n_folds)
]

# --- Hyperopt Objective
def objective(space):
    """Hyperopt objective: negative mean macro-AUC over expanding-window folds.

    For each i in 0..n_folds-2 the model is trained on every row before the end
    of fold i and validated on fold i+1, so every fold after the first is used
    as a validation set exactly once.
    NOTE(review): the previous comment said the last fold is held out for
    testing, but this loop does validate on it - confirm the intended protocol.

    Args:
        space: Sampled hyperparameters with keys 'eta', 'max_depth',
            'min_child_weight', 'subsample', 'colsample_bytree', 'gamma',
            'reg_alpha', 'reg_lambda' and 'booster'.

    Returns:
        Dict with 'loss' (negative mean validation AUC) and 'status'.
    """
    fold_aucs = []

    for i in range(n_folds - 1):
        train_end = fold_indices[i][1]
        val_start, val_end = fold_indices[i + 1]

        train_df = total_df.iloc[:train_end]
        val_df = total_df.iloc[val_start:val_end]

        X_train = train_df[selected_features].values
        y_train = train_df['target_encoded'].values
        X_val   = val_df[selected_features].values
        y_val   = val_df['target_encoded'].values

        # Step 1: Extract raw weights
        raw_weights = train_df["weight"].values

        # Step 2: Compute class balancing weights for the classes present in this fold
        classes, class_idx = np.unique(y_train, return_inverse=True)
        class_weights = compute_class_weight(
            class_weight="balanced",
            classes=classes,
            y=y_train
        )

        # Step 3: Combine raw weight x class weight (vectorised per-row lookup)
        combined_weights = raw_weights * class_weights[class_idx]

        # Step 4: Clip extreme weights to avoid dominance
        # You can tune these thresholds - e.g., 1st-99th percentile
        lower_clip = np.percentile(combined_weights, 1)
        upper_clip = np.percentile(combined_weights, 99)
        clipped_weights = np.clip(combined_weights, lower_clip, upper_clip)
        dtrain = xgb.DMatrix(X_train, label=y_train, weight=clipped_weights)
        dval   = xgb.DMatrix(X_val, label=y_val)

        params = {
            "objective": "multi:softprob",
            "eval_metric": "mlogloss",
            "num_class": 3,
            "tree_method": "hist",
            "booster": space["booster"],
            "eta": space["eta"],
            "max_depth": int(space["max_depth"]),
            # These three are part of the search space and were previously
            # sampled but never forwarded to XGBoost.
            "min_child_weight": space["min_child_weight"],
            "gamma": space["gamma"],
            "subsample": space["subsample"],
            "colsample_bytree": space["colsample_bytree"],
            "lambda": space["reg_lambda"],
            "alpha": space["reg_alpha"],
        }

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, "eval")],
            early_stopping_rounds=30,
            verbose_eval=False,
        )

        # Score with the trees up to the early-stopped best iteration, explicitly,
        # so the result does not depend on the installed version's predict default.
        y_val_pred = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        y_val_bin = label_binarize(y_val, classes=[0, 1, 2])
        auc = roc_auc_score(y_val_bin, y_val_pred, multi_class='ovr', average='macro')
        fold_aucs.append(auc)

    return {"loss": -np.mean(fold_aucs), "status": STATUS_OK}

# --- Search space (one hyperopt expression per tunable hyperparameter)
space = dict(
    eta=hp.loguniform("eta", np.log(0.005), np.log(0.3)),
    max_depth=hp.quniform("max_depth", 3, 10, 1),
    min_child_weight=hp.loguniform("min_child_weight", np.log(1), np.log(20)),
    subsample=hp.uniform("subsample", 0.5, 1.0),
    colsample_bytree=hp.uniform("colsample_bytree", 0.5, 1.0),
    gamma=hp.uniform("gamma", 0, 5),
    reg_alpha=hp.loguniform("reg_alpha", np.log(1e-3), np.log(10.0)),
    reg_lambda=hp.loguniform("reg_lambda", np.log(1e-3), np.log(10.0)),
    booster=hp.choice("booster", ["gbtree", "dart"]),
)
# --- Run optimization
from hyperopt import space_eval  # needed only here, to decode fmin's raw result

trials = Trials()
best_raw = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=500, trials=trials)

# fmin reports hp.choice dimensions as the chosen INDEX (booster -> 0/1);
# space_eval maps the raw assignment back to the actual parameter values.
best = space_eval(space, best_raw)
# Convert numpy scalars to plain Python types so json.dump cannot fail on them.
best = {
    name: (value.item() if isinstance(value, np.generic) else value)
    for name, value in best.items()
}
best["max_depth"] = int(best["max_depth"])

# --- Save results
with open("xgb_hyperopt_best_params.json", "w") as f:
    json.dump(best, f, indent=2)

print("\n✅ Best hyperparameters saved:")
print(json.dumps(best, indent=2))