# Main Pipeline (Monthly Return Prediction + Portfolio Backtest)

This notebook runs the full end-to-end pipeline:

1. Load monthly returns, characteristics, and factor data  
2. Compute factor exposures (betas / alpha)  
3. Assemble the final modeling panel (features + next_return)  
4. Generate rolling out-of-sample predictions (OLS, Ridge, Lasso, ENet, RF)  
5. Build decile portfolios and long–short spreads  
6. Produce performance tables, significance tests, and plots

All logic lives in the project modules; this notebook is only orchestration.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Modules
from final_dataset import assemble_modeling_dataset
from oos_predictions import generate_oos_predictions_all_models
from portfolios import compute_decile_portfolio_returns, compute_decile_portfolio_weights
from reporting import generate_comparison_report
from training_scheme import WindowConfig

# If you have this module in your repo:
# from factor_exposures import compute_factor_exposures

# --- Filesystem layout (paths are relative to the notebook's working directory) ---
DIR_DATA = Path("..", "data")
DIR_RESULTS = Path("..", "results")
DIR_REPORTS = DIR_RESULTS.joinpath("reports")

# Make sure both output folders exist before any cell tries to write into them.
DIR_RESULTS.mkdir(parents=True, exist_ok=True)
DIR_REPORTS.mkdir(parents=True, exist_ok=True)

# --- Column names shared by the pipeline cells ---
RETURN_COL = "RET"                # realized monthly return column in the returns frame
TARGET_COL = "next_return"        # prediction target inside the modeling panel
REALIZED_COL = "realized_return"  # realized-return column name in the predictions output

# --- Feature standardization: "zscore" or "rank" ---
STANDARDIZE = "zscore"

# --- Training-window settings handed to the OOS prediction step ---
window_cfg = WindowConfig(min_train_months=60, max_train_months=120, expanding=False)

# --- Portfolio construction ---
N_DECILES = 10
WEIGHT_COL = None  # e.g. "mktcap" when the predictions panel carries such a column

## 1) Load a 10% sample of tickers (fast dev mode)

We load a deterministic 10% subset of tickers from metadata, then read only:
- `date`
- `adj_close`
- `volume`

We also cap the maximum rows per ticker to keep the run light.

In [None]:
from pathlib import Path
import logging
import pandas as pd
import numpy as np

# Timestamped INFO-level logging for the whole notebook.
logging.basicConfig(format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO)

# --- Input locations (adjust to your checkout) ---
DIR_DATA = Path("../data")
DIR_STOCK_DATA = DIR_DATA.joinpath("stock_data")
PATH_META = DIR_DATA.joinpath("metadata_listed.csv")
PATH_META_DELISTED = DIR_DATA.joinpath("metadata_delisted.csv")

# --- Sampling / size controls ---
P_SAMPLE = 0.10              # fraction of tickers to keep (10%)
MAX_ROWS_PER_TICKER = 1500   # per-ticker row cap; lower it (e.g. 500) if the run is heavy
PRICE_COL = "adj_close"
VOLUME_COL = "volume"

# Metadata for listed and delisted names.
df_meta = pd.read_csv(PATH_META)
df_meta_delisted = pd.read_csv(PATH_META_DELISTED)

# Ticker universe: union of both "Symbol" columns, with missing values dropped,
# coerced to str and de-duplicated (order of first appearance is kept).
tickers = (
    pd.concat([df_meta["Symbol"], df_meta_delisted["Symbol"]], ignore_index=True)
    .dropna()
    .astype(str)
    .unique()
)

# Deterministic sampling: each ticker is bucketed by a SHA-256 digest of its
# name. Python's built-in hash() is salted per process for str (PYTHONHASHSEED),
# so it would select a different sample after every kernel restart.
def keep_ticker(t: str, p: float) -> bool:
    """Decide whether ticker ``t`` belongs to the sampled fraction ``p``.

    The ticker name is mapped to one of 10,000 buckets via a digest of its
    UTF-8 bytes; the ticker is kept when ``bucket / 10_000 < p``. The mapping
    depends only on the ticker string, so the same tickers are selected on
    every run, machine and Python version.

    Args:
        t: Ticker symbol.
        p: Target fraction of tickers to keep. ``0.0`` keeps nothing and
            ``1.0`` keeps everything.

    Returns:
        True if the ticker is part of the sample, False otherwise.
    """
    import hashlib  # local import keeps this cell self-contained

    digest = hashlib.sha256(t.encode("utf-8")).digest()
    # 8 bytes (64 bits) are plenty to make the modulo bias negligible.
    bucket = int.from_bytes(digest[:8], "big") % 10_000
    return bucket / 10_000 < p

# Select the sampled tickers and report how many survived.
tickers_sample = [t for t in tickers if keep_ticker(t, P_SAMPLE)]
logging.info(f"Selected {len(tickers_sample)} tickers out of {len(tickers)} (~{P_SAMPLE:.0%}).")

all_stocks = []   # one (ticker, date)-indexed frame per successfully loaded ticker
skipped = 0       # tickers with no file on disk, or whose file failed to load

usecols = ["date", PRICE_COL, VOLUME_COL]

for symbol in tickers_sample:
    path_stock = DIR_STOCK_DATA / f"{symbol}.csv"
    if not path_stock.exists():
        skipped += 1
        continue

    try:
        # The callable form of `usecols` tolerates files that lack one of the
        # columns at read time; a missing price/volume column then surfaces as
        # a KeyError below and the ticker is skipped with a warning.
        df_stock = pd.read_csv(
            path_stock,
            usecols=lambda c: c in usecols,
            parse_dates=["date"],
        )

        # Sort BEFORE applying the row cap so that `tail` always keeps the most
        # recent observations, whatever order the file was written in. Rows
        # without a date are placed first so they are the first to be dropped.
        df_stock = df_stock.sort_values("date", na_position="first")
        if MAX_ROWS_PER_TICKER is not None and len(df_stock) > MAX_ROWS_PER_TICKER:
            df_stock = df_stock.tail(MAX_ROWS_PER_TICKER)

        # `assign` returns a new frame, which avoids writing into the `tail`
        # slice. Numeric columns are coerced (bad values -> NaN) and downcast
        # to save memory.
        df_stock = (
            df_stock.assign(
                **{
                    PRICE_COL: pd.to_numeric(df_stock[PRICE_COL], errors="coerce", downcast="float"),
                    VOLUME_COL: pd.to_numeric(df_stock[VOLUME_COL], errors="coerce", downcast="float"),
                    "ticker": symbol,
                }
            )
            .set_index(["ticker", "date"])
            .sort_index()
        )

        all_stocks.append(df_stock)

    except Exception as e:
        # Best-effort load: one bad file must not abort the whole sample.
        skipped += 1
        logging.warning(f"Skipping {symbol}: {e}")

# Stack all per-ticker frames into one panel (empty frame when nothing loaded).
df_stocks = pd.concat(all_stocks) if all_stocks else pd.DataFrame()
logging.info(f"Loaded panel shape: {df_stocks.shape}, skipped: {skipped}, rows: {len(df_stocks):,}")

df_stocks.head()

## 2) Run the full pipeline on the sample

Steps:
1. Compute monthly features (characteristics)
2. Compute factor exposures (betas / alpha)
3. Assemble modeling dataset (panel with next_return)
4. Generate OOS predictions
5. Compute decile portfolio returns (and optionally weights)

In [None]:
from pathlib import Path
import logging

from features import FeatureConfig, compute_features
from factor_exposures import ExposureConfig, compute_factor_exposures
from final_dataset import assemble_modeling_dataset
from oos_predictions import generate_oos_predictions_all_models
from portfolios import compute_decile_portfolio_returns, compute_decile_portfolio_weights
from training_scheme import WindowConfig

# Output folder for the sample-run artifacts (relative to the working directory).
DIR_OUTPUT = Path("outputs")
DIR_OUTPUT.mkdir(parents=True, exist_ok=True)

# Configs
feature_cfg = FeatureConfig()
# Keep whatever the config already defines; fall back to the sample's column names.
feature_cfg.price_col = getattr(feature_cfg, "price_col", "adj_close")
feature_cfg.volume_col = getattr(feature_cfg, "volume_col", "volume")

exposure_cfg = ExposureConfig()

# The sample run uses default window settings under its OWN name, so that the
# notebook-level `window_cfg` configured in the setup cell is not silently
# overwritten for the later prediction cell that reads it.
sample_window_cfg = WindowConfig()

standardize_method = "zscore"   # or "rank"

# Input: sampled daily panel
stock_data = df_stocks

# --- 1) Compute features (monthly characteristics) ---
logging.info("Computing features...")
characteristics = compute_features(stock_data, config=feature_cfg)
logging.info(f"Characteristics shape: {characteristics.shape}")

# --- 2) Compute factor exposures (monthly betas/alpha) ---
logging.info("Computing factor exposures...")
factor_loadings = compute_factor_exposures(characteristics, config=exposure_cfg)
logging.info(f"Factor loadings shape: {factor_loadings.shape}")

# --- 3) Assemble final modeling dataset ---
# assemble_modeling_dataset expects:
# - returns: MultiIndex ('ticker','date') with the monthly realized return column
# - characteristics: MultiIndex ('ticker','date') numeric features
# - factor_loadings: MultiIndex ('ticker','date') rolling betas
#
# IMPORTANT: compute_features must return (or include) a monthly return column
# for `returns` to be built here. Most pipelines either:
#   - include the return column in the output already, OR
#   - compute monthly returns separately from adj_close.
#
# Column names come from the notebook-level constants (RETURN_COL, TARGET_COL,
# REALIZED_COL) because the later cells read `returns`, `panel` and
# `predictions` through those same constants.

if RETURN_COL in characteristics.columns:
    returns = characteristics[[RETURN_COL]].copy()
else:
    raise KeyError(
        f"Couldn't find {RETURN_COL!r} in characteristics. "
        "Either modify compute_features to output monthly RET, "
        "or compute monthly returns from adj_close and build returns DataFrame."
    )

# Remove the return column from characteristics so it is not treated as a feature.
characteristics_only = characteristics.drop(columns=[RETURN_COL], errors="ignore")

logging.info("Assembling modeling dataset...")
panel = assemble_modeling_dataset(
    returns=returns,
    characteristics=characteristics_only,
    factor_loadings=factor_loadings,
    return_col=RETURN_COL,
    next_return_col=TARGET_COL,
    standardize=standardize_method,
)

logging.info(f"Final panel shape: {panel.shape}")
panel.to_csv(DIR_OUTPUT / "panel_sample.csv")

# --- 4) OOS predictions ---
# Every panel column except the target is offered to the models as a feature.
feature_cols = [c for c in panel.columns if c != TARGET_COL]

logging.info("Generating OOS predictions...")
predictions = generate_oos_predictions_all_models(
    panel,
    feature_cols,
    target_col=TARGET_COL,
    window_config=sample_window_cfg,
    output_path=DIR_OUTPUT / "predictions_sample.csv",
    realized_col=REALIZED_COL,
)

logging.info(f"Predictions shape: {predictions.shape}")
predictions.head()

## 3) Decile portfolios (sample run)

We form deciles each month based on predictions and compute next-month realized returns.

In [None]:
# Decile returns for the sample predictions; no weighting column is supplied
# (pass a market-cap column as `weight_col` if the predictions carry one).
logging.info("Computing decile portfolio returns...")
decile_returns = compute_decile_portfolio_returns(
    predictions, return_col="realized_return", n_deciles=10, weight_col=None
)
decile_returns.to_csv(DIR_OUTPUT.joinpath("decile_returns_sample.csv"))

# Matching portfolio weights, built with the same decile count and weighting.
logging.info("Computing decile portfolio weights...")
decile_weights = compute_decile_portfolio_weights(predictions, n_deciles=10, weight_col=None)
decile_weights.to_csv(DIR_OUTPUT.joinpath("decile_weights_sample.csv"))

decile_returns.tail()

## 4) Assemble final modeling panel

This creates the final dataset with:
- standardized characteristics
- factor loadings
- `next_return` target (shifted by ticker)

In [None]:
# Build the final modeling panel from the return series, the characteristics
# and the factor loadings.
#
# The return column is stripped from the characteristics before the call: it is
# already supplied through `returns`, and the sample-run cell drops it as well
# so that it is not treated as a feature. `errors="ignore"` keeps the call valid
# when the column is absent.
panel = assemble_modeling_dataset(
    returns=returns,
    characteristics=characteristics.drop(columns=[RETURN_COL], errors="ignore"),
    factor_loadings=factor_loadings,
    return_col=RETURN_COL,
    next_return_col=TARGET_COL,
    standardize=STANDARDIZE,
)

panel.to_csv(DIR_RESULTS / "panel.csv")
panel.head()

## 5) Rolling out-of-sample predictions (multiple models)

This generates one prediction column per model + realized next return.

In [None]:
# Every panel column other than the target is offered to the models as a feature.
feature_cols = [name for name in panel.columns if name != TARGET_COL]

# Out-of-sample predictions for all models; the result is also written to CSV
# through `output_path`.
predictions = generate_oos_predictions_all_models(
    panel,
    feature_cols,
    target_col=TARGET_COL,
    realized_col=REALIZED_COL,
    window_config=window_cfg,
    output_path=DIR_RESULTS.joinpath("predictions.csv"),
)

predictions.head()

## 6) Decile portfolios + long–short

We sort stocks each month into deciles based on each model’s prediction,
then compute the next-month realized return for each decile.

In [None]:
# Decile portfolio returns from the prediction panel. `model_cols=None` leaves
# the choice of prediction columns to the library function.
decile_returns = compute_decile_portfolio_returns(
    predictions,
    model_cols=None,
    return_col=REALIZED_COL,
    n_deciles=N_DECILES,
    weight_col=WEIGHT_COL,
)
decile_returns.to_csv(DIR_RESULTS.joinpath("decile_returns.csv"))

# Portfolio weights built with the same decile count and weighting column.
decile_weights = compute_decile_portfolio_weights(
    predictions,
    model_cols=None,
    n_deciles=N_DECILES,
    weight_col=WEIGHT_COL,
)
decile_weights.to_csv(DIR_RESULTS.joinpath("decile_weights.csv"))

decile_returns.tail()

## 7) Reporting (performance table, significance tests, plots)

This will output:
- performance summary table
- DM test table (forecast accuracy differences)
- Sharpe significance tests
- rolling Sharpe plot
- cumulative return plot
- factor exposure plot (if factor loadings + weights exist)
- feature importance plot (optional)

In [None]:
# Build the comparison report from the portfolio returns, predictions, weights,
# factor loadings and feature panel produced by the cells above.
report = generate_comparison_report(
    decile_returns=decile_returns,
    prediction_panel=predictions,
    portfolio_weights=decile_weights,
    factor_loadings=factor_loadings,
    feature_panel=panel,
    feature_cols=feature_cols,
    realized_col=REALIZED_COL,
    transaction_cost_bps=None,  # e.g. 10 for a 10 bps cost per unit of turnover
    risk_free_rate=0.0,
    periods_per_year=12,
    rolling_sharpe_window=12,
    output_dir=DIR_REPORTS,
)

# Persist each summary table that came back as a DataFrame; entries that are
# missing or of another type are skipped.
for table_name in ("performance_metrics", "dm_table", "sharpe_table"):
    table = report.get(table_name)
    if isinstance(table, pd.DataFrame):
        table.to_csv(DIR_REPORTS / f"{table_name}.csv")

report["performance_metrics"]


## Outputs

- `results/panel.csv`
- `results/predictions.csv`
- `results/decile_returns.csv`
- `results/decile_weights.csv`
- `results/reports/*` (tables + plots)

In [None]:
# Combine predictions with realized returns
panel_with_preds = panel.join([df_ols_pred, df_ridge_pred], how='left')
panel_with_preds = panel_with_preds.rename(columns={'next_return': 'realized_return'})

ranking_cfg = RankingConfig(prediction_col='ridge_prediction', output_col='ridge_rank', basis='zscore')
df_ridge_rank = convert_predictions_to_rankings(panel_with_preds[['ridge_prediction']], config=ranking_cfg)
panel_ranked = panel_with_preds.join(df_ridge_rank)

decile_returns = compute_decile_portfolio_returns(
    panel_ranked,
    model_cols=['ridge_rank', 'ols_prediction'],
    return_col='realized_return',
)
decile_weights = compute_decile_portfolio_weights(
    panel_ranked,
    model_cols=['ridge_rank', 'ols_prediction'],
)
decile_returns.head()

### 7 — Performance summary
Compute annualized performance stats, cumulative returns, and drawdowns for each model/decile combination.

In [None]:
# NOTE(review): `summarize_portfolio_performance` is not imported in any cell
# visible in this notebook — confirm its module and add the import, otherwise
# this cell fails on a fresh kernel.
# Summarize the decile portfolios; the call returns three objects that are
# unpacked as metrics, cumulative returns and drawdowns. Weights are passed for
# turnover, with zero transaction cost and 12 periods per year.
metrics, cumulative, drawdowns = summarize_portfolio_performance(
    decile_returns,
    turnover_weights=decile_weights,
    transaction_cost_bps=0,
    periods_per_year=12,
)
metrics.head()