# Cross-Sectional Return Prediction and Portfolio Construction

This project studies cross-sectional stock return predictability using firm characteristics, factor exposures, and machine learning models. Monthly firm-level features are constructed, factor loadings are estimated using rolling regressions, and out-of-sample return predictions are generated. Portfolio performance is evaluated through decile portfolios and long–short strategies.

## 0. Setup and Configuration

This section defines paths, global parameters, and imports all project modules.

In [None]:
from pathlib import Path
import logging
import pandas as pd
import numpy as np

# Project modules
from features import FeatureConfig, compute_features
from factor_exposures import ExposureConfig, compute_factor_exposures
from final_dataset import assemble_modeling_dataset
from training_scheme import WindowConfig
from oos_predictions import generate_oos_predictions_all_models
from portfolios import compute_decile_portfolio_returns, compute_decile_portfolio_weights
from reporting import generate_comparison_report, run_plot_suite, plot_long_short_rank_vs_raw
from ranking import RankingConfig, convert_predictions_to_rankings
from performance_metrics import summarize_portfolio_performance

# Timestamped INFO-level progress messages for every cell below.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# --- Input locations (relative to the working directory) ---
DIR_DATA = Path("../data")
DIR_STOCK_DATA = DIR_DATA.joinpath("stock_data")            # one <ticker>.csv per stock
PATH_META = DIR_DATA.joinpath("metadata_listed.csv")
PATH_META_DELISTED = DIR_DATA.joinpath("metadata_delisted.csv")
PATH_FACTORS = DIR_DATA.joinpath("factors.csv")

# --- Output locations, created up front so later cells can write freely ---
DIR_OUTPUT = Path("outputs")
DIR_REPORTS = Path("reports")
DIR_OUTPUT.mkdir(exist_ok=True, parents=True)
DIR_REPORTS.mkdir(exist_ok=True, parents=True)

# --- Data-loading controls (dev run) ---
P_SAMPLE = 1                  # share of tickers kept by the sampler; 1 keeps all
MAX_ROWS_PER_TICKER = 1500    # cap on daily rows kept per ticker; None disables it
PRICE_COL = "adj_close"
VOLUME_COL = "volume"

# --- Modeling and portfolio settings ---
STANDARDIZE_METHOD = "zscore"
REALIZED_COL = "realized_return"
N_DECILES = 10
WEIGHT_COL = None             # presumably equal weighting when None; confirm in portfolios module

## 1. Daily Stock Data

We load daily adjusted prices and volumes for a hash-based subsample of listed and delisted stocks
(the kept share is set by `P_SAMPLE`; a value of 1 keeps every ticker), then construct a clean daily panel indexed by (ticker, date).

In [None]:
# Metadata for currently listed and for delisted securities.
df_meta = pd.read_csv(PATH_META)
df_meta_delisted = pd.read_csv(PATH_META_DELISTED)

# Universe = distinct, non-missing symbols from both files, as strings.
tickers = (
    pd.concat([df_meta["Symbol"], df_meta_delisted["Symbol"]], ignore_index=True)
    .dropna()
    .astype(str)
    .unique()
)

def keep_ticker(t: str, p: float) -> bool:
    """Decide whether ticker ``t`` belongs to a pseudo-random sample of share ``p``.

    The ticker is mapped to one of 10,000 equally sized buckets using a CRC-32
    checksum of its UTF-8 bytes, and is kept when its bucket falls below ``p``.
    A checksum is used instead of the built-in ``hash()`` because string hashes
    are salted per interpreter process, which would select a different sample
    on every run.

    Args:
        t: Ticker symbol.
        p: Target share of tickers to keep, in ``[0, 1]``. ``p >= 1`` keeps
            every ticker and ``p <= 0`` keeps none.

    Returns:
        ``True`` if the ticker is part of the sample, ``False`` otherwise.
    """
    import zlib  # local import keeps this cell self-contained

    bucket = zlib.crc32(t.encode("utf-8")) % 10_000
    return bucket / 10_000 < p

# Keep only the tickers selected by the sampler.
tickers_sample = [t for t in tickers if keep_ticker(t, P_SAMPLE)]
logging.info(f"Selected {len(tickers_sample)} tickers out of {len(tickers)} (~{P_SAMPLE:.0%}).")

all_stocks = []  # one (ticker, date)-indexed frame per successfully loaded ticker
skipped = 0      # tickers with no file on disk, or whose file failed to load
usecols = ["date", PRICE_COL, VOLUME_COL]

for symbol in tickers_sample:
    path_stock = DIR_STOCK_DATA / f"{symbol}.csv"
    if not path_stock.exists():
        skipped += 1
        continue

    try:
        df_stock = pd.read_csv(
            path_stock,
            # Callable filter: read whichever of the wanted columns the file has.
            usecols=lambda c: c in usecols,
            parse_dates=["date"],
        )

        # Order chronologically BEFORE truncating, so that tail() keeps the most
        # recent rows even when the file is not stored in date order. Rows with
        # a missing date sort first and are therefore the first to be cut.
        df_stock = df_stock.sort_values("date", na_position="first")
        if MAX_ROWS_PER_TICKER is not None and len(df_stock) > MAX_ROWS_PER_TICKER:
            df_stock = df_stock.tail(MAX_ROWS_PER_TICKER)

        # Non-numeric entries become NaN. NOTE: downcast="float" may store the
        # values as float32 to save memory, at reduced precision.
        df_stock[PRICE_COL] = pd.to_numeric(df_stock[PRICE_COL], errors="coerce", downcast="float")
        df_stock[VOLUME_COL] = pd.to_numeric(df_stock[VOLUME_COL], errors="coerce", downcast="float")

        df_stock["ticker"] = symbol
        df_stock = df_stock.set_index(["ticker", "date"]).sort_index()

        all_stocks.append(df_stock)

    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the run.
        skipped += 1
        logging.warning(f"Skipping {symbol}: {e}")

# Stack the per-ticker frames; an empty frame is the fallback when nothing loaded.
df_stocks = pd.concat(all_stocks) if all_stocks else pd.DataFrame()
logging.info(f"Loaded panel shape: {df_stocks.shape}, skipped: {skipped}, rows: {len(df_stocks):,}")

## 2. Monthly Returns

Daily prices are aggregated to month-end prices, and monthly returns are computed
as simple percentage changes.

In [None]:
logging.info("Building monthly returns...")

# Last available price within each calendar month, per ticker ("ME" = month-end bins).
monthly_price = df_stocks[[PRICE_COL]].rename(columns={PRICE_COL: "price"})
monthly_price = monthly_price.groupby(level="ticker").resample("ME", level="date").last()

# Simple month-over-month return per ticker; fill_method=None leaves missing
# prices as NaN instead of forward-filling them.
monthly_ret = monthly_price["price"].groupby(level="ticker").pct_change(fill_method=None)

# Keep only defined returns, as a one-column frame named "RET".
returns = monthly_ret.dropna().to_frame(name="RET")

# Rebuild the index so the "date" level is guaranteed to be datetime-typed.
returns.index = pd.MultiIndex.from_arrays(
    [
        returns.index.get_level_values("ticker"),
        pd.to_datetime(returns.index.get_level_values("date")),
    ],
    names=["ticker", "date"],
)

logging.info(f"Monthly returns shape: {returns.shape}")

## 3. Factor Returns

We load monthly Fama–French-style factor returns used to estimate rolling factor exposures.

In [None]:
# Read the factor return file, then index it chronologically by date.
factors = pd.read_csv(PATH_FACTORS, parse_dates=["date"])
factors = factors.set_index("date").sort_index()
logging.info(f"Factors shape: {factors.shape}")

## 4. Feature Engineering and Factor Exposures

Firm characteristics are constructed from daily data and aligned to month-end.
Rolling factor exposures are estimated using historical excess returns.

In [None]:
# Configuration objects: project defaults, with the feature builder pointed at
# the price and volume columns chosen in the setup cell.
feature_cfg = FeatureConfig()
feature_cfg.price_col, feature_cfg.volume_col = PRICE_COL, VOLUME_COL

exposure_cfg, window_cfg = ExposureConfig(), WindowConfig()

# Firm characteristics from the daily panel.
logging.info("Computing features...")
characteristics = compute_features(df_stocks, config=feature_cfg)

# Factor loadings from monthly stock returns and factor returns.
logging.info("Computing factor exposures...")
factor_loadings = compute_factor_exposures(returns, factors, config=exposure_cfg)

# Merge returns, characteristics and loadings into one modeling table.
logging.info("Assembling modeling dataset...")
panel = assemble_modeling_dataset(
    returns=returns, characteristics=characteristics, factor_loadings=factor_loadings,
    return_col="RET", next_return_col="next_return",
    standardize=STANDARDIZE_METHOD,
)

# Persist the assembled panel for inspection.
panel.to_csv(DIR_OUTPUT.joinpath("panel_sample.csv"))

## 5. Out-of-Sample Prediction

We generate rolling out-of-sample predictions using multiple model classes:
OLS, regularized linear models, and random forests.

In [None]:
# Every panel column except the prediction target is used as a model input.
feature_cols = panel.columns[panel.columns != "next_return"].tolist()

logging.info("Generating OOS predictions...")
predictions = generate_oos_predictions_all_models(
    panel, feature_cols,
    target_col="next_return",
    window_config=window_cfg,
    output_path=DIR_OUTPUT.joinpath("predictions_sample.csv"),
    realized_col=REALIZED_COL,
)

## 6. Portfolio Construction

Predictions are sorted into decile portfolios each month.
We evaluate both equal-weighted and long–short strategies.

In [None]:
# One signal column per model: everything in `predictions` except the realized return.
MODEL_COLS = [name for name in predictions.columns if name != REALIZED_COL]

# Per-decile realized returns for each model's signal.
logging.info("Computing decile portfolio returns and weights...")
decile_returns = compute_decile_portfolio_returns(
    predictions, model_cols=MODEL_COLS, return_col=REALIZED_COL,
    weight_col=WEIGHT_COL, n_deciles=N_DECILES,
)

# Matching per-decile holdings.
logging.info("Computing decile portfolio weights...")
decile_weights = compute_decile_portfolio_weights(
    predictions, model_cols=MODEL_COLS,
    weight_col=WEIGHT_COL, n_deciles=N_DECILES,
)

## 7. Performance Evaluation

We evaluate portfolio performance using:
- Annualized returns and Sharpe ratios
- Long–short cumulative returns
- Diebold–Mariano forecast comparison tests

In [None]:
# Build the model comparison report: monthly data (12 periods per year),
# zero risk-free rate, no transaction costs.
report = generate_comparison_report(
    decile_returns=decile_returns, prediction_panel=predictions,
    portfolio_weights=decile_weights, factor_loadings=factor_loadings,
    feature_panel=panel, feature_cols=feature_cols,
    transaction_cost_bps=None, realized_col=REALIZED_COL,
    risk_free_rate=0.0, periods_per_year=12, rolling_sharpe_window=12,
    output_dir=DIR_REPORTS,
)

# Export the headline tables, one CSV per table, named after its report key.
for table_name in ("performance_metrics", "dm_table", "sharpe_table"):
    report[table_name].to_csv(DIR_REPORTS / f"{table_name}.csv")

## 8. Rank-Based Signals vs Raw Predictions

Compare long–short (top–bottom decile) performance when portfolios are formed on:
- raw model predictions (e.g., ridge_pred), versus 
- cross-sectional rank-transformed signals (ridge_rank).

### 8.1 Join predictions into panel + set realized_return

In [None]:
# Attach model predictions to the modeling panel.
logging.info("Joining predictions into panel for further analysis...")

# Drop the predictions' own realized-return column (if present) so the join
# only adds model outputs.
preds_for_join = predictions.drop(columns=["realized_return"], errors="ignore")

# Left-join on the shared index, then expose the panel's next_return under the
# name used for portfolio construction.
panel_with_preds = panel.join(preds_for_join, how="left").rename(
    columns={"next_return": "realized_return"}
)

# Limit monthly realized returns to +/-100% so single observations cannot
# dominate portfolio returns.
REALIZED_CAP = 1.0
panel_with_preds["realized_return"] = panel_with_preds["realized_return"].clip(
    -REALIZED_CAP, REALIZED_CAP
)

# Turn ridge_pred into a cross-sectional signal stored as ridge_rank.
ranking_cfg = RankingConfig(
    prediction_col="ridge_pred",
    output_col="ridge_rank",
    basis="zscore",
)
df_ridge_rank = convert_predictions_to_rankings(
    panel_with_preds[["ridge_pred"]],
    config=ranking_cfg,
)
panel_ranked = panel_with_preds.join(df_ridge_rank, how="left")
panel_ranked.head()

### 8.2 Decile returns & weights for ridge_rank vs ols_pred

In [None]:
# Decile returns for the rank-transformed ridge signal and the raw OLS prediction.
logging.info("Computing decile portfolio returns based on ranked predictions...")
decile_returns_rank = compute_decile_portfolio_returns(
    panel_ranked, model_cols=["ridge_rank", "ols_pred"],
    return_col="realized_return", n_deciles=N_DECILES,
)

# Matching decile holdings for the same two signals.
logging.info("Computing decile portfolio weights based on ranked predictions...")
decile_weights_rank = compute_decile_portfolio_weights(
    panel_ranked, model_cols=["ridge_rank", "ols_pred"], n_deciles=N_DECILES,
)

### 8.3 Summarize performance (with turnover and 25 bps transaction costs)

In [None]:
# Performance summary on monthly data, charging 25 bps with turnover taken
# from the decile weights.
logging.info("Summarizing portfolio performance (with transaction costs) based on ranked predictions...")
metrics_rank, cumulative_rank, drawdowns_rank = summarize_portfolio_performance(
    decile_returns_rank, turnover_weights=decile_weights_rank,
    transaction_cost_bps=25, periods_per_year=12,
)
metrics_rank.head()

### 8.4 Standard plot suite (also writes the plots and tc_summary.csv)

In [None]:
# Run the standard plot suite on the rank-based decile portfolios, across
# several transaction-cost levels, writing its output under DIR_REPORTS.
# The log message previously repeated the performance-summary text from the
# preceding cell; it now describes this step.
logging.info("Running standard plot suite for portfolios based on ranked predictions...")
# NOTE(review): `predictions` has no `ridge_rank` column and its realized
# returns are not clipped, while the decile inputs were built from
# `panel_ranked`. Confirm that run_plot_suite does not need them to match.
out = run_plot_suite(
    predictions=predictions,
    decile_returns=decile_returns_rank,
    decile_weights=decile_weights_rank,
    realized_col="realized_return",
    start_date="1995-01-31",
    cost_bps_list=[0, 10, 25, 50],
    example_model="ols_pred",
    output_dir=DIR_REPORTS,
)

### 8.5 Rank vs raw LS comparison plot (ridge_pred vs ridge_rank)

In [None]:
# Long-short comparison plot: raw ridge_pred deciles against rank-based
# ridge_rank deciles, on a log scale, saved as a PNG in the reports folder.
logging.info("Plotting long-short portfolio returns: ranked vs raw predictions...")
out_ls = plot_long_short_rank_vs_raw(
    decile_returns_raw=decile_returns, decile_returns_rank=decile_returns_rank,
    raw_model="ridge_pred", rank_model="ridge_rank",
    start_date="1995-01-31", log_scale=True,
    output_path=DIR_REPORTS.joinpath("ls_rank_vs_raw_ridge.png"),
)