In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from functools import lru_cache
import re

In [2]:
# Load the annual financial-statement panel from a project-relative CSV.
# NOTE(review): the preview further down shows "Unnamed: 0" and "Unnamed: 0.1" columns,
# which looks like saved index columns being read back as data — confirm and consider
# dropping them (or passing index_col) when loading.
df_statements = pd.read_csv("data/annual_statements.csv")

In [3]:
# Parse the period-end date into datetimes so rows can be ordered chronologically.
# Re-running this cell is harmless: already-parsed values convert to themselves.
df_statements['datadate'] = pd.to_datetime(df_statements['datadate'])

In [4]:
# Preview the first rows to check the columns that the decile construction relies on.
df_statements.head()

Unnamed: 0.1,Unnamed: 0,gvkey,tic,fyear,datadate,conm,revt,ni,emp,prcc_f,ceq,oancf,at,dltt,act,lct,csho,cogs
0,0,1004,AIR,2002.0,2003-05-31,AAR CORP,606.337,-12.41,2.1,4.5,294.988,34.733,686.621,164.658,396.412,203.575,31.851,496.747
1,1,1004,AIR,2003.0,2004-05-31,AAR CORP,651.958,3.504,2.3,9.58,301.684,14.572,709.292,248.666,432.204,131.261,32.245,523.302
2,2,1004,AIR,2004.0,2005-05-31,AAR CORP,747.848,15.453,2.6,16.04,314.744,50.938,732.23,227.159,474.542,160.025,32.586,598.172
3,3,1004,AIR,2005.0,2006-05-31,AAR CORP,897.284,35.163,3.3,24.08,422.717,-40.482,978.819,318.576,624.454,187.788,36.654,704.081
4,4,1004,AIR,2006.0,2007-05-31,AAR CORP,1061.169,58.66,3.9,32.5,494.243,-21.239,1067.633,253.611,645.721,256.506,37.729,837.171


In [5]:
def build_decile_memberships(df_statements: pd.DataFrame,
                             id_col: str = 'tic',
                             n_bins: int = 10):
    """Sort firms into characteristic-ranked bins for every formation year.

    Four characteristics are computed from the annual statements:

    * ``bm`` - book-to-market      (``ceq``   / market equity)
    * ``ep`` - earnings-to-price   (``ni``    / market equity)
    * ``cp`` - cash-flow-to-price  (``oancf`` / market equity)
    * ``gs`` - mean year-over-year revenue growth over the five fiscal years
      *before* the current one

    Market equity is ``prcc_f * csho`` and is treated as missing unless it is
    strictly positive. Rows of fiscal year ``fyear`` are assigned to formation
    year ``fyear + 1``. Within each formation year the firms with a value for
    the characteristic are ranked ascending and cut into ``n_bins`` equal-count
    bins, so bin 1 holds the lowest values and bin ``n_bins`` the highest.

    Parameters
    ----------
    df_statements : pd.DataFrame
        Must contain ``gvkey, tic, fyear, datadate, revt, ni, oancf, ceq,
        prcc_f, csho`` plus ``id_col``. The frame is copied, not modified.
    id_col : str, default 'tic'
        Column whose values are stored in the membership lists.
    n_bins : int, default 10
        Number of bins per formation year.

    Returns
    -------
    dict
        ``result[char][formation_year][bin] -> list of id_col values`` with
        ``char`` in ``{'bm', 'ep', 'cp', 'gs'}``, integer ``formation_year`` and
        ``bin`` in ``1..n_bins``. Formation years with fewer than ``n_bins``
        distinct characteristic values are omitted. Rows whose ``id_col`` is
        missing still take part in the ranking but are left out of the lists.

    Raises
    ------
    ValueError
        If a required column is missing from ``df_statements``.
    """
    df = df_statements.copy()

    # --- Basic sanity ---
    # id_col is validated too, so a bad name fails here rather than deep in the ranking.
    req = {'gvkey','tic','fyear','datadate','revt','ni','oancf','ceq','prcc_f','csho', id_col}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"df_statements missing columns: {missing}")

    # Chronological order within each firm is what makes shift()/rolling() below
    # mean "previous fiscal year(s)".
    df['datadate'] = pd.to_datetime(df['datadate'])
    df = df.sort_values(['gvkey','datadate']).reset_index(drop=True)

    # --- Market equity & core ratios (at fiscal-year end) ---
    df['ME'] = df['prcc_f'] * df['csho']   # assumes price $/share and shares in millions
    # Zero, negative or missing ME cannot serve as a denominator.
    df.loc[~(df['ME'] > 0), 'ME'] = np.nan

    df['bm'] = df['ceq']   / df['ME']              # Book-to-Market
    df['ep'] = df['ni']    / df['ME']              # Earnings-to-Price
    df['cp'] = df['oancf'] / df['ME']              # CashFlow-to-Price

    # --- GS: pre-formation 5y average sales growth ---
    df['revt_lag'] = df.groupby('gvkey')['revt'].shift(1)
    # A growth rate is only meaningful against a strictly positive base: a zero base
    # gives +/-inf and a negative base flips the sign, so both are treated as missing.
    positive_base = df['revt_lag'].where(df['revt_lag'] > 0)
    df['sales_growth'] = (df['revt'] / positive_base) - 1.0
    # shift(1) excludes the current fiscal year; min_periods=5 demands a full 5-year
    # history. transform() returns a result aligned to df's own index, independent of
    # how the installed pandas version labels groupby.apply output.
    df['gs'] = (df.groupby('gvkey')['sales_growth']
                  .transform(lambda s: s.shift(1).rolling(5, min_periods=5).mean()))

    # --- Formation year (end of April Y uses fyear=Y-1 financials) ---
    # NOTE(review): a row such as fyear 2002 with datadate 2003-05-31 maps to formation
    # year 2003 although its period ends after April 2003 — confirm whether a
    # reporting-lag filter is wanted to avoid look-ahead.
    df['formation_year'] = df['fyear'].astype('Int64') + 1

    keep_cols = list(dict.fromkeys(
        ['formation_year', 'gvkey', 'tic', id_col, 'bm', 'ep', 'cp', 'gs']))
    chars = df[keep_cols].dropna(subset=['formation_year'])

    def one_char_deciles(chars_df: pd.DataFrame, char: str):
        """Return {formation_year: {bin: [ids]}} for a single characteristic."""
        cols = list(dict.fromkeys(['formation_year', 'gvkey', id_col, char]))
        cs = chars_df[cols].copy()
        # Infinite values are not rankable information; treat them as missing.
        cs[char] = cs[char].replace([np.inf, -np.inf], np.nan)
        cs = cs.dropna(subset=[char])

        by_year = {}
        for Y, g in cs.groupby('formation_year'):
            # Need enough distinct values to form the requested number of bins.
            if g[char].nunique() < n_bins:
                continue
            # Cut on ranks (ties broken by row order) so bin edges are always unique.
            rk = g[char].rank(method='first')
            try:
                bins = pd.qcut(rk, n_bins, labels=False) + 1  # 1..n_bins
            except ValueError:
                # Still not enough spread to cut: skip this formation year.
                continue

            # Every bin gets its own list, so an empty bin is [] rather than absent.
            members = {b: [] for b in range(1, n_bins + 1)}
            for b, ident in zip(bins, g[id_col]):
                if pd.notna(ident):
                    members[int(b)].append(ident)
            by_year[int(Y)] = members
        return by_year

    return {
        'bm' : one_char_deciles(chars, 'bm'),
        'ep' : one_char_deciles(chars, 'ep'),
        'cp' : one_char_deciles(chars, 'cp'),
        'gs' : one_char_deciles(chars, 'gs'),
    }
    
# Build memberships for all four characteristics (bm, ep, cp, gs) using the defaults:
# tickers as identifiers and 10 bins per formation year.
deciles_by_year = build_decile_memberships(df_statements)

In [None]:
# Example: the first five tickers in B/M decile 1 for formation year 2020
# (formation year 2020 is ranked on fyear 2019 statements).
# Ranking is ascending, so decile 1 holds the lowest B/M firms, i.e. glamour stocks.
deciles_by_year['bm'][2020][1][:5]

['AAL', 'WDDD', 'PBAJ', 'ANDR', 'BA']

In [134]:
# Spot-check: the five post-formation annual returns for one ticker and formation year.
# NOTE(review): `postformation_returns_5y_yf` is not defined in any cell visible here and
# this cell's execution count (134) is out of sequence — confirm its defining cell sits
# above this one so the notebook survives Restart Kernel -> Run All.
postformation_returns_5y_yf("AAPL", 2014)

[np.float64(0.46803250995942114),
 np.float64(-0.2192882925142451),
 np.float64(0.5620805792824903),
 np.float64(0.24264308234087562),
 np.float64(-0.04887947443331486)]

In [135]:
@lru_cache(maxsize=None)
def _pf5y_cached(ticker: str, formation_year: int):
    try:
        out = postformation_returns_5y_yf(ticker, formation_year)
        # ensure 5-length list of floats/NaN
        if out is None or len(out) != 5:
            return [np.nan]*5
        return [float(x) if x is not None else np.nan for x in out]
    except Exception:
        return [np.nan]*5

def _cohort_equal_weight_R1toR5(tickers, formation_year):
    """Equal-weight R1..R5 across tickers in the decile formed at year=formation_year."""
    if not tickers:
        return [np.nan]*5
    rets = np.array([_pf5y_cached(t, formation_year) for t in tickers], dtype=float)
    # drop tickers missing any of the 12 months for that R_t (NaN in that column)
    print('rets', rets, tickers, formation_year)
    with np.errstate(all='ignore'):
        out = np.nanmean(rets, axis=0)  # equal-weight across names
    return out.tolist()

def build_panel_for_char(deciles_by_year, char_key='bm', deciles=10, years=None):
    """Average post-formation returns by decile across formation years.

    For every formation year and decile, the cohort's equal-weighted R1..R5 is
    obtained from ``_cohort_equal_weight_R1toR5``; the cohort results are then
    averaged across formation years, skipping NaN entries cell by cell.

    Parameters
    ----------
    deciles_by_year : dict
        ``deciles_by_year[char_key][formation_year][decile] -> list of tickers``.
    char_key : str, default 'bm'
        Which characteristic's memberships to use.
    deciles : int, default 10
        Number of decile columns; deciles ``1..deciles`` are looked up.
    years : iterable of int, optional
        Formation years to include. Years absent from the memberships are
        ignored. ``None`` means every available year.

    Returns
    -------
    panel_df : pd.DataFrame
        5 x ``deciles`` frame; rows "Year 1".."Year 5", columns "1".."<deciles>".
    ar : pd.Series
        Per-decile mean of the available R1..R5 values (name "AR").
    cr5 : pd.Series
        Per-decile ``prod(1 + R_t) - 1`` over the five years (name "CR_5y");
        NaN if any of the five years is NaN.
    """
    by_year = deciles_by_year[char_key]  # e.g., deciles_by_year['bm'] = {Y: {1:[tics],...,10:[tics]}}
    Ys_all = sorted(by_year.keys())
    if years is None:
        years = Ys_all
    else:
        years = [y for y in years if y in by_year]

    # Running sum and observation count per (post-formation year, decile) cell.
    sum_R = np.zeros((5, deciles), dtype=float)
    cnt_R = np.zeros((5, deciles), dtype=float)

    for Y in years:
        for d in range(1, deciles+1):
            tickers = by_year.get(Y, {}).get(d, [])
            r1to5 = np.array(_cohort_equal_weight_R1toR5(tickers, Y), dtype=float)
            # Only years with data contribute, so a cohort lacking e.g. R5 still
            # counts towards R1..R4.
            mask = ~np.isnan(r1to5)
            sum_R[mask, d-1] += r1to5[mask]
            cnt_R[mask, d-1] += 1.0

    # Cells with no observations become NaN instead of dividing by zero.
    panel = sum_R / np.where(cnt_R == 0, np.nan, cnt_R)
    panel_df = pd.DataFrame(panel, index=[f"Year {i}" for i in range(1,6)],
                            columns=[str(d) for d in range(1, deciles+1)])

    # AR: NaN-aware mean over R1..R5. Computed from counts so a decile with no data
    # yields NaN quietly rather than a "Mean of empty slice" RuntimeWarning.
    has_value = ~np.isnan(panel)
    n_obs = has_value.sum(axis=0)
    ar_vals = np.full(deciles, np.nan)
    np.divide(np.where(has_value, panel, 0.0).sum(axis=0), n_obs,
              out=ar_vals, where=n_obs > 0)
    ar = pd.Series(ar_vals, index=panel_df.columns, name="AR")

    # CR(5y): compounded 5y return with annual rebalancing: prod_t (1+R_t) - 1
    cr5 = pd.Series(np.prod(1.0 + panel, axis=0) - 1.0, index=panel_df.columns, name="CR_5y")

    return panel_df, ar, cr5

In [None]:
# Build the B/M panel: average post-formation returns R1..R5 for each decile, together
# with the per-decile AR and 5-year compounded return series.
panel_bm, ar_bm, cr5_bm = build_panel_for_char(deciles_by_year, char_key='bm')

print("Panel A: B/M (R1..R5 by decile):")
print(panel_bm.round(3))