In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf

In [None]:
# Load the annual financial-statement panel from a CSV file (path is relative to the notebook's working directory).
df_statements = pd.read_csv("data/annual_statements.csv")

In [None]:
# Parse 'datadate' into pandas datetimes; this overwrites the column on df_statements in place.
df_statements['datadate'] = pd.to_datetime(df_statements['datadate'])

In [4]:
# Preview the first rows of the panel to check the columns after loading and date parsing.
df_statements.head()

Unnamed: 0.1,Unnamed: 0,gvkey,tic,fyear,datadate,conm,revt,ni,emp,prcc_f,ceq,oancf,at,dltt,act,lct,csho,cogs
0,0,1004,AIR,2002.0,2003-05-31,AAR CORP,606.337,-12.41,2.1,4.5,294.988,34.733,686.621,164.658,396.412,203.575,31.851,496.747
1,1,1004,AIR,2003.0,2004-05-31,AAR CORP,651.958,3.504,2.3,9.58,301.684,14.572,709.292,248.666,432.204,131.261,32.245,523.302
2,2,1004,AIR,2004.0,2005-05-31,AAR CORP,747.848,15.453,2.6,16.04,314.744,50.938,732.23,227.159,474.542,160.025,32.586,598.172
3,3,1004,AIR,2005.0,2006-05-31,AAR CORP,897.284,35.163,3.3,24.08,422.717,-40.482,978.819,318.576,624.454,187.788,36.654,704.081
4,4,1004,AIR,2006.0,2007-05-31,AAR CORP,1061.169,58.66,3.9,32.5,494.243,-21.239,1067.633,253.611,645.721,256.506,37.729,837.171


In [5]:
def build_decile_memberships(df_statements: pd.DataFrame,
                             id_col: str = 'tic',
                             n_bins: int = 10):
    """Sort firms into quantile bins on four characteristics for every formation year.

    Characteristics computed from the statement panel:
      * 'bm' : book-to-market        (ceq   / market equity)
      * 'ep' : earnings-to-price     (ni    / market equity)
      * 'cp' : cash-flow-to-price    (oancf / market equity)
      * 'gs' : average year-over-year revenue growth over the 5 rows preceding
               the current one within each gvkey (the current row is excluded)

    Parameters
    ----------
    df_statements : pd.DataFrame
        One row per firm-year. Must contain the columns gvkey, tic, fyear,
        datadate, revt, ni, oancf, ceq, prcc_f, csho and `id_col`.
        The input frame is not modified.
    id_col : str
        Column whose values are collected as the members of each bin.
    n_bins : int
        Number of quantile bins (10 -> deciles). Bin 1 holds the lowest
        values of the characteristic, bin `n_bins` the highest.

    Returns
    -------
    dict
        result[char][formation_year][bin] -> list of `id_col` values, where
        char is one of 'bm', 'ep', 'cp', 'gs' and formation_year = fyear + 1.
        Years with fewer than `n_bins` distinct values of a characteristic
        are omitted for that characteristic.

    Raises
    ------
    ValueError
        If any required column (including `id_col`) is missing.
    """
    df = df_statements.copy()

    # --- Basic sanity ---
    # id_col is part of the contract too: without it the membership lists
    # cannot be built, so fail here with a clear message.
    req = {'gvkey', 'tic', 'fyear', 'datadate', 'revt', 'ni', 'oancf',
           'ceq', 'prcc_f', 'csho', id_col}
    missing = req - set(df.columns)
    if missing:
        raise ValueError(f"df_statements missing columns: {missing}")

    # Lags and rolling windows below are positional, so rows must be in
    # chronological order within each gvkey.
    # NOTE(review): lags are row-based; a gap in a firm's fiscal years is not detected.
    df['datadate'] = pd.to_datetime(df['datadate'])
    df = df.sort_values(['gvkey', 'datadate']).reset_index(drop=True)

    # --- Market equity & core ratios (at fiscal-year end) ---
    df['ME'] = df['prcc_f'] * df['csho']   # assumes price $/share and shares in millions
    # Non-positive or missing ME would give meaningless ratios -> NaN
    df.loc[~(df['ME'] > 0), 'ME'] = np.nan

    df['bm'] = df['ceq'] / df['ME']      # Book-to-Market
    df['ep'] = df['ni'] / df['ME']       # Earnings-to-Price
    df['cp'] = df['oancf'] / df['ME']    # CashFlow-to-Price

    # --- GS: pre-formation 5y average sales growth ---
    df['revt_lag'] = df.groupby('gvkey')['revt'].shift(1)
    growth = df['revt'] / df['revt_lag'] - 1.0
    # Growth is undefined on a zero/negative base; mask it explicitly rather
    # than letting +/-inf flow into the average.
    df['sales_growth'] = growth.where(df['revt_lag'] > 0)
    # shift(1) drops the current year's reading; min_periods=5 demands a full
    # 5-observation history. transform() returns a result aligned to df's
    # index, so no manual index surgery is needed.
    df['gs'] = (df.groupby('gvkey')['sales_growth']
                  .transform(lambda s: s.shift(1).rolling(5, min_periods=5).mean()))

    # --- Formation year (end of April Y uses fyear=Y-1 financials) ---
    df['formation_year'] = df['fyear'].astype('Int64') + 1

    # Keep the columns we need (dict.fromkeys de-duplicates while keeping order,
    # e.g. when id_col is 'tic' or 'gvkey').
    keep = list(dict.fromkeys(
        ['formation_year', 'gvkey', 'tic', id_col, 'bm', 'ep', 'cp', 'gs', 'ME']))
    chars = df[keep].dropna(subset=['formation_year'])

    def one_char_deciles(chars_df: pd.DataFrame, char: str):
        """Return {formation_year: {bin: [ids]}} for a single characteristic."""
        cols = list(dict.fromkeys(['formation_year', id_col, char]))
        cs = chars_df[cols].dropna(subset=[char])

        deciles_by_year = {}
        for Y, g in cs.groupby('formation_year'):
            # Need enough distinct values to form the bins
            if g[char].nunique() < n_bins:
                continue
            # qcut on ranks (ties broken by row order) so bin edges are never duplicated
            rk = g[char].rank(method='first')
            try:
                bins = pd.qcut(rk, n_bins, labels=False) + 1  # 1..n_bins
            except ValueError:
                # fallback: skip if still not enough spread
                continue

            # Every bin gets its own fresh list, so callers can mutate one
            # bin without affecting another.
            members = {b: [] for b in range(1, n_bins + 1)}
            for b, ident in zip(bins, g[id_col]):
                members[int(b)].append(ident)
            deciles_by_year[int(Y)] = members
        return deciles_by_year

    return {char: one_char_deciles(chars, char) for char in ('bm', 'ep', 'cp', 'gs')}
    
# Build the memberships with the function defaults (id_col='tic', n_bins=10).
deciles_by_year = build_decile_memberships(df_statements)

In [6]:
# Distinct values of the 'tic' column, as a plain Python list.
all_tickers = df_statements['tic'].unique().tolist()


In [None]:
def _compound(returns: pd.Series) -> float:
    returns = returns.dropna()
    if returns.empty:
        return np.nan
    return ((1.0 + returns).prod() - 1.0).iloc[0]

def postformation_returns_5y_yf(ticker: str,
                                formation_year: int,
                                window: str = "lsv",
                                start_buffer_years: int = 1,
                                end_buffer_years: int = 6):
    """
    Compute 5 post-formation annual returns for a ticker using Yahoo monthly data.

    Monthly prices are downloaded with `auto_adjust=True`, turned into simple
    monthly returns, and compounded over five consecutive 12-month windows
    starting in `formation_year`, `formation_year + 1`, ..., `formation_year + 4`.

    Parameters
    ----------
    ticker : str
        e.g., "AAPL"
    formation_year : int
        Year in which the portfolio is formed; the first window starts in
        this calendar year.
    window : {"lsv", "ff"}
        "lsv": compound JUN(Y) .. MAY(Y+1)
        "ff" : compound JUL(Y) .. JUN(Y+1)
        NOTE(review): an earlier version of this function documented the LSV
        window as MAY(Y) .. APR(Y+1) following end-of-April formation; the
        code compounds JUN .. MAY. Confirm which convention is intended.
    start_buffer_years, end_buffer_years : int
        Download range padding around formation_year for safety.

    Returns
    -------
    list[float] : [R1, R2, R3, R4, R5]
        All NaN when the download returns no data. A window with no
        observations yields NaN; a window with only some of its months
        available is compounded over the months present.

    Raises
    ------
    ValueError
        If `window` is not "lsv" or "ff".
    """
    # (first month in year y0, last month in year y0 + 1) for each convention
    month_bounds = {"lsv": ("06", "05"), "ff": ("07", "06")}
    if window not in month_bounds:
        # Fail before hitting the network: an unrecognised value (e.g. "FF")
        # used to be treated as "lsv" without any warning.
        raise ValueError(
            f"window must be one of {sorted(month_bounds)}, got {window!r}")
    first_month, last_month = month_bounds[window]

    # 1) Download monthly prices (auto_adjust=True requests adjusted prices)
    start = f"{formation_year - start_buffer_years}-01-01"
    end   = f"{formation_year + end_buffer_years}-12-31"
    df = yf.download(ticker, start=start, end=end,
                     interval="1mo", auto_adjust=True, progress=False)

    # Guard against a missing or empty result
    if df is None or df.empty:
        return [np.nan]*5

    # 2) Monthly returns
    price_col = 'Adj Close' if 'Adj Close' in df.columns else 'Close'
    r = df[price_col].pct_change().dropna()

    # Use PeriodIndex for precise month slicing
    r.index = r.index.to_period('M')

    # 3) Helper: compound the 12-month window that starts in year y0
    def _year_ret(y0: int) -> float:
        start_p = pd.Period(f"{y0}-{first_month}", freq="M")
        end_p   = pd.Period(f"{y0+1}-{last_month}", freq="M")
        sel = r[(r.index >= start_p) & (r.index <= end_p)]
        return _compound(sel)

    # 4) Five consecutive post-formation annual returns
    return [_year_ret(formation_year + k) for k in range(5)]

In [None]:
# Example call: five annual returns for AAPL with formation year 2020 (default "lsv" window; downloads data via yfinance).
postformation_returns_5y_yf("AAPL", 2020)

[np.float64(0.8230667868515655),
 np.float64(0.34648189847663),
 np.float64(-0.2640418603558611),
 np.float64(0.49008041637443767),
 np.float64(0.30705239680114293)]