In [17]:
import warnings
warnings.filterwarnings('ignore')

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams['axes.grid'] = True

DATA_CSV = 'MSF_1996_2023.csv'   
HAS_RISK_FREE = False            
START_YEAR, END_YEAR = 1996, 2023

DATE_COL   = 'date'      
ID_COL     = 'permno'
RET_COL    = 'ret'       
MKT_RET_COL= 'vwretd'    
SIC_COL    = 'siccd'
PRC_COL    = 'prc'
SHROUT_COL = 'shrout'

np.random.seed(7)

In [18]:
df = pd.read_csv(DATA_CSV)
_cols = {c.lower(): c for c in df.columns}

def pick(*names):
    for n in names:
        if n in df.columns: 
            return n
        if n.lower() in _cols: 
            return _cols[n.lower()]
    return None

DATE_COL    = pick('date','yyyymm','caldt') or DATE_COL
ID_COL      = pick('permno') or ID_COL
RET_COL     = pick('ret','retx') or RET_COL
MKT_RET_COL = pick('vwretd','vwretx') or MKT_RET_COL
SIC_COL     = pick('siccd') or SIC_COL
PRC_COL     = pick('prc') or PRC_COL
SHROUT_COL  = pick('shrout') or SHROUT_COL

if DATE_COL and pd.api.types.is_integer_dtype(df[DATE_COL]):
    df[DATE_COL] = pd.to_datetime(df[DATE_COL].astype(str) + '01') + pd.offsets.MonthEnd(0)
elif DATE_COL:
    df[DATE_COL] = pd.to_datetime(df[DATE_COL])

print("Resolved columns ->",
      dict(DATE_COL=DATE_COL, ID_COL=ID_COL, RET_COL=RET_COL, MKT_RET_COL=MKT_RET_COL, 
           SIC_COL=SIC_COL, PRC_COL=PRC_COL, SHROUT_COL=SHROUT_COL))

df = df[(df[DATE_COL].dt.year >= START_YEAR) & (df[DATE_COL].dt.year <= END_YEAR)].copy()

for col in [RET_COL, 'retx', MKT_RET_COL, PRC_COL, SHROUT_COL, SIC_COL]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

if HAS_RISK_FREE:
    if 'rf' not in df.columns:
        raise ValueError("HAS_RISK_FREE=True but 'rf' column not found. Provide monthly risk-free rate as DECIMAL.")
    df['excess_ret'] = df[RET_COL] - df['rf']
    df['mkt_excess'] = df[MKT_RET_COL] - df['rf']
else:
    df['excess_ret'] = df[RET_COL]
    df['mkt_excess'] = df[MKT_RET_COL]

df['mktcap'] = (df[PRC_COL].abs() * df[SHROUT_COL]).astype(float)
df['year'] = df[DATE_COL].dt.year
df['ym'] = df[DATE_COL].dt.to_period('M')

df.sort_values([ID_COL, DATE_COL], inplace=True)
print("Data loaded. Rows:", len(df))
df.head()

Resolved columns -> {'DATE_COL': 'date', 'ID_COL': 'PERMNO', 'RET_COL': 'RET', 'MKT_RET_COL': 'vwretd', 'SIC_COL': 'SICCD', 'PRC_COL': 'PRC', 'SHROUT_COL': 'SHROUT'}
Data loaded. Rows: 1636563


Unnamed: 0,PERMNO,date,SHRCD,SICCD,TICKER,COMNAM,PERMCO,CUSIP,BIDLO,ASKHI,...,BID,ASK,SHROUT,RETX,vwretd,excess_ret,mkt_excess,mktcap,year,ym
0,10001,1996-01-31,11,4920.0,EWST,ENERGY WEST INC,7953,36720410,8.75,9.5,...,8.75,9.5,2281.0,-0.026667,0.028121,-0.026667,0.028121,20814.125,1996,1996-01
1,10001,1996-02-29,11,4920.0,EWST,ENERGY WEST INC,7953,36720410,8.75,9.5,...,9.25,9.75,2281.0,0.013699,0.016353,0.013699,0.016353,21099.25,1996,1996-02
2,10001,1996-03-29,11,4920.0,EWST,ENERGY WEST INC,7953,36720410,9.25,9.75,...,9.0,9.5,2309.0,0.025338,0.010914,0.036149,0.010914,21899.43342,1996,1996-03
3,10001,1996-04-30,11,4920.0,EWST,ENERGY WEST INC,7953,36720410,8.625,9.375,...,8.625,9.0,2309.0,-0.07084,0.02556,-0.07084,0.02556,20348.0625,1996,1996-04
4,10001,1996-05-31,11,4920.0,EWST,ENERGY WEST INC,7953,36720410,8.625,9.0,...,8.625,9.0,2309.0,-0.021277,0.02681,-0.021277,0.02681,19915.125,1996,1996-05


## SIC → Industry mapping

In [19]:
def map_sic_to_industry(sic: float) -> str:
    try:
        s = int(sic)
    except (ValueError, TypeError):
        return 'Unknown'
    if 1 <= s <= 999: return 'Agriculture, Forestry & Fishing'
    if 1000 <= s <= 1499: return 'Mining'
    if 1500 <= s <= 1799: return 'Construction'
    if 2000 <= s <= 3999: return 'Manufacturing'
    if 4000 <= s <= 4999: return 'Transportation & Utilities'
    if 5000 <= s <= 5199: return 'Wholesale Trade'
    if 5200 <= s <= 5999: return 'Retail Trade'
    if 6000 <= s <= 6799: return 'Finance, Insurance & Real Estate'
    if 7000 <= s <= 8999: return 'Services'
    if 9000 <= s <= 9999: return 'Public Administration'
    return 'Unknown'

df['industry'] = df[SIC_COL].apply(map_sic_to_industry)
df['industry'].value_counts().head()

industry
Manufacturing                       579113
Finance, Insurance & Real Estate    297158
Services                            283090
Public Administration               119836
Transportation & Utilities          119790
Name: count, dtype: int64

## Sampling up to 10 firms per industry per year

In [20]:
year_ind_perm = (df.dropna(subset=[ID_COL, 'industry'])
                   .groupby(['year','industry'])[ID_COL]
                   .unique()
                   .reset_index(name='permnos'))

def sample_permnos(permnos, k=10):
    p = np.array(permnos)
    if len(p) <= k:
        return p.tolist()
    return np.random.choice(p, size=k, replace=False).tolist()

year_ind_perm['sampled_permnos'] = year_ind_perm['permnos'].apply(sample_permnos)
samp_rows = []
for _, r in year_ind_perm.iterrows():
    for p in r['sampled_permnos']:
        samp_rows.append((r['year'], r['industry'], p))
sampled_universe = pd.DataFrame(samp_rows, columns=['year','industry',ID_COL])
print("Sampled rows:", len(sampled_universe))
sampled_universe.head()

Sampled rows: 2895


Unnamed: 0,year,industry,PERMNO
0,1996,"Agriculture, Forestry & Fishing",11642
1,1996,"Agriculture, Forestry & Fishing",16468
2,1996,"Agriculture, Forestry & Fishing",11790
3,1996,"Agriculture, Forestry & Fishing",91708
4,1996,"Agriculture, Forestry & Fishing",82225


## Rolling CAPM regressions (12m/24m/36m)

In [21]:
def estimate_beta_for_stock_year(stock_df, end_period, window_months):
    start_period = end_period - (window_months - 1)
    win = stock_df[(stock_df['ym'] >= start_period) & (stock_df['ym'] <= end_period)].copy()
    win = win.dropna(subset=['excess_ret', 'mkt_excess'])
    if len(win) < max(8, window_months//2):
        return np.nan, np.nan, np.nan

    X = sm.add_constant(win['mkt_excess'].values)
    y = win['excess_ret'].values
    model = sm.OLS(y, X, missing='drop').fit()
    beta = model.params[1] if len(model.params) > 1 else np.nan
    resid_var = np.var(model.resid, ddof=1) if model.resid.size > 1 else np.nan
    mkt_var = np.var(win['mkt_excess'].values, ddof=1) if win['mkt_excess'].size > 1 else np.nan
    return beta, resid_var, mkt_var

def compute_betas(df, sampled_universe, windows=(12,24,36)):
    out = []
    for (year, industry, permno) in sampled_universe[['year','industry',ID_COL]].itertuples(index=False):
        stock_df = df[df[ID_COL] == permno].copy()
        year_mask = (stock_df['year'] == year)
        if not year_mask.any():
            continue
        last_ym = stock_df.loc[year_mask, 'ym'].max()
        if pd.isna(last_ym):
            continue

        row = {'year': year, 'industry': industry, ID_COL: permno}
        for w in windows:
            beta, resid_var, mkt_var = estimate_beta_for_stock_year(stock_df, last_ym, w)
            row[f'beta_{w}m'] = beta
            row[f'resid_var_{w}m'] = resid_var
            row[f'mkt_var_{w}m'] = mkt_var
        out.append(row)
    return pd.DataFrame(out)

betas = compute_betas(df, sampled_universe, windows=(12,24,36))
print("Betas rows:", len(betas))
betas.head()

Betas rows: 2895


Unnamed: 0,year,industry,PERMNO,beta_12m,resid_var_12m,mkt_var_12m,beta_24m,resid_var_24m,mkt_var_24m,beta_36m,resid_var_36m,mkt_var_36m
0,1996,"Agriculture, Forestry & Fishing",11642,-1.465386,0.0833,0.000973,-1.465386,0.0833,0.000973,,,
1,1996,"Agriculture, Forestry & Fishing",16468,0.869667,0.016199,0.000973,0.869667,0.016199,0.000973,,,
2,1996,"Agriculture, Forestry & Fishing",11790,0.456031,0.004957,0.000973,0.456031,0.004957,0.000973,,,
3,1996,"Agriculture, Forestry & Fishing",91708,,,,,,,,,
4,1996,"Agriculture, Forestry & Fishing",82225,,,,,,,,,


## Descriptive statistics of betas by industry

In [22]:
from scipy.stats import skew, kurtosis

def describe_series(x: pd.Series) -> pd.Series:
    x = x.dropna()
    if x.empty:
        return pd.Series(dtype=float)
    pct = np.nanpercentile(x, [1,5,25,50,75,95,99])
    return pd.Series({
        'N': x.size,
        'mean': x.mean(),
        'std': x.std(ddof=1),
        'skew': skew(x, bias=False),
        'kurtosis': kurtosis(x, bias=False),
        'min': x.min(),
        'p1': pct[0], 'p5': pct[1], 'p25': pct[2], 'p50': pct[3], 'p75': pct[4], 'p95': pct[5], 'p99': pct[6],
        'max': x.max()
    })

desc_frames = {}
os.makedirs('tables', exist_ok=True)
for w in (12,24,36):
    col = f'beta_{w}m'
    g = betas.groupby('industry')[col].apply(describe_series).unstack()
    desc_frames[w] = g
    g.to_csv(f'tables/beta_desc_by_industry_{w}m.csv')
    
desc_frames[36].head()

Unnamed: 0_level_0,N,mean,std,skew,kurtosis,min,p1,p5,p25,p50,p75,p95,p99,max
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
"Agriculture, Forestry & Fishing",225.0,0.827871,0.914833,-0.312202,7.545064,-4.529084,-1.029945,-0.211715,0.39513,0.756782,1.17905,2.583138,3.472926,4.196799
Construction,251.0,1.404141,0.919968,0.239951,1.95191,-1.677158,-1.131084,0.010306,0.87913,1.410341,1.838645,2.904282,4.179358,4.876131
"Finance, Insurance & Real Estate",257.0,0.856095,0.729829,1.620425,6.568826,-0.609608,-0.28519,-0.150986,0.390364,0.806689,1.219059,1.939937,3.200702,5.316266
Manufacturing,246.0,1.350139,1.099561,-0.987582,14.196563,-7.022316,-0.213053,0.038943,0.777094,1.262153,1.777338,3.235102,4.450898,5.628108
Mining,244.0,1.520115,1.147356,0.540464,1.327327,-2.245095,-0.856077,-0.005699,0.749009,1.421424,2.084092,3.74339,4.805382,5.493467


## Volatility decomposition (TVOL, SVOL, IVOL)

In [23]:
def add_vol_components(df_betas: pd.DataFrame, windows=(12,24,36)):
    vol = df_betas.copy()
    for w in windows:
        bv = vol[f'beta_{w}m']
        mv = vol[f'mkt_var_{w}m']
        rv = vol[f'resid_var_{w}m']
        total_var = (bv**2) * mv + rv
        vol[f'tvol_{w}m'] = np.sqrt(np.maximum(total_var, 0))
        vol[f'svol_{w}m'] = np.sqrt(np.maximum(mv, 0)) * bv
        vol[f'ivol_{w}m'] = np.sqrt(np.maximum(rv, 0))
    return vol

vols = add_vol_components(betas, windows=(12,24,36))
vols.head()

Unnamed: 0,year,industry,PERMNO,beta_12m,resid_var_12m,mkt_var_12m,beta_24m,resid_var_24m,mkt_var_24m,beta_36m,...,mkt_var_36m,tvol_12m,svol_12m,ivol_12m,tvol_24m,svol_24m,ivol_24m,tvol_36m,svol_36m,ivol_36m
0,1996,"Agriculture, Forestry & Fishing",11642,-1.465386,0.0833,0.000973,-1.465386,0.0833,0.000973,,...,,0.292214,-0.045702,0.288618,0.292214,-0.045702,0.288618,,,
1,1996,"Agriculture, Forestry & Fishing",16468,0.869667,0.016199,0.000973,0.869667,0.016199,0.000973,,...,,0.130133,0.027123,0.127275,0.130133,0.027123,0.127275,,,
2,1996,"Agriculture, Forestry & Fishing",11790,0.456031,0.004957,0.000973,0.456031,0.004957,0.000973,,...,,0.07183,0.014223,0.070408,0.07183,0.014223,0.070408,,,
3,1996,"Agriculture, Forestry & Fishing",91708,,,,,,,,...,,,,,,,,,,
4,1996,"Agriculture, Forestry & Fishing",82225,,,,,,,,...,,,,,,,,,,


## Save beta and volatility plots (industry & overall)

In [24]:
PLOT_DIR = "plots"
os.makedirs(PLOT_DIR, exist_ok=True)

windows = (12,24,36)
betas_long = []
for w in windows:
    col = f"beta_{w}m"
    if col not in betas.columns:
        continue
    tmp = (betas.dropna(subset=[col])
                .groupby(['industry','year'])[col]
                .agg(['mean','std','count'])
                .reset_index())
    tmp['window'] = f'{w}m'
    tmp.rename(columns={'mean':'beta_mean','std':'beta_std','count':'N'}, inplace=True)
    betas_long.append(tmp)

if len(betas_long):
    betas_long = pd.concat(betas_long, ignore_index=True)
    betas_long.to_csv(os.path.join(PLOT_DIR, "beta_industry_year_stats.csv"), index=False)

    
    all_inds = sorted(betas_long['industry'].unique())
    for ind in all_inds:
        for w in windows:
            sub = betas_long[(betas_long['industry']==ind) & (betas_long['window']==f'{w}m')]
            if sub.empty: 
                continue
            plt.figure()
            plt.plot(sub['year'], sub['beta_mean'], marker='o')
            plt.title(f'Industry Mean Beta ({w}m) — {ind}')
            plt.xlabel('Year'); plt.ylabel('Mean Beta')
            fname = f"beta_{w}m_{ind.replace(' ', '_').replace(',', '').replace('/', '-')}.png"
            plt.savefig(os.path.join(PLOT_DIR, fname), dpi=300, bbox_inches='tight')
            plt.close()

    
    for w in windows:
        subw = betas_long[betas_long['window']==f'{w}m']
        if subw.empty:
            continue
        pv = subw.pivot(index='year', columns='industry', values='beta_mean').sort_index()
        plt.figure()
        for col in pv.columns:
            plt.plot(pv.index, pv[col], label=col)
        plt.title(f'Industry Mean Beta by Year (window={w}m)')
        plt.xlabel('Year'); plt.ylabel('Mean Beta')
        plt.legend(ncol=2, fontsize=8)
        fname = f"beta_{w}m_all_industries.png"
        plt.savefig(os.path.join(PLOT_DIR, fname), dpi=300, bbox_inches='tight')
        plt.close()

for need in ['tvol_36m','svol_36m','ivol_36m']:
    if need not in vols.columns:
        raise ValueError(f"`vols` missing {need}. Re-run volatility decomposition.")

v_overall = (vols.groupby('year')[['tvol_36m','svol_36m','ivol_36m']]
                 .mean()
                 .reset_index())
v_overall.to_csv(os.path.join(PLOT_DIR, "vol_overall_year_means_36m.csv"), index=False)

plt.figure()
plt.plot(v_overall['year'], v_overall['tvol_36m'], marker='o', label='TVOL')
plt.plot(v_overall['year'], v_overall['svol_36m'], marker='o', label='SVOL')
plt.plot(v_overall['year'], v_overall['ivol_36m'], marker='o', label='IVOL')
plt.title('Volatility Components — Overall (36m)')
plt.xlabel('Year'); plt.ylabel('Level')
plt.legend()
plt.savefig(os.path.join(PLOT_DIR, "vol_overall_36m.png"), dpi=300, bbox_inches='tight')
plt.close()

share = vols[['year','svol_36m','ivol_36m','tvol_36m']].dropna().copy()
share['sys_var'] = share['svol_36m']**2
share['idio_var'] = share['ivol_36m']**2
share['tot_var'] = share['tvol_36m']**2
shares = (share.groupby('year')[['sys_var','idio_var','tot_var']]
               .mean()
               .assign(sys_share=lambda d: d['sys_var']/d['tot_var'],
                       idio_share=lambda d: d['idio_var']/d['tot_var'])
               .reset_index())
shares.to_csv(os.path.join(PLOT_DIR, "vol_overall_shares_36m.csv"), index=False)

plt.figure()
plt.plot(shares['year'], shares['sys_share'], marker='o', label='Systematic share')
plt.plot(shares['year'], shares['idio_share'], marker='o', label='Idiosyncratic share')
plt.title('Variance Shares — Overall (36m)')
plt.xlabel('Year'); plt.ylabel('Share of Total Variance')
plt.legend()
plt.savefig(os.path.join(PLOT_DIR, "vol_overall_shares_36m.png"), dpi=300, bbox_inches='tight')
plt.close()

inds_v = sorted(vols['industry'].dropna().unique())
for ind in inds_v:
    sub = vols[vols['industry'] == ind]
    if sub.empty: 
        continue
    v_ind = (sub.groupby('year')[['tvol_36m','svol_36m','ivol_36m']]
                .mean()
                .reset_index())
    v_ind.to_csv(os.path.join(PLOT_DIR, f"vol_{ind.replace(' ', '_').replace(',', '')}_36m.csv"), index=False)

    plt.figure()
    plt.plot(v_ind['year'], v_ind['tvol_36m'], marker='o', label='TVOL')
    plt.plot(v_ind['year'], v_ind['svol_36m'], marker='o', label='SVOL')
    plt.plot(v_ind['year'], v_ind['ivol_36m'], marker='o', label='IVOL')
    plt.title(f'Volatility Components — {ind} (36m)')
    plt.xlabel('Year'); plt.ylabel('Level')
    plt.legend()
    fname = f"vol_{ind.replace(' ', '_').replace(',', '')}_36m.png"
    plt.savefig(os.path.join(PLOT_DIR, fname), dpi=300, bbox_inches='tight')
    plt.close()

## Portfolio sorts on beta and idiosyncratic volatility

In [25]:
def make_portfolio_sorts(df, betas_df, sort_var='beta_36m', use_value_weighted=True):
    b = betas_df.dropna(subset=[sort_var]).copy()
    b = b[['year','industry',ID_COL, sort_var]].rename(columns={sort_var:'sortkey'})
    b['form_year'] = b['year']
    
    last_obs = (df.groupby([ID_COL,'year'])
                  .apply(lambda x: x.loc[x['ym'].idxmax()])
                  .reset_index(drop=True))
    last_obs = last_obs[[ID_COL,'year','mktcap']].rename(columns={'year':'form_year','mktcap':'mktcap_form'})
    b = b.merge(last_obs, on=[ID_COL,'form_year'], how='left')
    
    results = []
    for y in range(START_YEAR, END_YEAR):
        by = b[b['form_year']==y].dropna(subset=['sortkey'])
        if by.empty:
            continue
        by['q'] = pd.qcut(by['sortkey'], q=5, labels=[1,2,3,4,5])
        
        future = df[df['year']==(y+1)][[ID_COL,'ym','excess_ret','mktcap']].copy()
        future = future.merge(by[[ID_COL,'q','mktcap_form']], on=ID_COL, how='inner')
        
        if use_value_weighted:
            future['w'] = future.groupby(['ym','q'])['mktcap_form'].transform(lambda x: x / x.sum() if x.sum()>0 else 0.0)
        else:
            future['w'] = future.groupby(['ym','q'])[ID_COL].transform(lambda x: 1.0/len(x))
        
        future['wret'] = future['w'] * future['excess_ret']
        port_m = future.groupby(['ym','q'])['wret'].sum().reset_index()
        port_y = port_m.groupby('q')['wret'].mean().reset_index()
        port_y['form_year'] = y
        results.append(port_y)
    
    if not results:
        return pd.DataFrame(columns=['q','mean_excess_ret','use_value_weighted'])
    res = pd.concat(results, ignore_index=True)
    out = res.groupby('q')['wret'].mean().reset_index().rename(columns={'wret':'mean_excess_ret'})
    out['use_value_weighted'] = use_value_weighted
    return out

betas_for_sort = vols.rename(columns={'beta_36m':'beta_36m', 'ivol_36m':'ivol_36m'})

beta_q_eq = make_portfolio_sorts(df, betas_for_sort, sort_var='beta_36m', use_value_weighted=False)
beta_q_vw = make_portfolio_sorts(df, betas_for_sort, sort_var='beta_36m', use_value_weighted=True)

ivol_q_eq = make_portfolio_sorts(df, betas_for_sort, sort_var='ivol_36m', use_value_weighted=False)
ivol_q_vw = make_portfolio_sorts(df, betas_for_sort, sort_var='ivol_36m', use_value_weighted=True)

beta_q_eq, beta_q_vw, ivol_q_eq, ivol_q_vw

(   q  mean_excess_ret  use_value_weighted
 0  1         0.017371               False
 1  2         0.007741               False
 2  3         0.007200               False
 3  4         0.015620               False
 4  5         0.008079               False,
    q  mean_excess_ret  use_value_weighted
 0  1         0.008369                True
 1  2         0.008493                True
 2  3         0.008247                True
 3  4         0.011787                True
 4  5         0.006427                True,
    q  mean_excess_ret  use_value_weighted
 0  1         0.009687               False
 1  2         0.010188               False
 2  3         0.012032               False
 3  4         0.014132               False
 4  5         0.010104               False,
    q  mean_excess_ret  use_value_weighted
 0  1         0.006456                True
 1  2         0.012692                True
 2  3         0.006982                True
 3  4         0.010004                True
 4  5   

In [26]:
def report_spread(qdf):
    if qdf.empty: 
        return np.nan
    qdf = qdf.copy()
    qdf['q'] = qdf['q'].astype(int)
    r1 = qdf.loc[qdf['q']==1, 'mean_excess_ret'].values
    r5 = qdf.loc[qdf['q']==5, 'mean_excess_ret'].values
    if r1.size==0 or r5.size==0: 
        return np.nan
    return float(r5[0] - r1[0])

print("Beta quintiles (Equal-Weighted) mean returns & 5-1 spread:")
display(beta_q_eq)
print("5-1 spread (beta, EW):", report_spread(beta_q_eq))

print("\nBeta quintiles (Value-Weighted) mean returns & 5-1 spread:")
display(beta_q_vw)
print("5-1 spread (beta, VW):", report_spread(beta_q_vw))

print("\nIdiosyncratic Vol quintiles (Equal-Weighted) mean returns & 5-1 spread:")
display(ivol_q_eq)
print("5-1 spread (ivol, EW):", report_spread(ivol_q_eq))

print("\nIdiosyncratic Vol quintiles (Value-Weighted) mean returns & 5-1 spread:")
display(ivol_q_vw)
print("5-1 spread (ivol, VW):", report_spread(ivol_q_vw))

os.makedirs('tables', exist_ok=True)
beta_q_eq.to_csv('tables/ports_beta_EW.csv', index=False)
beta_q_vw.to_csv('tables/ports_beta_VW.csv', index=False)
ivol_q_eq.to_csv('tables/ports_ivol_EW.csv', index=False)
ivol_q_vw.to_csv('tables/ports_ivol_VW.csv', index=False)


Beta quintiles (Equal-Weighted) mean returns & 5-1 spread:


Unnamed: 0,q,mean_excess_ret,use_value_weighted
0,1,0.017371,False
1,2,0.007741,False
2,3,0.0072,False
3,4,0.01562,False
4,5,0.008079,False


5-1 spread (beta, EW): -0.009291886938576481

Beta quintiles (Value-Weighted) mean returns & 5-1 spread:


Unnamed: 0,q,mean_excess_ret,use_value_weighted
0,1,0.008369,True
1,2,0.008493,True
2,3,0.008247,True
3,4,0.011787,True
4,5,0.006427,True


5-1 spread (beta, VW): -0.0019426634572417287

Idiosyncratic Vol quintiles (Equal-Weighted) mean returns & 5-1 spread:


Unnamed: 0,q,mean_excess_ret,use_value_weighted
0,1,0.009687,False
1,2,0.010188,False
2,3,0.012032,False
3,4,0.014132,False
4,5,0.010104,False


5-1 spread (ivol, EW): 0.0004172016187614907

Idiosyncratic Vol quintiles (Value-Weighted) mean returns & 5-1 spread:


Unnamed: 0,q,mean_excess_ret,use_value_weighted
0,1,0.006456,True
1,2,0.012692,True
2,3,0.006982,True
3,4,0.010004,True
4,5,0.002827,True


5-1 spread (ivol, VW): -0.0036283960832980255


## Make plots pdf

In [27]:
from PIL import Image
import os

plot_dir = "plots"
output_pdf = "all_plots.pdf"

files = [f for f in os.listdir(plot_dir) if f.lower().endswith(".png")]
files = sorted(files)  

images = []
for f in files:
    path = os.path.join(plot_dir, f)
    img = Image.open(path).convert("RGB")
    images.append(img)

if images:
    first, rest = images[0], images[1:]
    first.save(output_pdf, save_all=True, append_images=rest)

## Make tables pdf

In [28]:
import os, glob
import pandas as pd
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, PageBreak
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter, landscape  
from reportlab.lib.styles import getSampleStyleSheet

TABLE_DIR = "tables"
os.makedirs(TABLE_DIR, exist_ok=True)

csv_files = [
    "beta_desc_by_industry_12m.csv",
    "beta_desc_by_industry_24m.csv",
    "beta_desc_by_industry_36m.csv",
    "ports_beta_EW.csv",
    "ports_beta_VW.csv",
    "ports_ivol_EW.csv",
    "ports_ivol_VW.csv",
]

csv_paths = [os.path.join(TABLE_DIR, f) for f in csv_files if os.path.exists(os.path.join(TABLE_DIR, f))]
if not csv_paths:
    csv_paths = sorted(glob.glob(os.path.join(TABLE_DIR, "*.csv")))
    
out_pdf = "./all_tables.pdf"

styles = getSampleStyleSheet()
doc = SimpleDocTemplate(out_pdf, pagesize=landscape(letter), leftMargin=24, rightMargin=24, topMargin=24, bottomMargin=24)
elements = []

for path in csv_paths:
    df = pd.read_csv(path)
    title = Paragraph(f"<b>{os.path.basename(path)}</b>", styles['Heading2'])
    elements.append(title)
    elements.append(Spacer(1, 8))

    data = [df.columns.astype(str).tolist()] + df.astype(str).values.tolist()

    tbl = Table(data, repeatRows=1)
    tbl.setStyle(TableStyle([
        ('BACKGROUND', (0,0), (-1,0), colors.HexColor("#4D4D4D")),
        ('TEXTCOLOR', (0,0), (-1,0), colors.whitesmoke),
        ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
        ('FONTSIZE', (0,0), (-1,0), 9),
        ('ALIGN', (0,0), (-1,0), 'CENTER'),

        ('FONTSIZE', (0,1), (-1,-1), 8),
        ('ALIGN', (0,1), (-1,-1), 'CENTER'),
        ('VALIGN', (0,0), (-1,-1), 'MIDDLE'),

        ('ROWBACKGROUNDS', (0,1), (-1,-1), [colors.whitesmoke, colors.beige]),
        ('GRID', (0,0), (-1,-1), 0.25, colors.grey),
        ('BOTTOMPADDING', (0,0), (-1,0), 6),
        ('LEFTPADDING', (0,0), (-1,-1), 4),
        ('RIGHTPADDING', (0,0), (-1,-1), 4),
        ('TOPPADDING', (0,0), (-1,-1), 4),
        ('BOTTOMPADDING', (0,0), (-1,-1), 4),
    ]))
    elements.append(tbl)
    elements.append(Spacer(1, 18))
    elements.append(PageBreak())

if elements and isinstance(elements[-1], PageBreak):
    elements.pop()

doc.build(elements)

## Write-up

### Volatility patterns (overall and by industry)

* Total volatility (TVOL) exhibits pronounced spikes around major market stress events—1998 (LTCM/Russia), 2008–09 (global financial crisis), 2020 (COVID shock), and 2022 (inflation/war shocks). In each case, the increase is largely systematic volatility (SVOL) driven.
* In calmer regimes (e.g., 2013–17, 2021), SVOL is muted while idiosyncratic volatility (IVOL) accounts for a larger share of total variance, even if its absolute level stays modest.
* At the industry level, Technology and Services consistently show higher IVOL, reflecting firm-specific uncertainty and innovation risk. By contrast, Finance and Energy exhibit more crisis-sensitive SVOL, with sharp increases during downturns.

### Portfolio sorts (36m signals; annual formation)

* **Beta portfolios:** The 5–1 spread is negative, indicating that low-beta portfolios outperform high-beta portfolios on average. The effect is more pronounced in equal-weighted portfolios, consistent with the idea that smaller firms drive much of the low-beta premium.
* **Idiosyncratic volatility portfolios:** Excess returns are generally flat to slightly negative for high-IVOL stocks, especially under value-weighting. This suggests that investors do not earn a premium for bearing high idiosyncratic risk; if anything, such stocks underperform once weighted by size.