In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import os
import duckdb
import fastparquet

In [2]:
# Switch the working directory to the data folder when it exists;
# otherwise just tell the user to fix the path.
data_path = '../data'
if not os.path.exists(data_path):
    print('Please point to the correct data path!')
else:
    os.chdir(data_path)
    print(f'Change directory to data path: {data_path}')

Change directory to data path: ../data


#### Preprocessing data

In [3]:
# Load the `crsp_202401.dsf.parquet` extract and parse its `date` column as datetimes.
crsp_d = pd.read_parquet('crsp_202401.dsf.parquet').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
crsp_d.head(3)

Unnamed: 0,cusip,permno,permco,issuno,hexcd,hsiccd,date,bidlo,askhi,prc,vol,ret,bid,ask,shrout,cfacpr,cfacshr,openprc,numtrd,retx
0,12753720,85570,7556,9833,3,700.0,2012-08-21,8.6,9.4,8.62,37776.0,-0.027088,8.62,8.65,15439.0,1.0,1.0,8.94,224.0,-0.027088
1,12753720,85570,7556,9833,3,700.0,2012-08-22,8.52,8.86,8.7,27851.0,0.009281,8.67,8.71,15439.0,1.0,1.0,8.64,206.0,0.009281
2,12753720,85570,7556,9833,3,700.0,2012-08-23,8.26,8.86,8.4,16653.0,-0.034483,8.37,8.4,15439.0,1.0,1.0,8.73,95.0,-0.034483


In [18]:
# Report how many distinct permno values crsp_d contains.
permno_column = crsp_d['permno']
unique_permno_count = permno_column.nunique()
print(f"Number of unique permno: {unique_permno_count}")

Number of unique permno: 37776


In [46]:
# Too many permnos: keep only those whose latest observation is after 2016-01-01.
last_obs_date = crsp_d.groupby('permno')['date'].transform('max')
crsp_d = crsp_d.loc[last_obs_date > '2016-01-01']

In [47]:
# Show how many distinct permnos remain in crsp_d after the recency filter.
remaining_permno_count = crsp_d['permno'].nunique()
print(f"After filting number of unique permno: {remaining_permno_count}")

After filting number of unique permno: 13754


In [52]:
# Prepare the regression data: treat SPY as the market and attach its daily
# return (as `mkt_ret`) to every other security's rows.
spy_permno = 84398
is_spy = crsp_d['permno'] == spy_permno

# This cell overwrites crsp_d without the SPY rows, so running it a second time
# (or on data that lacks SPY) would leave mkt_ret all-NaN and `merged` silently
# empty after the dropna below. Fail loudly instead.
if not is_spy.any():
    raise ValueError(
        f"permno {spy_permno} (SPY) not found in crsp_d; "
        "reload crsp_d before re-running this cell."
    )

market_ret = crsp_d.loc[is_spy, ['date', 'ret']].rename(columns={'ret': 'mkt_ret'})

# drop spy from crsp_d
crsp_d = crsp_d[~is_spy]

# Left-join the market return on date, then keep only the rows where both the
# security return and the market return are present.
merged = pd.merge(crsp_d, market_ret, on='date', how='left')
merged = merged.dropna(subset=['ret', 'mkt_ret'])

#### Run regressions to get beta

In [None]:
# Regression helper used to estimate one beta per permno
def calculate_beta(permno_group):
    """Regress `ret` on `mkt_ret` (plus an intercept) and return the slope.

    Parameters
    ----------
    permno_group : pandas.DataFrame
        Rows for a single security; must contain `ret` and `mkt_ret` columns.

    Returns
    -------
    dict
        ``{"beta": <coefficient on mkt_ret>}``.

    Notes
    -----
    The intercept column comes from ``sm.add_constant`` and the model is
    built with ``missing='drop'``.
    """
    regressors = sm.add_constant(permno_group['mkt_ret'])
    fitted = sm.OLS(permno_group['ret'], regressors, missing='drop').fit()
    slope = fitted.params['mkt_ret']
    return dict(beta=slope)

In [81]:
from tqdm import tqdm

start_date = '2016-01-01'
# Minimum span between a security's first and last observation (about two
# calendar years) for it to get a beta estimate.
min_history = pd.Timedelta(days=2*365)

# Apply the date filter once, before grouping, rather than inside the loop.
# groupby only yields non-empty groups, so a security with no rows on or after
# start_date is skipped here; previously its empty group gave NaT - NaT, the
# `<` test was False, and the regression was attempted on zero rows.
recent = merged[merged['date'] >= start_date]

beta_results_list = []
for permno, group in tqdm(recent.groupby('permno')):

    # if permno_group has less than 2 years of data, continue
    if group['date'].max() - group['date'].min() < min_history:
        continue

    beta_result = calculate_beta(group)
    beta_result['permno'] = permno
    beta_results_list.append(beta_result)

# Explicit columns keep the `permno` merge key present even when no security
# qualified and the list is empty.
beta_results = pd.DataFrame(beta_results_list, columns=['beta', 'permno'])

# Inner-join the betas onto the distinct permnos still present in crsp_d.
beta_results_df = pd.merge(
    crsp_d['permno'].drop_duplicates(),
    beta_results,
    on='permno'
)

100%|██████████| 13751/13751 [00:33<00:00, 416.57it/s]


In [82]:
# Order securities from highest to lowest beta and renumber the rows from 0.
beta_results_df = beta_results_df.sort_values("beta", ascending=False, ignore_index=True)
beta_results_df.head(10)

Unnamed: 0,permno,beta
0,18405,4.884568
1,93284,4.176943
2,17437,4.07124
3,91128,4.062612
4,11182,3.631746
5,92855,3.587863
6,15613,3.551815
7,19039,3.535189
8,15348,3.497892
9,21999,3.468908


#### Permno lookup table - just curious which securities have such high beta values

In [83]:
crsp_names = pd.read_parquet('crsp_202401.dsenames.parquet')

# Dictionary of key: permno, value: ticker, built from crsp_names.
# NOTE(review): if a permno appears on several rows of crsp_names, presumably
# the last row's ticker is the one kept - confirm the file's row order gives
# the intended ticker.
permno_ticker_dict = crsp_names.set_index("permno")["ticker"].to_dict()

In [84]:
# Look up the ticker for each of the first ten permnos in beta_results_df
# (None when a permno has no entry in the lookup).
top_10_permnos = beta_results_df["permno"].head(10)
top_10_tickers = list(map(permno_ticker_dict.get, top_10_permnos))
top_10_tickers

['BACK',
 'SOXL',
 'VMIN',
 'GPOR',
 'GASL',
 'TECL',
 'DPST',
 'HIBL',
 'LABU',
 'FNGG']

#### Examine the cross-section of high-beta stocks' returns around the turn of the month
Modification of Ethan's code.

In [129]:
def _mean_of_window_means(ret, anchor_positions, start_offset, stop_offset):
    """Average, over anchors, of the mean of `ret` on [anchor+start_offset, anchor+stop_offset)."""
    n_obs = len(ret)
    window_means = [
        ret.iloc[pos + start_offset:pos + stop_offset].mean()
        for pos in anchor_positions
        # A window that would run past either end of the sample contributes nothing.
        if pos + start_offset >= 0 and pos + stop_offset <= n_obs
    ]
    # Series.mean skips NaN entries and returns NaN when nothing is left.
    return pd.Series(window_means, dtype='float64').mean()


def calc_turn_of_month_returns(crsp_d, permno, verbose=False):
    """Summarise one security's average daily return around each month end.

    T is the last row available for `permno` in each calendar month. For every
    T the mean of `ret` is taken over four windows of that security's own
    rows (T-8..T-4, T-3..T-1, T+1..T+3, T+4..T+8); each window mean is then
    averaged across months. A window that does not fit inside the sample is
    ignored for that month.

    Parameters
    ----------
    crsp_d : pandas.DataFrame
        Must contain `permno`, `date` and `ret` columns. Rows may be in any
        order; they are sorted by date here.
    permno : int
        Identifier of the security to summarise.
    verbose : bool, default False
        If True, print a note when `crsp_d` has no rows for `permno`.

    Returns
    -------
    pandas.DataFrame
        One row with columns `permno` (as a string), the four window averages,
        'On T' (mean return on the month-end rows) and 'Average daily return'
        (mean of all of the security's returns). Entries are NaN when there is
        nothing to average.

    Notes
    -----
    The last row of the final month in the sample is treated as a month end
    even if the data stop part-way through that month.
    """
    # data preprocessing
    curr_permno_d = crsp_d.query(f"permno == {permno}").copy()
    curr_permno_d['date'] = pd.to_datetime(curr_permno_d['date'])
    # The positional windows below are only meaningful in chronological order,
    # so sort explicitly instead of trusting the caller's row order.
    curr_permno_d = curr_permno_d.sort_values('date', kind='mergesort').reset_index(drop=True)

    if verbose and curr_permno_d.empty:
        print(f"No rows found for permno {permno}.")

    # Identify the last trading day of each month. Grouping on a monthly period
    # avoids the deprecated 'M' offset alias of pd.Grouper.
    month = curr_permno_d['date'].dt.to_period('M')
    monthly_last = curr_permno_d.groupby(month).tail(1)

    # After reset_index the index labels are row positions, so the month-end
    # positions can be read straight off the index.
    month_end_pos = monthly_last.index.to_numpy()
    ret = curr_permno_d['ret']

    results = pd.DataFrame(
        {'permno': str(permno),
        'From T-8 to T-4': _mean_of_window_means(ret, month_end_pos, -8, -3),
        'From T-3 to T-1': _mean_of_window_means(ret, month_end_pos, -3, 0),
        'On T': monthly_last['ret'].mean(),
        'From T+1 to T+3': _mean_of_window_means(ret, month_end_pos, 1, 4),
        'From T+4 to T+8': _mean_of_window_means(ret, month_end_pos, 4, 9),
        'Average daily return': ret.mean()},
        index=[0]
    )

    return results

In [130]:
# First ten permnos of beta_results_df (assumes it is already sorted by descending beta).
top_10_permnos = beta_results_df["permno"].head(10)
top_10_permnos

0    18405
1    93284
2    17437
3    91128
4    11182
5    92855
6    15613
7    19039
8    15348
9    21999
Name: permno, dtype: int32

In [133]:
import warnings

# Silence warnings only while the per-permno summaries are built, instead of
# installing a global 'ignore' filter that would hide warnings from every cell
# run afterwards in this kernel.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    results_list = [calc_turn_of_month_returns(crsp_d, permno) for permno in top_10_permnos]
    combined_results = pd.concat(results_list, ignore_index=True)

combined_results

Unnamed: 0,permno,From T-8 to T-4,From T-3 to T-1,On T,From T+1 to T+3,From T+4 to T+8,Average daily return
0,18405,0.030233,-0.000623,0.012209,-0.002006,-0.000932,0.005812
1,93284,-0.000277,0.006701,0.000113,0.003426,0.001501,0.002576
2,17437,0.002895,0.000228,0.004594,-0.000772,-0.005762,0.000603
3,91128,-0.002443,0.00333,0.001042,0.000106,-0.001408,0.000929
4,11182,-0.007123,0.001995,-0.001587,-0.001439,-0.007141,-0.003285
5,92855,-0.000325,0.00492,-0.001281,0.004569,0.001836,0.002287
6,15613,-0.000901,0.007753,-0.011735,0.003023,0.002066,0.000945
7,19039,-0.001319,0.008405,-0.010404,0.010465,0.003044,0.002141
8,15348,-0.002013,-0.003122,0.006341,0.007573,-0.002346,0.00062
9,21999,-0.005673,0.00573,0.006295,-0.003791,-0.001222,-0.000411
