In [None]:
import pandas as pd
import numpy as np
import wrds
from pandas.tseries.offsets import *
###################
# Connect to WRDS #
###################
# Open the WRDS session; the resulting `conn` object is what the query cells
# in this notebook call `raw_sql` on.
# Replace the 'XXXX' placeholder with your own WRDS username.
# NOTE(review): only the username is passed here; presumably the wrds package
# prompts for (or reads a stored) password -- confirm before running unattended.
conn = wrds.Connection(wrds_username='XXXX')

Loading library list...
Done


# CRSP monthly

In [None]:
###################
# CRSP Block      #
###################
# Pull the monthly stock file (crsp.msf) for 1980-2023, joined with:
#   * crsp.msenames  -- the names record whose [namedt, nameendt] window
#                       contains the observation date;
#   * crsp.msedelist -- the delisting record falling in the same calendar month.
# The filters on b.exchcd / b.shrcd keep only rows that have a matching names
# record with exchange code 1-3 and share code 10-11.
crsp_m = conn.raw_sql("""
                      select a.permno, a.permco, a.date, a.ret, a.retx,a.shrout, a.prc, a.cfacshr,
                             b.shrcd, b.exchcd, b.siccd, b.ncusip,
                             c.dlstcd, c.dlret
                      from crsp.msf as a
                      left join crsp.msenames as b
                      on a.permno=b.permno and b.namedt<=a.date and a.date<=b.nameendt
                      left join crsp.msedelist as c
                      on a.permno=c.permno AND date_trunc('month', a.date) = date_trunc('month', c.dlstdt)
                      where a.date between '01/01/1980' and '12/31/2023'
                      and b.exchcd between 1 and 3
                      and b.shrcd between 10 and 11
                      """,
                      date_cols=['date'])

# change variable format to int
id_cols = ['permco', 'permno', 'shrcd', 'exchcd']
crsp_m[id_cols] = crsp_m[id_cols].astype(int)

# Line up date to be end of month
# (pd.offsets.MonthEnd is referenced explicitly so this cell does not rely on
# the wildcard import of pandas.tseries.offsets.)
crsp_m['YearMonth'] = crsp_m['date'] + pd.offsets.MonthEnd(0)

# Incorporate delisting return
# A delisting code with no reported delisting return is assigned -0.5;
# months without any delisting record get a delisting return of 0.
delisted_no_ret = crsp_m['dlstcd'].notna() & crsp_m['dlret'].isna()
crsp_m['dlret'] = np.where(delisted_no_ret, -0.5, crsp_m['dlret'])
crsp_m['dlret'] = crsp_m['dlret'].fillna(0)

# Compound the regular and delisting returns.
crsp_m['retadj'] = (1 + crsp_m['ret']) * (1 + crsp_m['dlret']) - 1
# When the regular return is missing but a delisting return exists, use the
# delisting return alone.  Otherwise keep the compounded value computed above
# (the fallback must be `retadj`, not `ret`, or the compounding is discarded
# and the delisting return is never incorporated for rows with a valid `ret`).
only_dlret = crsp_m['ret'].isna() & (crsp_m['dlret'] != 0)
crsp_m['retadj'] = np.where(only_dlret, crsp_m['dlret'], crsp_m['retadj'])

crsp_m = crsp_m.sort_values(by=['permno','YearMonth']).reset_index(drop=True)

crsp_m.to_parquet('../data/WRDS/crsp_m.parquet')

# Compustat Annual

In [None]:
####################
# Compustat Annual #
####################
# Annual fundamentals from comp.funda, restricted by the indfmt / datafmt /
# popsrc / consol / curcd filters in the query and to datadate >= 1965-01-01.
compa_sql = """
                    select a.gvkey, a.datadate, a.fyear, a.csho, a.at, a.pstkl, a.txditc,
                           a.pstkrv, a.seq, a.pstk, a.ppegt, a.invt, a.lt, a.sich, a.ib, a.oancf,
                           a.act, a.dlc, a.che, a.lct, a.dvc, a.epspi, a.epspx,
                           a.ajex,
                           a.sale, a.ao
                    from comp.funda as a
                    where indfmt='INDL'
                    and datafmt='STD'
                    and popsrc='D'
                    and consol='C'
                    and curcd = 'USD'
                    and datadate >= '01/01/1965'
                    """
comp = conn.raw_sql(compa_sql, date_cols=['datadate'])


def _by_firm(column):
    """Select `column` from the current `comp` frame, grouped by gvkey."""
    return comp.groupby('gvkey')[column]


# Preferred stock: take pstkrv, falling back to pstkl, then pstk, then 0.
comp['ps'] = np.where(comp['pstkrv'].notna(), comp['pstkrv'], comp['pstkl'])
comp['ps'] = np.where(comp['ps'].notna(), comp['ps'], comp['pstk'])
comp['ps'] = np.where(comp['ps'].notna(), comp['ps'], 0)

# Missing deferred taxes and missing accrual components are treated as zero.
for zero_filled in ['txditc', 'act', 'dlc', 'che', 'lct']:
    comp[zero_filled] = comp[zero_filled].fillna(0)

# Book equity = seq + txditc - preferred stock
comp['be'] = comp['seq'] + comp['txditc'] - comp['ps']

# All lags / differences below are taken within gvkey in datadate order.
comp = comp.sort_values(by=['gvkey', 'datadate'])

# Accruals: change in act + change in dlc - change in che - change in lct
for component in ['act', 'dlc', 'che', 'lct']:
    comp[component + '_ch'] = _by_firm(component).diff()
comp['acc'] = comp['act_ch'] + comp['dlc_ch'] - comp['che_ch'] - comp['lct_ch']

# Average of current and previous-row total assets
comp['at_l1'] = _by_firm('at').shift(1)
comp['at_avg'] = comp[['at', 'at_l1']].mean(axis=1)

# Asset growth (row-over-row percentage change, no forward filling)
comp['ag'] = _by_firm('at').pct_change(fill_method=None)

# Raw (unscaled) changes in PPE and in AO
comp['ppegt_diff'] = _by_firm('ppegt').diff()
comp['ao_diff'] = _by_firm('ao').diff()

# Sales growth over 1, 3 and 5 rows; the multi-row versions are annualized
for lag in (1, 3, 5):
    comp[f'sale_l{lag}'] = _by_firm('sale').shift(lag)
comp['sg_1y'] = comp['sale'] / comp['sale_l1'] - 1
for horizon in (3, 5):
    ratio = comp['sale'] / comp[f'sale_l{horizon}']
    comp[f'sg_{horizon}y'] = ratio ** (1 / horizon) - 1

## Net stock issuance: log growth of csho * ajex
comp['adj_csho'] = comp['csho'] * comp['ajex']
comp['adj_csho_l1'] = _by_firm('adj_csho').shift(1)
comp['nsi'] = np.log(comp['adj_csho'] / comp['adj_csho_l1'])

comp.to_parquet('../data/WRDS/compa.parquet')

# CCM link

In [None]:
#######################
# CCM Block           #
#######################
# Link-table rows from crsp.ccmxpf_linktable whose linktype starts with 'L'
# and whose linkprim is 'C' or 'P'; lpermno is exposed as permno.
ccm_sql = """
                  select gvkey, lpermno as permno, linktype, linkprim,
                  linkdt, linkenddt
                  from crsp.ccmxpf_linktable
                  where substr(linktype,1,1)='L'
                  and (linkprim ='C' or linkprim='P')
                  """
link_date_cols = ['linkdt', 'linkenddt']
ccm = conn.raw_sql(ccm_sql, date_cols=link_date_cols)

# Persist the link table next to the other WRDS extracts.
ccm.to_parquet('../data/WRDS/ccm.parquet')

# Financial Ratio

In [None]:
# Please Download From:
# https://wrds-www.wharton.upenn.edu/pages/get-data/financial-ratios-suite-wrds/
# and save to the following path: '../data/WRDS/financial_ratio.dta'

# IBES unadjusted summary

In [None]:
# WRDS path: IBES/IBES Academic/Unadjusted Summary/Summary Statistics
# Measures: EPS
# FPI: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
# and save to the following path: '../data/WRDS/Forecast_EPS_summary_unadjusted_2023.dta'

# To convert the downloaded .dta file to parquet for faster loading,
# uncomment and run the following code:
# df = pd.read_stata('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.dta')
# df.to_parquet('../data/WRDS/Forecast_EPS_summary_unadjusted_2023.parquet')

# IBES unadjusted actual

In [None]:
# 1. IBES Detail Actual
# WRDS path: IBES/IBES Academic/Unadjusted Detail/Actuals
# Measures: EPS
# Periodicity: ANN/QTR
# and save to the following path: '../data/WRDS/Actual_EPS_detail_unadjusted_2023.dta'

# 2. IBES Summary Actual
# WRDS path: IBES/IBES Academic/Unadjusted Summary/Actuals, Pricing and Ancillary
# Measures: EPS
# and save to the following path: '../data/WRDS/Actual_EPS_summary_unadjusted_2023.dta'

# CRSP-IBES link table

In [None]:
# WRDS path: Linking Queries by WRDS/IBES CRSP Link (Beta)
# and save to the following path: '../data/WRDS/iclink_WRDS.csv'

# Other

## Publicly Available Data Provided in the Data Folder:
1. **Macro Variables**: RGDP, RCON, INDPROD, UNEMP  
   - **File**: `../data/Macro/XXXX.xlsx`  
   - **Source**: Federal Reserve Bank of Philadelphia  

2. **Factors**: Fama-French, HMXZ, SY, DHS  
   - **File**: `../data/Other/ff5_factors_m.csv`  
   - **Source**: Authors' websites  

3. **FF49 Industry Classification**  
   - **File**: `../data/Other/Siccodes49.csv`  
   - **Source**: Kenneth R. French's website  
  
## Publicly Available Data Not Provided:
- **Anomalies Data**:  
  - **File**: `../data/Other/signed_predictors_dl_wide.csv`  
  - **Source**: Open Source Asset Pricing (https://www.openassetpricing.com/)  
  - **Version**: 1.3  

## Non-Public Data:
- **BHL Data**:  
  - **File**: `../data/Other/Conditional_Bias.csv`  
  - **Access**: Please contact BHL for this data.  