In [2]:
import os

import wrds
import pandas as pd
from pandas import DataFrame
import numpy as np

from Constants import Constants as const

In [16]:
# Connect to WRDS
conn = wrds.Connection(wrds_username='wangyouan')

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [18]:
# Define start and end dates
start_date = '2001-01-01'
end_date = '2016-12-31'

In [5]:
conn.close()

# Analyst Coverage

Method from He, J. (Jack), & Tian, X. (2013). The dark side of analyst coverage: The case of innovation. Journal of Financial Economics, 109(3), 856–878. https://doi.org/10.1016/j.jfineco.2013.04.001 

We obtain analyst information from the I/B/E/S database. For each fiscal year of a firm, we take the average of the 12 monthly numbers of earnings forecasts given by the summary file and treat that as a raw measure of analyst coverage (Coverage). This measure relies on the fact that most analysts following a firm issue at least one earnings forecast for that firm during the year before its fiscal year ending date and that a majority of them issue at most one earnings forecast in each month.

## Use Local Sas file

In [3]:
data_path = r'D:\Users\wangy\Documents\data\ibes'
# 只读取 ticker, statpers, numest, fpi, measure 四列
ibes_df = pd.read_sas(os.path.join(data_path, "statsum_epsus.sas7bdat"))

In [6]:
ibes_df.columns = [col.lower() for col in ibes_df.columns]

# 自动将所有 bytes 类型列转换为 str 类型
for col in ibes_df.select_dtypes(['object']).columns:
    if isinstance(ibes_df[col].iloc[0], bytes):
        ibes_df[col] = ibes_df[col].str.decode('utf-8')

In [20]:
columns_needed = ['ticker', 'statpers', 'numest', 'fpi', 'measure']
# 执行本地筛选逻辑，等同于 WRDS SQL 查询
ibes_summary = ibes_df[
    (ibes_df['fpi'] == '1') &
    (ibes_df['measure'] == 'EPS') &
    (ibes_df['statpers'] >= start_date) &
    (ibes_df['statpers'] <= end_date)
][['ticker', 'statpers', 'numest']].copy()

# Convert the date column to datetime format
ibes_summary['statpers'] = pd.to_datetime(ibes_summary['statpers'])

# Extract year and month from 'statpers'
ibes_summary['year'] = ibes_summary['statpers'].dt.year
ibes_summary['month'] = ibes_summary['statpers'].dt.month

# Calculate the average number of analysts for each firm for each fiscal year
ibes_summary['fiscal_year'] = ibes_summary['statpers'].dt.to_period('Y')

# Group by ticker and fiscal year, then calculate the average number of analysts
analyst_coverage = ibes_summary.groupby(['ticker', 'fiscal_year'])['numest'].mean().reset_index()

# Rename columns
analyst_coverage.rename(columns={'numest': 'coverage'}, inplace=True)

In [23]:
ibes_crsp_link = pd.read_sas(os.path.join(data_path, 'ibcrsphist.sas7bdat'),
                             encoding='utf-8')
ibes_crsp_link.columns = [col.lower() for col in ibes_crsp_link.columns]

# 4. 转换起止日期为 datetime 类型
ibes_crsp_link['sdate'] = pd.to_datetime(ibes_crsp_link['sdate'])
ibes_crsp_link['edate'] = pd.to_datetime(ibes_crsp_link['edate'])

# 5. 生成 fiscal year 的年初时间（用于匹配）
analyst_coverage['fyear_date'] = analyst_coverage['fiscal_year'].dt.start_time

# 6. 合并：按 ticker & fiscal_year_date 匹配在 sdates ≤ date ≤ edates 的 PERMNO
def get_permno(row):
    match = ibes_crsp_link[
        (ibes_crsp_link['ticker'] == row['ticker']) &
        (ibes_crsp_link['sdate'] <= row['fyear_date']) &
        (ibes_crsp_link['edate'] >= row['fyear_date'])
    ]
    return match['permno'].values[0] if not match.empty else pd.NA

analyst_coverage['permno'] = analyst_coverage.apply(get_permno, axis=1)

In [15]:
analyst_coverage.dropna(subset=['permno']).drop(columns=['fyear_date']).to_pickle(
    os.path.join(const.TEMP_PATH, '2001_2016_analyst_coverage.pkl'))

In [27]:
# Link CRSP to Compustat using the CRSP-Compustat Link table
crsp_comp_link = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))
analyst_coverage2 = analyst_coverage.dropna(subset=['permno']).drop(columns=['fyear_date'])
analyst_coverage2['permno'] = analyst_coverage2['permno'].astype(int)
crsp_comp_link['permco'] = crsp_comp_link['permco'].astype(int)

# Merge CRSP data with Compustat data to add GVKEY to the dataset
analyst_coverage2 = pd.merge(analyst_coverage2, crsp_comp_link, left_on='permno', right_on='permco', how='left')

# Drop 'permco' column as it is no longer needed
analyst_coverage2.drop(columns=['permco'], inplace=True)
analyst_coverage2.to_csv(os.path.join(const.TEMP_PATH, '2001_2016_annual_analyst_coverage.csv'), index=False)


## Use WRDS package to construct Analyst Coverage data

In [15]:

# Query to obtain the monthly number of earnings forecasts from the IBES summary file
ibes_summary = conn.raw_sql(f"""
    SELECT ticker, statpers, numest
    FROM ibes.statsum_epsus
    WHERE fpi = '1' AND measure='EPS' AND statpers >= '{start_date}' AND statpers <= '{end_date}'
""")

# Convert the date column to datetime format
ibes_summary['statpers'] = pd.to_datetime(ibes_summary['statpers'])

# Extract year and month from 'statpers'
ibes_summary['year'] = ibes_summary['statpers'].dt.year
ibes_summary['month'] = ibes_summary['statpers'].dt.month

# Calculate the average number of analysts for each firm for each fiscal year
ibes_summary['fiscal_year'] = ibes_summary['statpers'].dt.to_period('Y')

# Group by ticker and fiscal year, then calculate the average number of analysts
analyst_coverage = ibes_summary.groupby(['ticker', 'fiscal_year'])['numest'].mean().reset_index()

# Rename columns
analyst_coverage.rename(columns={'numest': 'coverage'}, inplace=True)

In [23]:
# Link the IBES ticker to CRSP using the IBES CRSP Link database
ibes_crsp_link = conn.raw_sql(f"""
    SELECT distinct ticker, permno
    FROM wrdsapps_link_crsp_ibes.ibcrsphist
    WHERE sdate <= '{end_date}' AND (edate IS NULL OR edate >= '{start_date}')
""")

# Merge IBES and CRSP data to link ticker with PERMNO
analyst_coverage2 = pd.merge(analyst_coverage, ibes_crsp_link, on='ticker', how='left')

In [29]:
ibes_summary['fqtr'] = ibes_summary['statpers'].dt.to_period('Q')
analyst_coverage_qtr = ibes_summary.groupby(['ticker', 'fqtr'])['numest'].mean().reset_index()
analyst_coverage_qtr.rename(columns={'numest': 'coverage'}, inplace=True)

# Merge IBES and CRSP data to link ticker with PERMNO
analyst_coverage_qtr2 = pd.merge(analyst_coverage_qtr, ibes_crsp_link, on='ticker', how='left')

In [34]:
# Link CRSP to Compustat using the CRSP-Compustat Link table
crsp_comp_link = conn.raw_sql(f"""
    SELECT distinct permco, gvkey
    FROM wrdsapps_link_crsp_comp_bdx.bdxcrspcomplink
""")

# Merge CRSP data with Compustat data to add GVKEY to the dataset
analyst_coverage2 = pd.merge(analyst_coverage2, crsp_comp_link, left_on='permno', right_on='permco', how='left')

# Drop 'permco' column as it is no longer needed
analyst_coverage2.drop(columns=['permco'], inplace=True)
analyst_coverage2.to_csv(os.path.join(const.TEMP_PATH, '2007_2016_annual_analyst_coverage.csv'), index=False)

# Merge CRSP data with Compustat data to add GVKEY to the dataset
analyst_coverage_qtr2 = pd.merge(analyst_coverage_qtr2, crsp_comp_link, left_on='permno', right_on='permco', how='left')

# Drop 'permco' column as it is no longer needed
analyst_coverage_qtr2.drop(columns=['permco'], inplace=True)
analyst_coverage_qtr2.to_csv(os.path.join(const.TEMP_PATH, '2007_2016_quarterly_analyst_coverage.csv'), index=False)


In [36]:
analyst_coverage2

Unnamed: 0,ticker,fiscal_year,coverage,permno,gvkey_x,gvkey_y
0,0000,2014,4.352113,14471.0,062491,062491
1,0000,2015,5.000000,14471.0,062491,062491
2,0000,2016,3.480000,14471.0,062491,062491
3,0001,2014,13.526316,14392.0,,
4,0001,2015,14.763158,14392.0,,
...,...,...,...,...,...,...
54481,ZYNX,2011,1.000000,,,
54482,ZYNX,2012,1.000000,,,
54483,ZYNX,2013,1.000000,,,
54484,ZYNX,2014,1.000000,,,


# Construnt DISPERSION and FCSTERROR

In [None]:
## Use WRDS package

In [5]:
# Query WRDS IBES to obtain the necessary data
# We need 'stdev', 'meanest', 'actual' from the IBES dataset
ibes_data = conn.raw_sql(f"""
    SELECT ticker, fpi, statpers, numest, fpedats, meanest AS EPS_MEAN, 
           stdev AS EPS_SD, actual AS EPS_ACTUAL
    FROM ibes.statsum_epsus
    WHERE fpi = '6' AND statpers BETWEEN '{start_date}' AND '{end_date}' AND measure = 'EPS'
""")
# ibes_data.to_pickle(os.path.join(const.TEMP_PATH, '2007_2016_ibes_annual_data.pkl'))
ibes_data.to_pickle(os.path.join(const.TEMP_PATH, '2007_2016_ibes_quarter_data.pkl'))

In [19]:
# Query WRDS CRSP to obtain the price data for each company at the end of the previous quarter
crsp_data = conn.raw_sql(f"""
    SELECT permno, date, prc AS Price_lag
    FROM crsp.msf
    WHERE date BETWEEN '{start_date}' AND '{end_date}'
""")
crsp_data.to_pickle(os.path.join(const.TEMP_PATH, '2001_2016_crsp_monthly_data.pkl'))

In [7]:
# Load the data into pandas DataFrames (WRDS package)
ibes_df = pd.DataFrame(ibes_data)
crsp_df = pd.DataFrame(crsp_data).rename(columns={'price_lag': 'prc'})
link_df = pd.DataFrame(ibes_crsp_link)

NameError: name 'ibes_crsp_link' is not defined

In [33]:
# Load the data into pandas DataFrames (IBES local package)
# ibes_full_df = ibes_df.copy()
ibes_df = ibes_df.loc[ibes_df['fpi'] == '1',
    ['ticker', 'fpi', 'statpers', 'numest', 'fpedats', 'meanest', 'stdev', 'actual']].rename(columns={'meanest': 'eps_mean', 'stdev': 'eps_sd', 'actual': 'eps_actual'})
crsp_df = pd.DataFrame(crsp_data).rename(columns={'price_lag': 'prc'})
# link_df = pd.DataFrame(ibes_crsp_link)

In [78]:
link_df.to_pickle(os.path.join(const.TEMP_PATH, 'ibes_crsp_link.pkl'))
crsp_comp_link.to_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))

In [24]:
link_df = pd.read_pickle(os.path.join(const.TEMP_PATH, 'ibes_crsp_link.pkl'))
crsp_comp_link = pd.read_pickle(os.path.join(const.TEMP_PATH, 'crsp_comp.pkl'))

In [9]:
# Convert the IBES data to quarterly by creating a quarter value and aggregating it
ibes_df['statpers'] = pd.to_datetime(ibes_df['statpers'])
ibes_df['quarter'] = ibes_df['statpers'].dt.to_period('Q')
ibes_quarterly = ibes_df.groupby(['ticker', 'quarter']).agg({
    'eps_mean': 'mean',
    'eps_sd': 'mean',
    'eps_actual': 'mean',
    'numest': 'last',
    'fpedats': 'last'
}).reset_index()

In [10]:
# Convert the CRSP data to quarterly by creating a quarter value
crsp_df['date'] = pd.to_datetime(crsp_df['date'])
crsp_df['quarter'] = crsp_df['date'].dt.to_period('Q')
crsp_quarterly = crsp_df.groupby(['permno', 'quarter']).last().reset_index()

In [11]:
# Use the price at the end of the previous quarter as Price_lag
crsp_quarterly['Price_lag'] = crsp_quarterly.groupby('permno')['prc'].shift(1)

In [12]:
# Merge IBES and CRSP Link data to get permno for each IBES ticker
ibes_linked_df = pd.merge(ibes_quarterly, link_df, on='ticker', how='inner')

# Merge IBES-linked data with CRSP quarterly data on permno and quarter
merged_df = pd.merge(ibes_linked_df, crsp_quarterly, on=['permno', 'quarter'], how='inner')

# Create DISPERSION
# DISPERSION is calculated as the ratio of the standard deviation of EPS forecasts (eps_sd) 
# to the stock price at the end of the previous quarter (Price_lag)
merged_df['DISPERSION'] = merged_df['eps_sd'] / merged_df['Price_lag']

# Create FCSTERROR
# FCSTERROR is calculated as the absolute value of the difference between the mean analyst EPS forecast (eps_mean)
# and the actual EPS (eps_actual), scaled by the stock price at the end of the previous quarter (Price_lag)
merged_df['FCSTERROR'] = abs(merged_df['eps_mean'] - merged_df['eps_actual']) / merged_df['Price_lag']


In [13]:
merged_df.to_csv(os.path.join(const.TEMP_PATH, '2007_2016_quarterly_dispersion_fcsterror.csv'), index=False)

In [34]:
# Convert the IBES data to annual by creating a annual value and aggregating it
ibes_df['statpers'] = pd.to_datetime(ibes_df['statpers'])
ibes_df['year'] = ibes_df['statpers'].dt.year
ibes_annual = ibes_df.groupby(['ticker', 'year']).agg({
    'eps_mean': 'mean',
    'eps_sd': 'mean',
    'eps_actual': 'mean',
    'numest': 'mean',
}).reset_index()

ibes_annual2 = ibes_df.groupby(['ticker', 'year']).agg({
    'eps_mean': 'last',
    'eps_sd': 'last',
    'eps_actual': 'last',
    'numest': 'last',
}).reset_index()

ibes_annual = ibes_annual.merge(ibes_annual2, on=['ticker', 'year'], how='left', suffixes=('', '_last'))

# Convert the CRSP data to quarterly by creating a quarter value
crsp_df['date'] = pd.to_datetime(crsp_df['date'])
crsp_df['year'] = crsp_df['date'].dt.year
crsp_annual = crsp_df.groupby(['permno', 'year']).last().reset_index()

# Use the price at the end of the previous quarter as Price_lag
crsp_annual['Price_lag'] = crsp_annual.groupby('permno')['prc'].shift(1)

# Merge IBES and CRSP Link data to get permno for each IBES ticker
ibes_linked_df = pd.merge(ibes_annual, link_df, on='ticker', how='inner')

# Merge IBES-linked data with CRSP quarterly data on permno and quarter
merged_df_ann = pd.merge(ibes_linked_df, crsp_annual, on=['permno', 'year'], how='inner')
merged_df_ann = merged_df_ann[merged_df_ann['Price_lag'] > 0]

# Create DISPERSION
# DISPERSION is calculated as the ratio of the standard deviation of EPS forecasts (eps_sd) 
# to the stock price at the end of the previous quarter (Price_lag)
merged_df_ann['DISPERSION'] = merged_df_ann['eps_sd'] / merged_df_ann['Price_lag']
merged_df_ann['DISPERSION_last'] = merged_df_ann['eps_sd_last'] / merged_df_ann['Price_lag']

# Create FCSTERROR
# FCSTERROR is calculated as the absolute value of the difference between the mean analyst EPS forecast (eps_mean)
# and the actual EPS (eps_actual), scaled by the stock price at the end of the previous quarter (Price_lag)
merged_df_ann['FCSTERROR'] = abs(merged_df_ann['eps_mean'] - merged_df_ann['eps_actual']) / merged_df_ann['Price_lag']
merged_df_ann['FCSTERROR_last'] = abs(merged_df_ann['eps_mean_last'] - merged_df_ann['eps_actual_last']) / merged_df_ann['Price_lag']

merged_df_ann.to_csv(os.path.join(const.TEMP_PATH, '2001_2016_annual_dispersion_fcsterror.csv'), index=False)

# Download data for Li Xintong

In [4]:
# Define start year
start_year = 1993

# ========================
# 1. Earnings Announcements: tr_ibes.actu_epsus
# ========================
# Download tr_ibes.actu_epsus (Earnings Announcements) table
ea_data = conn.raw_sql(f"""
    SELECT *
    FROM tr_ibes.actu_epsus
    WHERE anndats >= '1993-01-01'
""")

In [9]:
output_path = r'D:\Users\wangy\Documents\temp\ibes data'
# ea_data.to_csv(os.path.join(output_path, '1993_tr_ibes_actu_epsus.csv'), index=False)

In [7]:
# ========================
# 2. Management Guidance: tr_ibes_guidance.det_guidance
# ========================
# Download Management Guidance data
mg_data = conn.raw_sql(f"""
    SELECT *
    FROM tr_ibes_guidance.det_guidance
    WHERE anndats >= '1993-01-01'
""")

ProgrammingError: (psycopg2.errors.InsufficientPrivilege) permission denied for schema tr_ibes_guidance
LINE 3:     FROM tr_ibes_guidance.det_guidance
                 ^

[SQL: 
    SELECT *
    FROM tr_ibes_guidance.det_guidance
    WHERE anndats >= '1993-01-01'
]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [4]:
# Connect to WRDS
conn = wrds.Connection(wrds_username='aheitz')

Loading library list...
Done


In [5]:
# Define year range
start_year = 2022
end_year = 2024  # adjust to your latest desired year

# Initialize empty list to store each year's data
forecast_list = []

for year in range(start_year, end_year + 1):
    print(f"Downloading data for year {year}...")
    query = f"""
        SELECT *
        FROM tr_ibes.statsum_epsus
        WHERE statpers >= '{year}-01-01' AND statpers < '{year + 1}-01-01'
    """
    try:
        yearly_data = conn.raw_sql(query)
        forecast_list.append(yearly_data)
    except Exception as e:
        print(f"Failed to download data for {year}: {e}")

# Merge all data
forecast_data = pd.concat(forecast_list, ignore_index=True)

Downloading data for year 2022...
Downloading data for year 2023...
Downloading data for year 2024...


In [6]:
conn.close()

In [10]:
forecast_data.to_csv(os.path.join(output_path, '2022_2024_tr_ibes.statsum_epsus.csv'), index=False)