In [3]:
import wrds, datetime
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'

conn = wrds.Connection()

Loading library list...
Done


# DATA COLLECTION

## SNP and RFR Data

In [4]:
query_start_date = '1990-01-01'

In [5]:
sp500_data = conn.raw_sql(f"""
SELECT
    caldt AS date,
    sprtrn AS daily_return -- Equal-Weighted Return (includes distributions) (ewretd)
FROM
    crspq.dsp500
WHERE
    caldt >= '{query_start_date}'
ORDER BY
    caldt;
""")
sp500_data['date'] = pd.to_datetime(sp500_data['date'])

sp500_data

Unnamed: 0,date,daily_return
0,1990-01-02,0.017799
1,1990-01-03,-0.002586
2,1990-01-04,-0.008613
3,1990-01-05,-0.009756
4,1990-01-08,0.004514
...,...,...
8748,2024-09-24,0.002511
8749,2024-09-25,-0.001861
8750,2024-09-26,0.004039
8751,2024-09-27,-0.001253


In [6]:
risk_free_data = conn.raw_sql(f'''
    SELECT
        date,
        rf AS daily_rf_rate -- One Month Treasury Bill Rate (daily)
    FROM
        ff.factors_daily
    WHERE
        date >= '{query_start_date}'
''')

risk_free_data['date'] = pd.to_datetime(risk_free_data['date'])

risk_free_data

Unnamed: 0,date,daily_rf_rate
0,1990-01-02,0.00026
1,1990-01-03,0.00026
2,1990-01-04,0.00026
3,1990-01-05,0.00026
4,1990-01-08,0.00026
...,...,...
8771,2024-10-25,0.00017
8772,2024-10-28,0.00017
8773,2024-10-29,0.00017
8774,2024-10-30,0.00017


In [9]:
last_trading_days = conn.raw_sql(f'''
SELECT DISTINCT
    MAX(date) AS last_trade_date
FROM crsp.dsf
WHERE date >= '{query_start_date}'
GROUP BY DATE_TRUNC('month', date)
ORDER BY last_trade_date
''')

last_trading_days['last_trade_date'] = pd.to_datetime(last_trading_days['last_trade_date'])
last_trading_days['month'] = last_trading_days['last_trade_date'].dt.month
last_trading_days['year'] = last_trading_days['last_trade_date'].dt.year
last_trading_day_mapping = last_trading_days.set_index(['year', 'month'])['last_trade_date'].to_dict()

## Choosing Companies

In [11]:
# retrieves ID (permno) and earliest recorded name for each company in CRSP database
# 37,776 companies available

comps = conn.raw_sql( '''
SELECT permno, MIN(comnam) AS company_name
FROM crsp.stocknames
GROUP BY permno
''')
comps

Unnamed: 0,permno,company_name
0,83264,GREIF BROTHERS CORP
1,63618,HINDERLITER ENERGY EQUIP CORP
2,10896,CAMILLE ST MORITZ INC
3,69906,SEIBELS BRUCE GROUP INC
4,79030,GREAT CENTRAL MINES LTD
...,...,...
37771,14886,ARK E T F TRUST
37772,79163,A M F M INC
37773,86036,BRENTWOOD INSTRUMENTS INC
37774,92970,CHINA EDUCATION ALLIANCE INC


In [None]:
#TODO: choose a way to narrow the above list of permnos to <500. Will then use that list with the following functions to gather fin data
#this will go away when the above is completed.

company_search = 'AMAZON' # FIND A COMPANY HERE

comps[comps['company_name'].str.contains(f'{company_search}')]

Unnamed: 0,permno,company_name
9682,84788,AMAZON COM INC


## Get Unique Identifiers

In [14]:
def get_gvkey(permno):
    link = conn.raw_sql(f'''
    SELECT *
    FROM crsp.ccmxpf_linktable
    WHERE lpermno = {permno};
    ''')
    
    return link['gvkey'][0]

In [24]:
#permnos = [10107, 86580, 84788]
# BELOW IS THE LIST OF PERMNOs FOR TEN COMPANIES

query = """
select distinct on (a.permno) a.permno, a.permco, a.cusip, a.comnam
from crsp.stocknames a
where a.comnam ilike '%%alphabet%%' and a.cusip like '02079K30'
    or a.comnam ilike '%%amazon%%'
    or a.comnam ilike 'apple inc%%'
    or a.comnam ilike '%%meta platforms%%'
    or a.comnam ilike '%%microsoft%%'
    or a.comnam ilike '%%nvidia%%'
    or a.comnam ilike '%%tesla inc%%'
    or a.comnam ilike '%%netflix%%'
    or a.comnam ilike '%%walmart%%'
    or a.comnam ilike '%%pfizer%%'
;
"""

ids = conn.raw_sql(query)
permnos = ids['permno'].to_list()

gvkeys = [get_gvkey(permno) for permno in permnos]
ids['gvkey'] = gvkeys

ids

Unnamed: 0,permno,permco,cusip,comnam,gvkey
0,10107,8048,59491810,MICROSOFT CORP,12141
1,13407,54084,30303M10,META PLATFORMS INC,170617
2,14593,7,03783310,APPLE INC,1690
3,21936,21394,71708110,PFIZER INC,8530
4,55976,21880,93114210,WALMART INC,11259
5,84788,15473,02313510,AMAZON COM INC,64768
6,86580,16382,67066G10,NVIDIA CORP,117768
7,89393,43145,64110L10,NETFLIX INC,147579
8,90319,45483,02079K30,ALPHABET INC,160329
9,93436,53453,88160R10,TESLA INC,184996


## Query Company Data

In [21]:
def get_company_financials(gvkey):
    company_fin_data = conn.raw_sql(f'''
    SELECT
        datadate AS date,
        gvkey AS gvkey,
        rdq AS reporting_date, -- Date of which information was reported
        atq AS total_assets,  -- Total Assets
        chq AS cash_holdings,  -- Cash and Short-Term Investments
        dlttq + dlcq AS total_debt,  -- Total Debt (long-term + short-term debt)
        ibq AS earnings,  -- Earnings before extraordinary items
        xrdq AS rd_expense,  -- R&D expense
        dvpq AS dividends_paid,  -- Dividends paid
        xintq AS interest_expense  -- Interest expense
    FROM
        comp.fundq
    WHERE
        gvkey = '{gvkey}'
        AND datadate >= '{query_start_date}'  -- Ensure data is after the link start date
    ''')
    
    company_fin_data['date'] = pd.to_datetime(company_fin_data['date'])
    company_fin_data['reporting_date'] = pd.to_datetime(company_fin_data['reporting_date'])
    
    start_date = company_fin_data['date'].min()
    end_date = company_fin_data['date'].max()
    all_months = pd.date_range(start=start_date, end=end_date, freq='ME')
    full_range_df = pd.DataFrame({'date': all_months})
    company_fin_data = pd.merge(full_range_df, company_fin_data, on='date', how='left')
    
    company_fin_data['month_start'] = company_fin_data['date'] - pd.offsets.MonthBegin()
    company_fin_data['month_end'] = company_fin_data['date']
    
    return company_fin_data

In [26]:
get_company_financials(ids['gvkey'][5])

Unnamed: 0,date,gvkey,reporting_date,total_assets,cash_holdings,total_debt,earnings,rd_expense,dividends_paid,interest_expense,month_start,month_end
0,1995-03-31,064768,NaT,,,,,,,,1995-03-01,1995-03-31
1,1995-04-30,,NaT,,,,,,,,1995-04-01,1995-04-30
2,1995-05-31,,NaT,,,,,,,,1995-05-01,1995-05-31
3,1995-06-30,064768,NaT,,,,,,,,1995-06-01,1995-06-30
4,1995-07-31,,NaT,,,,,,,,1995-07-01,1995-07-31
...,...,...,...,...,...,...,...,...,...,...,...,...
350,2024-05-31,,NaT,,,,,,,,2024-05-01,2024-05-31
351,2024-06-30,064768,2024-08-01,554818.0,71178.0,157842.0,13485.0,22304.0,0.0,589.0,2024-06-01,2024-06-30
352,2024-07-31,,NaT,,,,,,,,2024-07-01,2024-07-31
353,2024-08-31,,NaT,,,,,,,,2024-08-01,2024-08-31
