# Fetch data for the replication part of the project

In [31]:
import pandas as pd
import polars as pl
import numpy as np
import time
import logging
import wrds
import os
import sys
import sqlite3
from pathlib import Path

path_to_db = os.path.join(Path(os.getcwd()).parent) + "/data/raw/big_set.db"
path_to_external_hd_db = '/media/miroslav/Miroslav Backup/cpop_data/option_prices_raw.db'
path_to_old_db = os.path.join(Path(os.getcwd()).parent) + "/data/db/option_prices.db"

# Set up logging
logging_file = os.path.join(Path(os.getcwd()).parent) + '/logs/data_fetching.log'
logger = logging.getLogger(__name__)
logging.basicConfig(filename=logging_file, encoding='utf-8', level=logging.INFO)

In [32]:
sqlite_conn = sqlite3.connect(path_to_external_hd_db)
old_db = sqlite3.connect(path_to_old_db)
# cur = conn_sqlite.cursor()

In [None]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
wrds_conn = wrds.Connection(wrds_username=os.getenv("WRDS_USERNAME"), wrds_password=os.getenv("WRDS_PASSWORD"))

Loading library list...
Done


## Options and stock prices

In [34]:
option_table = 'opprcd'
forward_table = 'fwdprd'
price_table = 'secprd'
ir_table = 'zerocd'
index_secid=108105

start_hist = pd.Timestamp('2000-01-01')

In [35]:
# query = "SELECT * FROM sp500_constituents" 
# schema_overrides = {
#     'id': pl.Int32,
#     'S': pl.Float64,
#     'sigma': pl.Float64,
#     'tau': pl.Float64,
#     'r': pl.Float64,
#     'K': pl.Float64,
#     'V': pl.Float64,
#     'cp_flag': pl.Utf8}

# sp500_consts_data = pl.read_database(query, old_db)

In [36]:
sp500_const_permno = pd.read_sql("select * from sp500_constituents", old_db)
sp500_const_permno

Unnamed: 0,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,conm,tic,cusip,cik,sic,naics,gsubind,gind
0,76129,3COM CORP,88553510,11,3,3674,COMS,010553,01,1996-01-02,2000-07-27,3COM CORP,COMS.,885535104,0000738076,3576,334119,45201020,452010
1,22592,3M CO,88579Y10,11,1,3841,MMM,007435,01,1957-03-01,2023-12-29,3M CO,MMM,88579Y101,0000066740,9997,999977,20105010,201050
2,10006,A C F INDUSTRIES INC,00080010,10,1,3743,ACF,001010,01,1957-03-01,1984-07-18,ACF INDUSTRIES INC,4165A,00099V004,0000910627,3743,336510,20304010,203040
3,50906,A D C TELECOMMUNICATIONS INC,00088630,11,3,3661,ADCT,001013,01,1999-08-02,2007-06-29,ADC TELECOMMUNICATIONS INC,ADCT.1,000886309,0000061478,3661,334210,45201020,452010
4,50906,A D C TELECOMMUNICATIONS INC,00088610,11,3,3661,ADCT,001013,01,1999-08-02,2007-06-29,ADC TELECOMMUNICATIONS INC,ADCT.1,000886309,0000061478,3661,334210,45201020,452010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,89070,ZIMMER HOLDINGS INC,98956P10,11,1,3842,ZMH,144559,01,2001-08-07,2023-12-29,ZIMMER BIOMET HOLDINGS INC,ZBH,98956P102,0001136869,3842,339113,35101010,351010
2554,84129,ZIONS BANCORPORATION,98970110,11,3,6021,ZION,011687,01,2001-06-25,2023-12-29,ZIONS BANCORPORATION NA,ZION,989701107,0000109380,6020,522110,40101015,401010
2555,84129,ZIONS BANCORPORATION N A,98970110,11,3,6021,ZION,011687,01,2001-06-25,2023-12-29,ZIONS BANCORPORATION NA,ZION,989701107,0000109380,6020,522110,40101015,401010
2556,13788,ZOETIS INC,98978V10,11,1,2834,ZTS,013721,01,2013-06-24,2023-12-29,ZOETIS INC,ZTS,98978V103,0001555280,2834,325412,35202010,352020


In [37]:
secid_permno_link  = pd.read_sql("select * from crsp_opm_link", old_db)
secid_permno_link

Unnamed: 0,secid,sdate,edate,permno
0,5001,1996-01-02,1996-03-13,10074
1,5002,1996-01-01,1996-02-22,10154
2,5004,1996-01-01,2000-01-27,80071
3,5005,1996-01-01,1997-08-12,85041
4,5006,1996-01-01,1996-08-28,10496
...,...,...,...,...
31472,218323,2023-03-16,2023-12-29,88885
31473,218324,2023-03-21,2023-12-29,23796
31474,218325,2023-03-17,2023-12-29,23814
31475,218326,2023-03-20,2023-12-29,23760


In [38]:
sp500_const_permno.start

0       1996-01-02
1       1957-03-01
2       1957-03-01
3       1999-08-02
4       1999-08-02
           ...    
2553    2001-08-07
2554    2001-06-25
2555    2001-06-25
2556    2013-06-24
2557    1982-03-11
Name: start, Length: 2558, dtype: object

In [39]:
sp500_const_permno['start'] = pd.to_datetime(sp500_const_permno['start'])
sp500_const_permno['ending'] = pd.to_datetime(sp500_const_permno['ending'])

In [40]:
unique_sdates = sp500_const_permno.start.unique()
unique_edates = sp500_const_permno.ending.unique()

In [41]:
unique_sdates

<DatetimeArray>
['1996-01-02 00:00:00', '1957-03-01 00:00:00', '1999-08-02 00:00:00',
 '2012-10-01 00:00:00', '1998-10-02 00:00:00', '2011-12-13 00:00:00',
 '2008-07-01 00:00:00', '1967-04-13 00:00:00', '1944-06-07 00:00:00',
 '1999-01-04 00:00:00',
 ...
 '2006-07-18 00:00:00', '1958-02-19 00:00:00', '2004-12-29 00:00:00',
 '1999-11-08 00:00:00', '2011-11-01 00:00:00', '1999-12-08 00:00:00',
 '2000-01-06 00:00:00', '2001-08-07 00:00:00', '2001-06-25 00:00:00',
 '2013-06-24 00:00:00']
Length: 1017, dtype: datetime64[ns]

In [42]:
change_dates = (pd.concat([pd.Series(unique_sdates), pd.Series(unique_edates)])
                .sort_values()
                .reset_index(drop=True)
                .drop_duplicates()
                # remove dates before 
                .loc[lambda x: x >= start_hist]
                .reset_index(drop=True))

In [43]:
change_dates

0     2000-01-03
1     2000-01-05
2     2000-01-06
3     2000-01-28
4     2000-01-31
         ...    
841   2023-10-17
842   2023-10-18
843   2023-12-15
844   2023-12-18
845   2023-12-29
Length: 846, dtype: datetime64[ns]

In [44]:
constituents = pd.DataFrame(columns=['secid_constituents_list'], index=change_dates, dtype='object')
constituents

Unnamed: 0,secid_constituents_list
2000-01-03,
2000-01-05,
2000-01-06,
2000-01-28,
2000-01-31,
...,...
2023-10-17,
2023-10-18,
2023-12-15,
2023-12-18,


In [45]:
constituents

Unnamed: 0,secid_constituents_list
2000-01-03,
2000-01-05,
2000-01-06,
2000-01-28,
2000-01-31,
...,...
2023-10-17,
2023-10-18,
2023-12-15,
2023-12-18,


In [46]:
secid_permno_link.dtypes

secid      int64
sdate     object
edate     object
permno     int64
dtype: object

In [47]:
secid_permno_link.edate = secid_permno_link.edate.astype('datetime64[ns]')
secid_permno_link.sdate = secid_permno_link.sdate.astype('datetime64[ns]')


In [48]:
secid_permno_link.dtypes

secid              int64
sdate     datetime64[ns]
edate     datetime64[ns]
permno             int64
dtype: object

In [49]:
for date in change_dates[:-1]:
    constituents_permnos = sp500_const_permno[(sp500_const_permno['start'] <= date) & (sp500_const_permno['ending'] > date)]
    sp500_const_secids = secid_permno_link[(secid_permno_link.permno.isin(constituents_permnos.permno.values)) & (secid_permno_link.sdate <= date) & (secid_permno_link.edate > date)].secid.values
    print(f'Number of constituents {sp500_const_secids.shape[0]}, date {date}')
    constituents.loc[date] = ','.join(sp500_const_secids.astype(str))

Number of constituents 502, date 2000-01-03 00:00:00
Number of constituents 501, date 2000-01-05 00:00:00
Number of constituents 502, date 2000-01-06 00:00:00
Number of constituents 499, date 2000-01-28 00:00:00
Number of constituents 502, date 2000-01-31 00:00:00
Number of constituents 501, date 2000-03-15 00:00:00
Number of constituents 502, date 2000-03-16 00:00:00
Number of constituents 500, date 2000-03-31 00:00:00
Number of constituents 502, date 2000-04-03 00:00:00
Number of constituents 501, date 2000-04-17 00:00:00
Number of constituents 502, date 2000-04-18 00:00:00
Number of constituents 500, date 2000-05-04 00:00:00
Number of constituents 502, date 2000-05-05 00:00:00
Number of constituents 501, date 2000-05-09 00:00:00
Number of constituents 502, date 2000-05-10 00:00:00
Number of constituents 501, date 2000-05-31 00:00:00
Number of constituents 502, date 2000-06-01 00:00:00
Number of constituents 501, date 2000-06-02 00:00:00
Number of constituents 502, date 2000-06-05 00

In [50]:
constituents

Unnamed: 0,secid_constituents_list
2000-01-03,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-05,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-06,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-28,"5067,5142,5169,5176,5206,5212,5317,5385,5418,5..."
2000-01-31,"5067,5142,5169,5176,5206,5212,5317,5385,5418,5..."
...,...
2023-10-17,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-10-18,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-12-15,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-12-18,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."


In [51]:
constituents = constituents.dropna()
constituents

Unnamed: 0,secid_constituents_list
2000-01-03,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-05,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-06,"5067,5142,5169,5176,5206,5212,5237,5317,5385,5..."
2000-01-28,"5067,5142,5169,5176,5206,5212,5317,5385,5418,5..."
2000-01-31,"5067,5142,5169,5176,5206,5212,5317,5385,5418,5..."
...,...
2023-10-02,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-10-17,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-10-18,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-12-15,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."


In [52]:
constituents.index = constituents.index.map(lambda x: x.date())

In [53]:
[i for i in range(2002,2021)]

[2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020]

In [54]:
# TODO
# go through all years, 
# for each year, get unique dates for which data is availalbe
# for each day, fetch data for all constituents using `constituents` strings for secids
# filterout as much as possible in the query
# save to sqlite


# for year in range(2020, 2024):
#     option_table = db.raw_sql(f"""select * from optionm.opprcd{year} 
#     where date>'2020-11-11'::DATE
#     and date<'2021-02-12'::DATE
#     and secid in {secids}""")
#     stock_table = db.raw_sql(f"""select * from optionm.secprd{year} 
#     where date>'2020-11-11'::DATE
#     and date<'2021-02-12'::DATE
#     and secid in {secids}""")

#     option_table.to_sql(name='option_price_table', con=conn_sqlite, if_exists='append', index=False)
#     stock_table.to_sql(name='stock_price_table', con=conn_sqlite, if_exists='append', index=False)


In [55]:
years_range = range(2015,2024)


In [56]:
option_table

'opprcd'

In [57]:
for year in [2020]:
    start_year = time.time()
    logging.info(f'Fetching dates for year {year}')
    print(f"""select distinct date from optionm.{option_table}{year}""")
    unique_dates = np.sort(wrds_conn.raw_sql(f"""select distinct date from optionm.{option_table}{year}""").values)
    logging.info(f'Unique dates for year {year} fetched in {time.time()-start_year} s')
    for date in unique_dates[:2]:
        # unpack array in the array
        temp_date = date[0]
        date_str = temp_date.strftime('%Y-%m-%d')

        last_available_constituents = (
            constituents
            .loc[constituents.index[constituents.index <= temp_date][-1]]
            .values[0]
        )
        
        start_inner = time.time()
        logging.info(f'Fetching data for date {date_str}')

        option_table_df = wrds_conn.raw_sql(f"""
        SELECT secid, date, exdate, last_date, best_bid, best_offer, volume, open_interest, cp_flag, impl_volatility, strike_price 
        FROM optionm.opprcd{year} 
        WHERE date='{date_str}'::DATE 
        AND secid in ({last_available_constituents}) 
        AND secid IS NOT NULL  
        AND date IS NOT NULL 
        AND exdate IS NOT NULL
        AND last_date IS NOT NULL 
        AND best_bid IS NOT NULL 
        AND best_offer IS NOT NULL 
        AND volume IS NOT NULL 
        AND open_interest IS NOT NULL 
        AND cp_flag IS NOT NULL 
        AND impl_volatility IS NOT NULL 
        AND strike_price IS NOT NULL 
        AND volume > 0 
        AND open_interest > 0 
        AND best_bid > 0  
        AND (exdate::DATE - date::DATE) > 9 
        AND (exdate::DATE - date::DATE) < 730 
        AND (date::DATE - last_date::DATE) < 5
        """)

        logging.info(f'Option data for {date_str} fetched in {(time.time()-start_inner):.2f} s')

        start_inner_stocks = time.time()

        stock_table_df = wrds_conn.raw_sql(f"""select * from optionm.secprd{year} 
        where date='{date_str}'::DATE 
        and secid in ({last_available_constituents})""")

        logging.info(f'Stock data for {date_str} fetched in {(time.time()-start_inner_stocks)/60:.2f} minutes')

        start_writing = time.time()
        option_table_df.to_sql(name='option_price_table', con=sqlite_conn, if_exists='append', index=False)
        stock_table_df.to_sql(name='stock_price_table', con=sqlite_conn, if_exists='append', index=False)
        logging.info(f'Data for {date_str} written in {(time.time()-start_writing):.2f} s')

select distinct date from optionm.opprcd2020


In [58]:
unique_dates[0][0].strftime('%Y-%m-%d')

'2020-01-02'

In [59]:
unique_dates

array([[datetime.date(2020, 1, 2)],
       [datetime.date(2020, 1, 3)],
       [datetime.date(2020, 1, 6)],
       [datetime.date(2020, 1, 7)],
       [datetime.date(2020, 1, 8)],
       [datetime.date(2020, 1, 9)],
       [datetime.date(2020, 1, 10)],
       [datetime.date(2020, 1, 13)],
       [datetime.date(2020, 1, 14)],
       [datetime.date(2020, 1, 15)],
       [datetime.date(2020, 1, 16)],
       [datetime.date(2020, 1, 17)],
       [datetime.date(2020, 1, 21)],
       [datetime.date(2020, 1, 22)],
       [datetime.date(2020, 1, 23)],
       [datetime.date(2020, 1, 24)],
       [datetime.date(2020, 1, 27)],
       [datetime.date(2020, 1, 28)],
       [datetime.date(2020, 1, 29)],
       [datetime.date(2020, 1, 30)],
       [datetime.date(2020, 1, 31)],
       [datetime.date(2020, 2, 3)],
       [datetime.date(2020, 2, 4)],
       [datetime.date(2020, 2, 5)],
       [datetime.date(2020, 2, 6)],
       [datetime.date(2020, 2, 7)],
       [datetime.date(2020, 2, 10)],
       [date