# Preprocess data for the replication part of the project

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import time
import logging
import wrds
import os
import sys
import sqlite3
from pathlib import Path

# All project-relative paths are anchored at the parent of the current working
# directory; the paths are kept as plain strings.
_project_root = os.fspath(Path(os.getcwd()).parent)

path_to_db = f"{_project_root}/data/raw/big_set.db"
path_to_db96 = f"{_project_root}/data/raw/big_set_96-99.db"
path_to_old_db = f"{_project_root}/data/db/option_prices_backup.db"
# The raw option-price database is addressed by an absolute path on an external drive.
path_to_external_hd_db = '/media/miroslav/Miroslav Backup/cpop_data/option_prices_raw.db'


# Set up logging
# The log file lives in <parent of cwd>/logs. The folder is created up front
# because logging.basicConfig(filename=...) opens the file immediately and
# raises FileNotFoundError when the directory does not exist.
logging_file = os.path.join(Path(os.getcwd()).parent) + '/logs/data_fetching.log'
os.makedirs(os.path.dirname(logging_file), exist_ok=True)
logger = logging.getLogger(__name__)
logging.basicConfig(filename=logging_file, encoding='utf-8', level=logging.INFO)

In [2]:
# Open the SQLite database at `path_to_db96` (presumably the 1996-99 slice,
# judging by the file name "big_set_96-99.db").
sqlite_conn96 = sqlite3.connect(path_to_db96)

In [9]:
# `sqlite_conn` -> raw option-price database on the external drive.
# `old_db`      -> backup database; the S&P 500 constituent table and the
#                  CRSP/OptionMetrics link table are read from it further down.
sqlite_conn = sqlite3.connect(path_to_external_hd_db)
old_db = sqlite3.connect(path_to_old_db)
# cur = conn_sqlite.cursor()

In [None]:
# Load environment variables from a .env file (presumably the WRDS credentials
# that are read with os.getenv when the WRDS connection is opened).
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# Connect to WRDS; username and password are taken from the WRDS_USERNAME and
# WRDS_PASSWORD environment variables rather than being hard-coded.
wrds_conn = wrds.Connection(wrds_username=os.getenv("WRDS_USERNAME"), wrds_password=os.getenv("WRDS_PASSWORD"))

Loading library list...
Done


## Data cleaning steps

### 1. Cleaning on the WRDS server, before fetching
At each timestep, we fetch data only for the SP500 constituents on that day.

The following query is used to fetch data from the WRDS server, and it is self-explanatory.

```{sql}
SELECT secid, date, exdate, last_date, best_bid, best_offer, volume, open_interest, cp_flag, impl_volatility, strike_price 
FROM optionm.opprcd{year} 
WHERE date='{date_str}'::DATE 
AND secid in ({last_available_constituents}) 

AND secid IS NOT NULL  
AND date IS NOT NULL 
AND exdate IS NOT NULL
AND last_date IS NOT NULL 
AND best_bid IS NOT NULL 
AND best_offer IS NOT NULL 
AND volume IS NOT NULL 
AND open_interest IS NOT NULL 
AND cp_flag IS NOT NULL 
AND impl_volatility IS NOT NULL 
AND strike_price IS NOT NULL 

AND volume > 0 
AND open_interest > 0 
AND best_bid > 0  
AND (exdate::DATE - date::DATE) > 9 
AND (exdate::DATE - date::DATE) < 730 
AND (date::DATE - last_date::DATE) < 5
```

The last conditions filter out options with zero volume, zero open interest, or a zero bid price (a zero bid means the option was not quoted on that day, so we cannot calculate the price of the option).
The condition `(exdate::DATE - date::DATE) > 9` filters out options that are too close to expiration (9 days or fewer), and `(exdate::DATE - date::DATE) < 730` filters out options that are 2 years (730 days) or more from expiration. The condition `(date::DATE - last_date::DATE) < 5` filters out options whose last trade was 5 or more days before the quote date (there are points with positive volume and `(date::DATE - last_date::DATE) > 0`; these make up about 1.5% of the raw dataset).

### 2. Cleaning on the local machine, after fetching


## Options and stock prices

In [5]:
# Table names used by this project. `opprcd` (suffixed with the year) and
# `zerocd` are the ones queried from the WRDS `optionm` library in this notebook.
option_table = "opprcd"
forward_table = "fwdprd"
price_table = "secprd"
ir_table = "zerocd"

# NOTE(review): presumably the secid of the index itself - confirm.
index_secid = 108105

# Dates earlier than this are discarded when the change dates are built.
start_hist = pd.Timestamp("1996-01-01")

In [6]:
# query = "SELECT * FROM sp500_constituents" 
# schema_overrides = {
#     'id': pl.Int32,
#     'S': pl.Float64,
#     'sigma': pl.Float64,
#     'tau': pl.Float64,
#     'r': pl.Float64,
#     'K': pl.Float64,
#     'V': pl.Float64,
#     'cp_flag': pl.Utf8}

# sp500_consts_data = pl.read_database(query, old_db)

In [10]:
# Load the full `sp500_constituents` table from the backup database. Each row
# carries a permno together with the `start`/`ending` dates used below to decide
# membership on a given day.
sp500_const_permno = pd.read_sql("select * from sp500_constituents", old_db)
sp500_const_permno

Unnamed: 0,permno,comnam,ncusip,shrcd,exchcd,hsiccd,ticker,gvkey,iid,start,ending,conm,tic,cusip,cik,sic,naics,gsubind,gind
0,76129,3COM CORP,88553510,11,3,3674,COMS,010553,01,1996-01-02,2000-07-27,3COM CORP,COMS.,885535104,0000738076,3576,334119,45201020,452010
1,22592,3M CO,88579Y10,11,1,3841,MMM,007435,01,1957-03-01,2023-12-29,3M CO,MMM,88579Y101,0000066740,9997,999977,20105010,201050
2,10006,A C F INDUSTRIES INC,00080010,10,1,3743,ACF,001010,01,1957-03-01,1984-07-18,ACF INDUSTRIES INC,4165A,00099V004,0000910627,3743,336510,20304010,203040
3,50906,A D C TELECOMMUNICATIONS INC,00088630,11,3,3661,ADCT,001013,01,1999-08-02,2007-06-29,ADC TELECOMMUNICATIONS INC,ADCT.1,000886309,0000061478,3661,334210,45201020,452010
4,50906,A D C TELECOMMUNICATIONS INC,00088610,11,3,3661,ADCT,001013,01,1999-08-02,2007-06-29,ADC TELECOMMUNICATIONS INC,ADCT.1,000886309,0000061478,3661,334210,45201020,452010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2553,89070,ZIMMER HOLDINGS INC,98956P10,11,1,3842,ZMH,144559,01,2001-08-07,2023-12-29,ZIMMER BIOMET HOLDINGS INC,ZBH,98956P102,0001136869,3842,339113,35101010,351010
2554,84129,ZIONS BANCORPORATION,98970110,11,3,6021,ZION,011687,01,2001-06-25,2023-12-29,ZIONS BANCORPORATION NA,ZION,989701107,0000109380,6020,522110,40101015,401010
2555,84129,ZIONS BANCORPORATION N A,98970110,11,3,6021,ZION,011687,01,2001-06-25,2023-12-29,ZIONS BANCORPORATION NA,ZION,989701107,0000109380,6020,522110,40101015,401010
2556,13788,ZOETIS INC,98978V10,11,1,2834,ZTS,013721,01,2013-06-24,2023-12-29,ZOETIS INC,ZTS,98978V103,0001555280,2834,325412,35202010,352020


In [11]:
# Load the `crsp_opm_link` table: one row per (secid, permno) pairing with the
# `sdate`/`edate` window that is used below to pick the link valid on a date.
secid_permno_link  = pd.read_sql("select * from crsp_opm_link", old_db)
secid_permno_link

Unnamed: 0,secid,sdate,edate,permno
0,5001,1996-01-02,1996-03-13,10074
1,5002,1996-01-01,1996-02-22,10154
2,5004,1996-01-01,2000-01-27,80071
3,5005,1996-01-01,1997-08-12,85041
4,5006,1996-01-01,1996-08-28,10496
...,...,...,...,...
31472,218323,2023-03-16,2023-12-29,88885
31473,218324,2023-03-21,2023-12-29,23796
31474,218325,2023-03-17,2023-12-29,23814
31475,218326,2023-03-20,2023-12-29,23760


In [12]:
# Inspect the `start` column as loaded: it is still dtype object (date strings),
# not datetimes.
sp500_const_permno.start

0       1996-01-02
1       1957-03-01
2       1957-03-01
3       1999-08-02
4       1999-08-02
           ...    
2553    2001-08-07
2554    2001-06-25
2555    2001-06-25
2556    2013-06-24
2557    1982-03-11
Name: start, Length: 2558, dtype: object

In [13]:
# Parse both membership-window columns into pandas datetimes.
for _col in ('start', 'ending'):
    sp500_const_permno[_col] = pd.to_datetime(sp500_const_permno[_col])

In [14]:
# Distinct membership start and end dates across all constituent rows.
unique_sdates = sp500_const_permno['start'].unique()
unique_edates = sp500_const_permno['ending'].unique()

In [15]:
# Show the distinct start dates (in order of first appearance, not sorted).
unique_sdates

<DatetimeArray>
['1996-01-02 00:00:00', '1957-03-01 00:00:00', '1999-08-02 00:00:00',
 '2012-10-01 00:00:00', '1998-10-02 00:00:00', '2011-12-13 00:00:00',
 '2008-07-01 00:00:00', '1967-04-13 00:00:00', '1944-06-07 00:00:00',
 '1999-01-04 00:00:00',
 ...
 '2006-07-18 00:00:00', '1958-02-19 00:00:00', '2004-12-29 00:00:00',
 '1999-11-08 00:00:00', '2011-11-01 00:00:00', '1999-12-08 00:00:00',
 '2000-01-06 00:00:00', '2001-08-07 00:00:00', '2001-06-25 00:00:00',
 '2013-06-24 00:00:00']
Length: 1017, dtype: datetime64[ns]

In [16]:
# Every date on which the constituent list can change: the union of all start
# and end dates, de-duplicated, sorted ascending, and restricted to dates on or
# after `start_hist`. The result is a Series with a fresh 0..n-1 index.
_all_boundary_dates = pd.concat(
    [pd.Series(unique_sdates), pd.Series(unique_edates)],
    ignore_index=True,
)
_sorted_unique_dates = _all_boundary_dates.drop_duplicates().sort_values()
change_dates = _sorted_unique_dates[_sorted_unique_dates >= start_hist].reset_index(drop=True)

In [17]:
# Show the sorted change dates.
change_dates

0      1996-01-02
1      1996-01-19
2      1996-01-22
3      1996-02-09
4      1996-02-12
          ...    
1055   2023-10-17
1056   2023-10-18
1057   2023-12-15
1058   2023-12-18
1059   2023-12-29
Length: 1060, dtype: datetime64[ns]

In [18]:
# Empty frame indexed by change date; its single object column will hold, for
# each date, the comma-separated list of constituent secids filled in below.
constituents = pd.DataFrame(columns=['secid_constituents_list'], index=change_dates, dtype='object')
constituents

Unnamed: 0,secid_constituents_list
1996-01-02,
1996-01-19,
1996-01-22,
1996-02-09,
1996-02-12,
...,...
2023-10-17,
2023-10-18,
2023-12-15,
2023-12-18,


In [20]:
# Check column types before conversion: sdate/edate are loaded as object (strings).
secid_permno_link.dtypes

secid      int64
sdate     object
edate     object
permno     int64
dtype: object

In [21]:
# Convert the link-validity window columns from strings to datetime64[ns] so
# they can be compared with the change dates.
for _col in ('edate', 'sdate'):
    secid_permno_link[_col] = secid_permno_link[_col].astype('datetime64[ns]')


In [22]:
# Confirm that sdate/edate are now datetime64[ns].
secid_permno_link.dtypes

secid              int64
sdate     datetime64[ns]
edate     datetime64[ns]
permno             int64
dtype: object

In [23]:
# For every change date except the last one, find the S&P 500 members on that
# date (by permno), translate them to secids through the link rows that are
# valid on the same date, and store the secids as one comma-separated string.
# NOTE(review): both upper bounds are strict (`ending > date`, `edate > date`),
# so a row is not counted on its own end date - confirm this is intended.
for date in change_dates[:-1]:
    _is_member = (sp500_const_permno['start'] <= date) & (sp500_const_permno['ending'] > date)
    constituents_permnos = sp500_const_permno[_is_member]

    _link_is_valid = (
        secid_permno_link.permno.isin(constituents_permnos.permno.values)
        & (secid_permno_link.sdate <= date)
        & (secid_permno_link.edate > date)
    )
    sp500_const_secids = secid_permno_link[_link_is_valid].secid.values

    print(f'Number of constituents {sp500_const_secids.shape[0]}, date {date}')
    constituents.loc[date] = ','.join(sp500_const_secids.astype(str))

Number of constituents 503, date 1996-01-02 00:00:00
Number of constituents 502, date 1996-01-19 00:00:00
Number of constituents 503, date 1996-01-22 00:00:00
Number of constituents 502, date 1996-02-09 00:00:00
Number of constituents 503, date 1996-02-12 00:00:00
Number of constituents 502, date 1996-03-07 00:00:00
Number of constituents 503, date 1996-03-08 00:00:00
Number of constituents 502, date 1996-03-12 00:00:00
Number of constituents 503, date 1996-03-13 00:00:00
Number of constituents 502, date 1996-03-27 00:00:00
Number of constituents 503, date 1996-03-28 00:00:00
Number of constituents 501, date 1996-03-29 00:00:00
Number of constituents 503, date 1996-04-01 00:00:00
Number of constituents 502, date 1996-04-22 00:00:00
Number of constituents 503, date 1996-04-23 00:00:00
Number of constituents 503, date 1996-05-30 00:00:00
Number of constituents 504, date 1996-05-31 00:00:00
Number of constituents 502, date 1996-07-18 00:00:00
Number of constituents 502, date 1996-07-19 00

In [25]:
# Drop rows that were never filled in; the loop above skips the last change
# date, so at least that row is still empty.
constituents = constituents.dropna()
constituents

Unnamed: 0,secid_constituents_list
1996-01-02,"5015,5022,5029,5036,5046,5048,5049,5056,5058,5..."
1996-01-19,"5015,5022,5029,5036,5046,5048,5049,5056,5058,5..."
1996-01-22,"5015,5022,5029,5036,5046,5048,5049,5056,5058,5..."
1996-02-09,"5015,5022,5029,5036,5046,5048,5049,5056,5058,5..."
1996-02-12,"5015,5022,5029,5036,5046,5048,5049,5056,5058,5..."
...,...
2023-10-02,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-10-17,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-10-18,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."
2023-12-15,"5505,5519,5876,5916,6426,6514,7308,8420,100892..."


In [26]:
# Replace each index entry with the result of its `.date()` call, i.e. strip the
# time-of-day part from the timestamps.
constituents.index = constituents.index.map(lambda x: x.date())

In [28]:
# Add IR data
# Download the whole interest-rate table from the WRDS `optionm` library and
# store it in the SQLite database behind `sqlite_conn`, replacing any existing
# copy. The table name comes from `ir_table` so that the source and destination
# names cannot drift apart from the constant defined earlier in this notebook.
ir = wrds_conn.get_table(library='optionm', table=ir_table)
ir.to_sql(ir_table, con=sqlite_conn, if_exists='replace', index=False)

298801

# Split data by years and store in parquet files

In [29]:
# Switch `sqlite_conn` to the "live" database on the external drive.
# NOTE(review): this rebinds `sqlite_conn`; the connection to the raw database
# opened earlier under the same name is not closed first.
path_to_live_external_hd_db = '/media/miroslav/Miroslav Backup/cpop_data/option_prices_live.db'
sqlite_conn = sqlite3.connect(path_to_live_external_hd_db)


In [None]:
# sqlite_conn.close()

In [30]:
# List the names of all tables in a SQLite database via `sqlite_master`.
# NOTE(review): this inspects `sqlite_conn96`, whereas the data reads that
# follow go through `sqlite_conn` - confirm which connection was intended.
query = """
SELECT name
FROM sqlite_master
WHERE type='table'
"""

tables = pd.read_sql(query, sqlite_conn96)

In [31]:
# Show the table names returned by the query.
tables

Unnamed: 0,name
0,option_price_table
1,stock_price_table
2,zerocd
3,interpolated_data
4,clean_table


In [None]:
# Read every row of `clean_table` dated before 2001-01-01 into a polars frame.
query = """
SELECT *
FROM clean_table 
WHERE date<'2001-01-01'
"""

# Explicit schema left over from an earlier attempt; not applied to the read below.
# schema_overrides = {
#     'secid': pl.Int32,
#     'date': pl.Date,
#     'S': pl.Float64,
#     'sigma': pl.Float64,
#     'tau': pl.Float64,
#     'r': pl.Float64,
#     'cp_flag': pl.Utf8,
#     'V': pl.Float64
#     }

# sample_data = pl.read_database(query, sqlite_conn96)#, schema_overrides=schema_overrides, infer_schema_length=None)
sample_data = pl.read_database(query, sqlite_conn)

In [None]:
# Fix up column types in one pass:
#   - `date`    -> Date
#   - `secid`   -> Int32
#   - `cp_flag` -> Boolean, True when the flag is 'C' and False otherwise
#                  (e.g. 'P'); a null comparison result is mapped to False.
sample_data = sample_data.with_columns(
    [
        pl.col('date').cast(pl.Date),
        pl.col('secid').cast(pl.Int32),
        (pl.col('cp_flag') == 'C').fill_null(False).alias('cp_flag'),
    ]
)


In [None]:
# Show the typed sample: secid, date, S, sigma, tau, r, cp_flag, V.
sample_data

secid,date,S,sigma,tau,r,cp_flag,V
i32,date,f64,f64,f64,f64,bool,f64
5067,2000-01-03,1.121667,0.572145,0.052055,5.925314,false,0.014167
5067,2000-01-03,1.0515625,0.564826,0.052055,5.925314,false,0.030469
5067,2000-01-03,0.84125,0.452223,0.052055,5.925314,true,0.001875
5067,2000-01-03,0.934722,0.523723,0.052055,5.925314,true,0.020833
5067,2000-01-03,0.989706,0.54661,0.052055,5.925314,true,0.044853
…,…,…,…,…,…,…,…
112507,2000-12-29,1.173229,0.538585,0.309589,6.218447,true,0.246429
112507,2000-12-29,1.026575,0.531451,0.309589,6.218447,true,0.1421875
112507,2000-12-29,0.912511,0.495321,0.558904,6.020147,true,0.1125
112507,2000-12-29,1.026575,0.543513,0.060274,6.654328,false,0.040625


In [None]:
# Drop the identifier columns (secid, date); the remaining columns are
# S, sigma, tau, r, cp_flag and V.
sample_data = sample_data.drop(['secid', 'date'])

In [41]:
# Show the directory that all project-relative paths are built from
# (the parent of the current working directory).
os.path.join(Path(os.getcwd()).parent)

'/home/miroslav/Documents/projects/applied_qf/conformal-op'

In [None]:
# loop through years and write data to parquet

# Export `clean_table` one calendar year at a time: read the year's rows, drop
# the identifier columns, convert `cp_flag` to a boolean (True for 'C', False
# otherwise, nulls included) and write sample_data_{year}.parquet.
# NOTE(review): the query has no ORDER BY, so rows are written in whatever
# order SQLite returns them.

# Loop-invariant output folder; created up front so the first write cannot fail
# on a missing directory.
output_dir = Path(os.getcwd()).parent / 'data' / 'processed' / 'real_walk_fwd'
output_dir.mkdir(parents=True, exist_ok=True)

for year in range(1996, 2024):
    # `year` comes from range(), so interpolating it into the SQL text is safe.
    query = f"""
    SELECT *
    FROM clean_table 
    WHERE date>='{year}-01-01'
    AND date<'{year+1}-01-01'
    """
    sample_data = (
        pl.read_database(query, sqlite_conn)
        .drop(['secid', 'date'])
        .with_columns([(pl.col('cp_flag') == 'C').fill_null(False).alias('cp_flag')])
    )
    sample_data.write_parquet(output_dir / f'sample_data_{year}.parquet')

In [48]:
# Read back the 2023 file from the yearly parquet export.
sample_data = pl.read_parquet(os.path.join(Path(os.getcwd()).parent) + '/data/processed/real_walk_fwd/sample_data_2023.parquet')

In [49]:
# Show the 2023 export: S, sigma, tau, r, cp_flag (bool) and V.
sample_data

S,sigma,tau,r,cp_flag,V
f64,f64,f64,f64,bool,f64
1.068358,0.267878,0.065753,4.079103,false,0.00597
0.967297,0.227169,0.084932,4.116778,true,0.013851
0.941842,0.226045,0.084932,4.116778,true,0.006908
1.068358,0.276102,0.084932,4.116778,false,0.008955
0.89475,0.216716,0.621918,4.847913,true,0.028125
…,…,…,…,…,…
0.992222,0.257706,0.041096,5.0384935,false,0.023889
0.976719,0.260331,0.041096,5.0384935,false,0.033359
0.961692,0.263865,0.041096,5.0384935,false,0.044231
0.947121,0.267222,0.041096,5.0384935,false,0.056061


## Save dates and secid

We later realized we need this for the analysis part, so we save it here.

In [50]:
# For each year, write only the `secid` and `date` columns of `clean_table`
# (typed as Int32 and Date) to sample_data_{year}_dates_secids.parquet.
# NOTE(review): this loop starts at 2000 while the feature export loop starts
# at 1996 - confirm the difference is intended.
# NOTE(review): if these files are meant to line up row-by-row with
# sample_data_{year}.parquet, be aware that neither query has an ORDER BY, so
# identical row order is not guaranteed - confirm.

# Loop-invariant output folder; created up front so the first write cannot fail
# on a missing directory.
output_dir = Path(os.getcwd()).parent / 'data' / 'processed' / 'real_walk_fwd'
output_dir.mkdir(parents=True, exist_ok=True)

for year in range(2000, 2024):
    # `year` comes from range(), so interpolating it into the SQL text is safe.
    query = f"""
    SELECT secid, date
    FROM clean_table 
    WHERE date>='{year}-01-01'
    AND date<'{year+1}-01-01'
    """
    sample_data = pl.read_database(query, sqlite_conn).with_columns(
        [
            pl.col('date').cast(pl.Date),
            pl.col('secid').cast(pl.Int32),
        ]
    )
    sample_data.write_parquet(output_dir / f'sample_data_{year}_dates_secids.parquet')