In [6]:
import pandas as pd 
import numpy as np

import os
from tqdm import tqdm

In [7]:
directory = 'data/prices_1_day'
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
csv_files[:5]

['ABI_daily_stock_prices.csv',
 'EMC_daily_stock_prices.csv',
 'TROW_daily_stock_prices.csv',
 'ASN_daily_stock_prices.csv',
 'MZIAQ_daily_stock_prices.csv']

In [13]:
def build_one_df(directory:str, core_ticker:str) -> pd.DataFrame:

    csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
    tickers = [f.split('_')[0] for f in csv_files]
    df_list = [pd.read_csv(os.path.join(directory, file)) for file in tqdm(csv_files)]

    core_frame = pd.read_csv(os.path.join(directory,[_ for _ in csv_files if core_ticker in _][0]))
    
    ohlc_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Stock Splits', 'in_sp500']

    for i in tqdm(range(len(df_list))):
        if (tickers[i] != core_ticker) and (df_list[i].empty == False):
            frame = df_list[i][['Date'] + ohlc_columns]
            frame.columns = ['Date'] + [f'{tickers[i]}_{col}' for col in ohlc_columns]
            core_frame = core_frame.merge(frame, how='left', on=['Date'])

    return core_frame



In [10]:
csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
tickers = [f.split('_')[0] for f in csv_files]
df_list = [pd.read_csv(os.path.join(directory, file)) for file in tqdm(csv_files)]
df_list[0]

100%|██████████| 1163/1163 [00:10<00:00, 110.36it/s]


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,in_sp500


In [12]:
df_list[20]

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits,in_sp500
0,2000-01-03,PAYX,26.666668,26.750000,24.916668,26.666668,3018300,0.0,0.0,1
1,2000-01-04,PAYX,25.375000,27.000000,24.166668,24.666668,2965650,0.0,0.0,1
2,2000-01-05,PAYX,24.750000,26.500000,24.666668,26.125000,4059300,0.0,0.0,1
3,2000-01-06,PAYX,26.000000,27.375000,25.666668,27.083332,2601600,0.0,0.0,1
4,2000-01-07,PAYX,27.041668,27.625000,26.250000,27.625000,1896600,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
6336,2025-03-13,PAYX,144.979996,145.839996,142.880005,143.559998,1977300,0.0,0.0,1
6337,2025-03-14,PAYX,143.360001,145.850006,143.259995,145.440002,1591300,0.0,0.0,1
6338,2025-03-17,PAYX,144.910004,148.440002,144.479996,147.990005,1897000,0.0,0.0,1
6339,2025-03-18,PAYX,146.470001,147.529999,145.619995,146.990005,1351100,0.0,0.0,1


In [14]:
spy_df = build_one_df('data/prices_1_day', 'SPY')

100%|██████████| 1163/1163 [00:10<00:00, 109.52it/s]
100%|██████████| 1163/1163 [04:28<00:00,  4.34it/s]


In [None]:
spy_df

array([  0.99944198,   0.91517901,   0.92857099, ..., 214.        ,
       212.69000244, 215.24000549], shape=(5805,))

In [19]:
spy_df

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits,in_sp500,...,VLTO_Volume,VLTO_Stock Splits,VLTO_in_sp500,JOS_Open,JOS_High,JOS_Low,JOS_Close,JOS_Volume,JOS_Stock Splits,JOS_in_sp500
0,2000-01-03,SPY,148.250000,148.250000,143.875000,145.437500,8164300,0.0,0.0,1,...,,,,,,,,,,
1,2000-01-04,SPY,143.531250,144.062500,139.640625,139.750000,8089800,0.0,0.0,1,...,,,,,,,,,,
2,2000-01-05,SPY,139.937500,141.531250,137.250000,140.000000,12177900,0.0,0.0,1,...,,,,,,,,,,
3,2000-01-06,SPY,139.625000,141.500000,137.750000,137.750000,6227200,0.0,0.0,1,...,,,,,,,,,,
4,2000-01-07,SPY,140.312500,145.750000,140.062500,145.750000,8066500,0.0,0.0,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6336,2025-03-13,SPY,558.489990,559.109985,549.679993,551.419983,74079400,0.0,0.0,1,...,1368100.0,0.0,1.0,,,,,,,
6337,2025-03-14,SPY,556.109985,563.830017,551.489990,562.809998,62660300,0.0,0.0,1,...,2056800.0,0.0,1.0,,,,,,,
6338,2025-03-17,SPY,562.789978,569.710022,562.349976,567.150024,49008700,0.0,0.0,1,...,1189300.0,0.0,1.0,,,,,,,
6339,2025-03-18,SPY,564.799988,565.020020,559.059998,561.020020,66041400,0.0,0.0,1,...,1315800.0,0.0,1.0,,,,,,,


In [21]:
print('Date-range')
print(spy_df['Date'].min(), '--', spy_df['Date'].max())

Date-range
2000-01-03 -- 2025-03-19


In [26]:
missing_pct = (spy_df.isnull().sum()/len(spy_df)).reset_index()
missing_pct.columns = ['Ticker', 'missing_%']

missing_pct = missing_pct[~missing_pct['Ticker'].isin(['Open', 'High', 'Low', 'Close', 'Volume', 'Stock Splits', 'in_sp500'])]
missing_pct['Ticker'] = missing_pct['Ticker'].apply(lambda x: x.split('_')[0])

missing_stats = missing_pct.groupby(by = ['Ticker'])['missing_%'].max().reset_index()

missing_stats = missing_stats.sort_values(by=['missing_%'], ascending=False)
missing_stats

Unnamed: 0,Ticker,missing_%
613,PETM,0.999842
206,CVH,0.999842
644,PVN,0.999842
501,MFE,0.999685
106,BLY,0.999685
...,...,...
82,BALL,0.000000
83,BAX,0.000000
85,BBWI,0.000000
86,BBY,0.000000


In [None]:
## Tickers presented in SPY for the whole period 2000-2025
missing_stats[missing_stats['missing_%'] == 0]

Unnamed: 0,Ticker,missing_%
56,APD,0.0
57,APH,0.0
61,ARE,0.0
64,ASH,0.0
802,UPS,0.0
...,...,...
82,BALL,0.0
83,BAX,0.0
85,BBWI,0.0
86,BBY,0.0


In [None]:
## Tickers presented in SPY for 80% of the period 2000-2025
missing_stats[missing_stats['missing_%'] < 0.2]

Unnamed: 0,Ticker,missing_%
474,LVS,0.196184
522,MPWR,0.193503
338,GLD,0.193345
509,MKTX,0.191926
231,DLR,0.191137
...,...,...
82,BALL,0.000000
83,BAX,0.000000
85,BBWI,0.000000
86,BBY,0.000000


In [35]:
core_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']

sample = spy_df[core_columns + [col for col in spy_df.columns if col.split('_')[0] in ['AAPL', 'TSLA', 'MSFT'] ]]

sample.to_csv('sample.csv', index=False)

sample

Unnamed: 0,Date,Open,High,Low,Close,Volume,AAPL_Open,AAPL_High,AAPL_Low,AAPL_Close,...,MSFT_Volume,MSFT_Stock Splits,MSFT_in_sp500,TSLA_Open,TSLA_High,TSLA_Low,TSLA_Close,TSLA_Volume,TSLA_Stock Splits,TSLA_in_sp500
0,2000-01-03,148.250000,148.250000,143.875000,145.437500,8164300,0.936384,1.004464,0.907924,0.999442,...,53228400,0.0,1,,,,,,,
1,2000-01-04,143.531250,144.062500,139.640625,139.750000,8089800,0.966518,0.987723,0.903460,0.915179,...,54119000,0.0,1,,,,,,,
2,2000-01-05,139.937500,141.531250,137.250000,140.000000,12177900,0.926339,0.987165,0.919643,0.928571,...,64059600,0.0,1,,,,,,,
3,2000-01-06,139.625000,141.500000,137.750000,137.750000,6227200,0.947545,0.955357,0.848214,0.848214,...,54976600,0.0,1,,,,,,,
4,2000-01-07,140.312500,145.750000,140.062500,145.750000,8066500,0.861607,0.901786,0.852679,0.888393,...,62013600,0.0,1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6336,2025-03-13,558.489990,559.109985,549.679993,551.419983,74079400,215.949997,216.839996,208.419998,209.679993,...,20473000,0.0,1,248.130005,248.289993,232.600006,240.679993,114813500.0,0.0,1.0
6337,2025-03-14,556.109985,563.830017,551.489990,562.809998,62660300,211.250000,213.949997,209.580002,213.490005,...,19952800,0.0,1,247.309998,251.580002,240.729996,249.979996,100242300.0,0.0,1.0
6338,2025-03-17,562.789978,569.710022,562.349976,567.150024,49008700,213.309998,215.220001,209.970001,214.000000,...,22474300,0.0,1,245.059998,245.399994,232.800003,238.009995,111900600.0,0.0,1.0
6339,2025-03-18,564.799988,565.020020,559.059998,561.020020,66041400,214.160004,215.149994,211.490005,212.690002,...,19486900,0.0,1,228.160004,230.100006,222.279999,225.309998,111477600.0,0.0,1.0
