In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from polygon import RESTClient

from tqdm import tqdm
from dotenv import load_dotenv
import os


# Setup

In [2]:
def get_intervals(start_date:str=None, end_date:str=None) -> list[tuple[str, str]]:
    '''
    Splits a date range into consecutive chunks of at most 365 days.

    Parameters:
        start_date: first day of the range as "YYYY-MM-DD". When omitted,
            the range starts 10 * 365 days before the end of the range.
        end_date: last day of the range as "YYYY-MM-DD". When omitted,
            the current local date and time is used.

    Returns:
        List of (first_day, last_day) tuples of "YYYY-MM-DD" strings, both
        ends inclusive, in chronological order. The last chunk may be
        shorter than 365 days; the list is empty when start_date is after
        end_date.

    Raises:
        ValueError: if a supplied date does not match "%Y-%m-%d".
    '''
    period_years, year_size = 10, 365
    date_format = "%Y-%m-%d"

    # Resolve each bound on its own so that passing only one of them works:
    # a given start_date is no longer overwritten when end_date is missing,
    # and a missing start_date no longer reaches strptime as None.
    if end_date is None:
        range_end = datetime.now()
    else:
        range_end = datetime.strptime(end_date, date_format)

    if start_date is None:
        range_start = range_end - timedelta(days=period_years * year_size)
    else:
        range_start = datetime.strptime(start_date, date_format)

    days = pd.date_range(start=range_start, end=range_end)

    intervals = []
    for first in range(0, len(days), year_size):
        # Clamp the chunk end so the trailing partial chunk is kept instead of
        # being dropped (which also left ranges shorter than a year empty).
        last = min(first + year_size, len(days)) - 1
        intervals.append((days[first].strftime(date_format), days[last].strftime(date_format)))
    return intervals


def get_tickers():
    '''
    Reads the S&P 500 constituents table from Wikipedia.

    Returns a tuple (tickers, table): the values of the table's 'Symbol'
    column as a list with 'SPY' appended at the end, and the table itself
    as a DataFrame.
    '''
    wiki_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # read_html returns every table matching attrs; take the first match.
    matching_tables = pd.read_html(wiki_url, attrs={'id':'constituents'})
    constituents = matching_tables[0]

    symbols = constituents['Symbol'].to_list()
    symbols.append('SPY')
    return symbols, constituents



In [3]:
# Load variables from a .env file into the environment (PTOKEN is read below).
load_dotenv()

# Folder the per-ticker CSV files are written to by collect_tickers_history.
data_dir = 'data'

# Data Range settings
# Read the clock once so that start_date and end_date are derived from the
# same instant (two separate now() calls can straddle midnight).
now = datetime.now()
end_date = now.strftime("%Y-%m-%d")
start_date = (now-timedelta(days= 10 * 365)).strftime("%Y-%m-%d")

sp_500_tickers, sp_500_wiki_info = get_tickers()

# Polygon Client settings
timespan = 'minute'
multiplier = 30
# Raises KeyError if PTOKEN is not set in the environment.
client = RESTClient(os.environ['PTOKEN'])

# Data Collection

In [4]:
def collect_tickers_history(tickers:list, interval:list, timespan:str, multiplier:int):
    '''
    Downloads aggregate bars for each ticker and stores one CSV per ticker.

    Files are written into the module-level `data_dir` as
    `{ticker}_{multiplier}_{timespan}_stock_prices.csv`, using the
    module-level `client`. A ticker whose file already exists is skipped,
    so an interrupted run can be restarted without repeating finished
    tickers.

    Parameters:
        tickers: ticker symbols to download.
        interval: pair (from, to); passed to `client.list_aggs` as `from_` and `to`.
        timespan: passed through to `client.list_aggs` as `timespan`.
        multiplier: passed through to `client.list_aggs` as `multiplier`.
    '''
    # Create the output folder up front; without it the first to_csv call
    # fails only after the API request has already been made.
    os.makedirs(data_dir, exist_ok=True)

    for ticker in tqdm(tickers):

        result_file_name = f'{data_dir}/{ticker}_{multiplier}_{timespan}_stock_prices.csv'

        # Already downloaded on a previous run.
        if os.path.isfile(result_file_name):
            continue

        # Materialize the whole response before building the frame.
        bars = list(client.list_aggs(
            ticker = ticker,
            multiplier = multiplier,
            timespan = timespan,
            from_ = interval[0],
            to = interval[1]
            ))

        # NOTE(review): an empty response still produces a file (with only the
        # 'ticker' column), so that ticker is skipped on later runs - confirm
        # this is intended.
        batch_df = pd.DataFrame(bars)
        batch_df['ticker'] = ticker
        batch_df.to_csv(result_file_name, index=False)


In [5]:
# Download history for every ticker over [start_date, end_date].
# Tickers whose CSV already exists in data_dir are skipped.
collect_tickers_history(
    tickers=sp_500_tickers,
    interval=(start_date, end_date),
    timespan=timespan,
    multiplier=multiplier,
)

  0%|          | 0/504 [00:00<?, ?it/s]

100%|██████████| 504/504 [4:07:19<00:00, 29.44s/it]  


In [2]:
# Load the AAPL 30-minute CSV for inspection.
# The path is built from data_dir instead of a user-specific absolute path,
# so the cell is not tied to one machine.
# NOTE(review): assumes the kernel's working directory is the project root,
# i.e. data_dir is the folder that contains prices_30_min - confirm.
df = pd.read_csv(os.path.join(data_dir, 'prices_30_min', 'AAPL_30_minute_stock_prices.csv'))
df

Unnamed: 0,open,high,low,close,volume,vwap,timestamp,transactions,otc,ticker
0,31.5950,31.6425,31.5825,31.5925,25760.0,31.5954,1427097600000,23,,AAPL
1,31.5925,31.6475,31.5750,31.6475,10500.0,31.6125,1427099400000,11,,AAPL
2,31.6475,31.6625,31.6325,31.6625,12780.0,31.6484,1427101200000,15,,AAPL
3,31.6700,31.6850,31.6350,31.6850,9760.0,31.6598,1427103000000,24,,AAPL
4,31.6600,31.6750,31.6350,31.6450,54844.0,31.6556,1427104800000,32,,AAPL
...,...,...,...,...,...,...,...,...,...,...
79229,212.7600,213.0000,212.7500,212.7835,5854.0,212.8193,1742333400000,104,,AAPL
79230,212.8800,212.8800,212.6037,212.6037,9727.0,212.7563,1742335200000,229,,AAPL
79231,212.6000,212.9900,212.3700,212.8300,8418.0,212.6877,1742337000000,174,,AAPL
79232,212.7500,212.9000,212.6000,212.8000,6812.0,212.7825,1742338800000,109,,AAPL
