In [15]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from polygon import RESTClient

from tqdm import tqdm
from dotenv import load_dotenv
import os


# Setup

In [16]:
def get_intervals(start_date:str=None, end_date:str=None) -> list[tuple[str]]:
    '''
    Creates list of dateranges
    '''
    period_years, year_size = 10, 365
    
    if end_date is None:
        end_date = datetime.now()
        start_date = end_date-timedelta(days=period_years * year_size)
    else:
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = datetime.strptime(end_date, "%Y-%m-%d")

    datarange = pd.date_range(start=start_date, end=end_date)
    intervals = [(datarange[i].strftime("%Y-%m-%d"), datarange[i+year_size-1].strftime("%Y-%m-%d")) for i in range(0, len(datarange)-year_size+1, year_size)]
    return intervals


def get_tickers():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    sp_500_wiki_info = pd.read_html(url, attrs={'id':'constituents'})[0]
    sp_500_tickers = ['SPY'] + sp_500_wiki_info['Symbol'].to_list()
    return sp_500_tickers, sp_500_wiki_info



In [None]:
load_dotenv()

data_dir = 'data'

# Data Range settings
intervals = get_intervals(start_date=None, end_date=None)
sp_500_tickers, sp_500_wiki_info = get_tickers()

# Polygon Client settings
timespan = 'minute'
client = RESTClient(os.environ['PTOKEN'])

In [18]:
intervals

[('2020-03-21', '2021-03-20'),
 ('2021-03-21', '2022-03-20'),
 ('2022-03-21', '2023-03-20'),
 ('2023-03-21', '2024-03-19')]

# Data Collection

In [None]:
def collect_tickers_history(tickers:list, intervals:list):

    for ticker in tqdm(tickers):
        frames = []
        for interval in intervals:
            print(interval)
            batch = client.list_aggs(
                ticker = ticker,
                multiplier = 1,
                timespan = timespan,
                from_ = interval[0],
                to = interval[1]
                )
            batch_df = pd.DataFrame([_ for _ in batch])
            frames.append(batch_df)

        ticker_history_df = pd.concat(frames).drop_duplicates(ignore_index=True)
        ticker_history_df['ticker'] = ticker
        ticker_history_df.to_csv(f'{data_dir}/{ticker}_{timespan}_stock_prices.csv', index=False)


collect_tickers_history(sp_500_tickers[:100], intervals)

  0%|          | 0/2 [00:00<?, ?it/s]

('2020-03-21', '2021-03-20')
