In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from polygon import RESTClient

from tqdm import tqdm
from dotenv import load_dotenv
import os


# Setup

In [2]:
def get_intervals(start_date:str=None, end_date:str=None) -> list[tuple[str, str]]:
    '''
    Splits a date range into consecutive windows of at most 365 days.

    Parameters
    ----------
    start_date : str, optional
        First day of the range, formatted as "%Y-%m-%d".
        Defaults to 10 * 365 days before the end of the range.
    end_date : str, optional
        Last day of the range (inclusive), formatted as "%Y-%m-%d".
        Defaults to the current date.

    Returns
    -------
    list[tuple[str, str]]
        (first_day, last_day) pairs formatted as "%Y-%m-%d", in chronological
        order. Every window spans 365 days except possibly the last one,
        which holds whatever remains so that the whole range is covered.
        Empty when start_date is later than end_date.

    Raises
    ------
    ValueError
        If a supplied date string does not match "%Y-%m-%d".
    '''
    period_years, year_size = 10, 365
    date_format = "%Y-%m-%d"

    # Resolve each bound independently, so passing only one of them works
    range_end = datetime.now() if end_date is None else datetime.strptime(end_date, date_format)
    if start_date is None:
        range_start = range_end - timedelta(days=period_years * year_size)
    else:
        range_start = datetime.strptime(start_date, date_format)

    days = pd.date_range(start=range_start, end=range_end)

    intervals = []
    for first in range(0, len(days), year_size):
        # Clamp the final window to the end of the range instead of dropping it
        last = min(first + year_size, len(days)) - 1
        intervals.append((days[first].strftime(date_format), days[last].strftime(date_format)))
    return intervals


def get_tickers():
    '''
    Reads the table with id "constituents" from the Wikipedia page
    "List of S&P 500 companies".

    Returns a tuple of:
      - the values of the table's 'Symbol' column as a list, with 'SPY' appended
      - the table itself as a DataFrame
    '''
    constituents_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # read_html returns a list of matching tables; the first match is used
    matching_tables = pd.read_html(constituents_url, attrs={'id':'constituents'})
    constituents = matching_tables[0]

    symbols = [*constituents['Symbol'].to_list(), 'SPY']
    return symbols, constituents



In [3]:
# Load environment variables (presumably from a local .env file), so that PTOKEN can be read below
load_dotenv()

# Directory the per-ticker CSV files are written to
data_dir = 'data'

# Data Range settings
# A single snapshot of the current time keeps start_date and end_date consistent
# with each other even when the cell runs across midnight
now = datetime.now()
end_date = now.strftime("%Y-%m-%d")
start_date = (now-timedelta(days= 10 * 365)).strftime("%Y-%m-%d")

sp_500_tickers, sp_500_wiki_info = get_tickers()

# Polygon Client settings
# timespan and multiplier are passed to client.list_aggs for every ticker
timespan = 'minute'
multiplier = 30
# The API token is read from the PTOKEN environment variable (KeyError if it is not set)
client = RESTClient(os.environ['PTOKEN'])

# Data Collection

In [4]:
def collect_tickers_history(tickers:list, interval:tuple, timespan:str, multiplier:int):
    '''
    Downloads aggregates for every ticker and stores them as one CSV per ticker.

    Uses the module-level `client` for the requests and the module-level
    `data_dir` as the output directory (created if it does not exist).
    A ticker whose result file already exists is skipped, so an interrupted
    run can be resumed by calling the function again.

    Parameters
    ----------
    tickers : list
        Ticker symbols to download.
    interval : tuple
        Two-element sequence (from, to); element 0 is passed to list_aggs as
        `from_`, element 1 as `to`.
    timespan : str
        Passed to list_aggs as `timespan`; also part of the file name.
    multiplier : int
        Passed to list_aggs as `multiplier`; also part of the file name.
    '''
    # to_csv does not create missing directories
    os.makedirs(data_dir, exist_ok=True)

    for ticker in tqdm(tickers):

        result_file_name = f'{data_dir}/{ticker}_{multiplier}_{timespan}_stock_prices.csv'

        # An existing file marks the ticker as already downloaded
        if os.path.isfile(result_file_name):
            continue

        batch = client.list_aggs(
            ticker = ticker,
            multiplier = multiplier,
            timespan = timespan,
            from_ = interval[0],
            to = interval[1]
            )

        batch_df = pd.DataFrame(list(batch))
        batch_df['ticker'] = ticker

        # Write to a temporary name first and rename afterwards: an interrupted
        # write must not leave a truncated file that later runs would skip
        partial_file_name = f'{result_file_name}.part'
        batch_df.to_csv(partial_file_name, index=False)
        os.replace(partial_file_name, result_file_name)


In [5]:
# Download the history of every collected ticker for the configured date range.
# Tickers whose CSV file already exists are skipped by collect_tickers_history.
collect_tickers_history(
    tickers=sp_500_tickers,
    interval=(start_date, end_date),
    timespan=timespan,
    multiplier=multiplier,
)

  0%|          | 0/504 [00:00<?, ?it/s]

100%|██████████| 504/504 [4:07:19<00:00, 29.44s/it]  
