In [5]:
from typing import Optional
from pathlib import Path
from datetime import datetime, timedelta

import pandas as pd
import fire
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame

from src.path import DATA_DIR
from src.logger import get_console_logger
import src.config as cfg


In [8]:
# Confirm that a key is configured without echoing the secret itself:
# a bare `cfg.APIkeys.alpaca_api_key` expression would render the key into
# the saved cell output, which travels with the notebook file.
bool(cfg.APIkeys.alpaca_api_key)

In [90]:

logger = get_console_logger(name='dataset_generation')

# Initialize Alpaca client.
# Credentials are read from the project configuration; never paste literal
# keys into a cell, because notebooks are shared and committed together with
# their source and outputs. Any key pair that was ever inlined here should be
# treated as exposed and rotated.
client = StockHistoricalDataClient(
    cfg.APIkeys.alpaca_api_key,
    cfg.APIkeys.alpaca_secret_key,
)
def download_ohlc_data_from_alpaca(
    symbol: Optional[str] = "NYSE",
    from_day: Optional[str] = "2023-01-01",
    to_day: Optional[str] = "2024-01-01",
) -> Path:
    """
    Downloads historical OHLC data from Alpaca API and saves data to disk.

    Every calendar day from `from_day` to `to_day` (both included) is fetched
    with `download_data_for_one_day` and cached as
    `DATA_DIR/downloads/<YYYY-MM-DD>.parquet`. Days whose cache file already
    exists are read from disk instead of being requested again. The daily
    frames are then combined, in day order, into `DATA_DIR/ohlc_data.parquet`.

    Args:
        symbol: Ticker forwarded to `download_data_for_one_day`.
        from_day: First day to fetch, in a format `pd.date_range` accepts
            (e.g. "2023-01-01").
        to_day: Last day to fetch, same format as `from_day`.

    Returns:
        Path of the combined parquet file.
    """
    # one entry per calendar day in the requested range
    days = pd.date_range(start=from_day, end=to_day, freq="1D")

    # create download dir folder if it doesn't exist
    download_dir = DATA_DIR / 'downloads'
    download_dir.mkdir(parents=True, exist_ok=True)

    # Collect the daily frames and concatenate once at the end; concatenating
    # inside the loop re-copies all accumulated rows on every iteration.
    daily_frames = []
    for day in days:
        day_str = day.strftime("%Y-%m-%d")
        # NOTE(review): the cache file name contains only the day, so a run
        # for a different `symbol` reuses whatever was cached for that day.
        # Left unchanged so that existing cache files stay valid.
        file_name = download_dir / f'{day_str}.parquet'
        if file_name.exists():
            logger.info(f'File {file_name} already exists, skipping')
            data_one_day = pd.read_parquet(file_name)
        else:
            logger.info(f'Downloading data for {day_str}')
            data_one_day = download_data_for_one_day(symbol, day_str)
            data_one_day.to_parquet(file_name, index=False)

        daily_frames.append(data_one_day)

    # pd.concat raises on an empty list, so keep the empty-range result an
    # empty frame.
    if daily_frames:
        data = pd.concat(daily_frames, ignore_index=True)
    else:
        data = pd.DataFrame()

    # save data to disk
    output_file = DATA_DIR / "ohlc_data.parquet"
    data.to_parquet(output_file, index=False)

    return output_file

def download_data_for_one_day(symbol: str, day: str) -> pd.DataFrame:
    """
    Downloads one day of hourly bars for `symbol` and returns a pandas DataFrame.

    Args:
        symbol: Ticker to request from Alpaca.
        day: Day to download, formatted as "%Y-%m-%d". The request window runs
            from midnight of `day` to midnight of the following day, built
            from naive datetimes.
            NOTE(review): presumably interpreted as UTC by the API - confirm.

    Returns:
        DataFrame with columns time, low, high, open, close, volume. When the
        API returns no bars for the window, an empty frame with these same
        columns is returned.
    """
    # Convert day string to datetime object
    start = datetime.strptime(day, "%Y-%m-%d")
    end = start + timedelta(days=1)

    # Request data from Alpaca
    request_params = StockBarsRequest(
        symbol_or_symbols=[symbol],
        timeframe=TimeFrame.Hour,
        start=start,
        end=end
    )
    bars = client.get_stock_bars(request_params)

    raw = bars.df
    if raw.empty:
        # No bars for this window (e.g. a day without trading). An empty frame
        # has no 'timestamp' column after reset_index(), so the selection
        # below would raise KeyError. Return a typed, empty frame instead so
        # callers can still save and concatenate it.
        value_columns = ('low', 'high', 'open', 'close', 'volume')
        return pd.DataFrame({
            'time': pd.Series(dtype='datetime64[ns, UTC]'),
            **{name: pd.Series(dtype='float64') for name in value_columns},
        })

    # Transform bars to pandas DataFrame
    data = raw.reset_index()
    # NOTE(review): the exported 'volume' column is filled from Alpaca's
    # 'trade_count' (number of trades), not from the response's own 'volume'
    # column - confirm this is intended before relying on it as share volume.
    return data[['timestamp', 'low', 'high', 'open', 'close', 'trade_count']].rename(
        columns={'timestamp': 'time', 'trade_count': 'volume'}
    )


In [162]:
# Build a one-day window that starts at midnight of the chosen date
start = datetime.strptime("2023-01-04", "%Y-%m-%d")
end = start + timedelta(days=1)

# Describe the request: hourly AAPL bars inside that window
request_params = StockBarsRequest(symbol_or_symbols=['AAPL'], timeframe=TimeFrame.Hour, start=start, end=end)

# Fetch the bars from Alpaca
bars = client.get_stock_bars(request_params)

In [163]:
# Inspect the start of the request window
start

datetime.datetime(2023, 1, 4, 0, 0)

In [164]:
# Inspect the end of the request window (start + 1 day)
end

datetime.datetime(2023, 1, 5, 0, 0)

In [165]:
# Move the index of the returned bars into regular columns to view the raw response
bars.df.reset_index()

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2023-01-04 00:00:00+00:00,124.97,125.12,124.92,125.0,105933.0,1919.0,125.016784
1,AAPL,2023-01-04 09:00:00+00:00,125.9,126.4,125.66,126.24,126668.0,2860.0,126.266366
2,AAPL,2023-01-04 10:00:00+00:00,126.21,126.68,125.91,126.15,69695.0,1734.0,126.264045
3,AAPL,2023-01-04 11:00:00+00:00,126.13,126.5,126.13,126.3,46747.0,1015.0,126.376723
4,AAPL,2023-01-04 12:00:00+00:00,126.25,126.25,125.53,125.86,214166.0,4335.0,125.899086
5,AAPL,2023-01-04 13:00:00+00:00,126.071,126.66,125.38,126.33,717664.0,11496.0,126.215468
6,AAPL,2023-01-04 14:00:00+00:00,126.28,127.555,125.94,126.66,16196514.0,132604.0,126.722349
7,AAPL,2023-01-04 15:00:00+00:00,126.64,127.34,125.08,126.99,15124968.0,141901.0,126.131945
8,AAPL,2023-01-04 16:00:00+00:00,126.96,128.44,126.5,127.77,10840638.0,105808.0,127.529135
9,AAPL,2023-01-04 17:00:00+00:00,127.77,128.6557,127.23,127.23,6641406.0,67275.0,128.007564


In [180]:
# Load one daily parquet file from the downloads folder to inspect its contents
# (presumably written by download_ohlc_data_from_alpaca - verify)
data = pd.read_parquet("../data/downloads/2023-01-09.parquet")

In [181]:
# Number of rows in the loaded file
len(data)

16

In [182]:
# Display the loaded frame
data

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2023-01-09 09:00:00+00:00,129.6,130.5,129.6,130.29,74475.0,1764.0,130.153842
1,AAPL,2023-01-09 10:00:00+00:00,130.26,130.5,130.15,130.25,30779.0,979.0,130.311146
2,AAPL,2023-01-09 11:00:00+00:00,130.21,130.35,130.02,130.22,31943.0,918.0,130.212986
3,AAPL,2023-01-09 12:00:00+00:00,130.25,130.4,130.16,130.25,149821.0,2684.0,130.275261
4,AAPL,2023-01-09 13:00:00+00:00,130.2,130.63,129.955,130.2,427988.0,8029.0,130.263472
5,AAPL,2023-01-09 14:00:00+00:00,130.24,132.06,130.13,132.015,10887101.0,142801.0,131.120139
6,AAPL,2023-01-09 15:00:00+00:00,132.01,133.14,131.96,132.74,12707189.0,118285.0,132.502369
7,AAPL,2023-01-09 16:00:00+00:00,132.74,133.41,132.67,132.945,8730413.0,78932.0,133.047405
8,AAPL,2023-01-09 17:00:00+00:00,132.94,132.955,132.25,132.459,6341391.0,55394.0,132.59708
9,AAPL,2023-01-09 18:00:00+00:00,132.46,132.64,131.19,131.395,7190518.0,66470.0,131.941013
