# Library

In [49]:
import pandas as pd 
import numpy as np
import os
from datetime import datetime
import sys
import json

# Data Import

## Polygon S3

In [24]:
# Read gz file
import gzip

import glob

# Path to the day folder
day_folder = '../polygon_data/day/'

# Find all .csv.gz files in the folder
gz_files = glob.glob(os.path.join(day_folder, '*.csv.gz'))

# Read and concatenate all files
dfs = []
for file in gz_files:
    with gzip.open(file, 'rt') as f:
        dfs.append(pd.read_csv(f))

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

print(df.head())

  ticker   volume    open    close     high       low         window_start  \
0      A  1021103  119.12  118.150  119.600  118.1500  1755489600000000000   
1     AA  7364613   30.77   29.630   30.945   29.3900  1755489600000000000   
2    AAA     6697   25.00   24.995   25.000   24.9415  1755489600000000000   
3   AAAA     4800   25.83   25.860   25.860   25.8290  1755489600000000000   
4   AAAU  1052562   33.02   32.910   33.020   32.8800  1755489600000000000   

   transactions  
0         19840  
1         61715  
2            61  
3            30  
4          2370  


In [25]:
# Process the data
df['datetime'] = pd.to_datetime(df['window_start'], unit='ns')
df.head()

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions,datetime
0,A,1021103,119.12,118.15,119.6,118.15,1755489600000000000,19840,2025-08-18 04:00:00
1,AA,7364613,30.77,29.63,30.945,29.39,1755489600000000000,61715,2025-08-18 04:00:00
2,AAA,6697,25.0,24.995,25.0,24.9415,1755489600000000000,61,2025-08-18 04:00:00
3,AAAA,4800,25.83,25.86,25.86,25.829,1755489600000000000,30,2025-08-18 04:00:00
4,AAAU,1052562,33.02,32.91,33.02,32.88,1755489600000000000,2370,2025-08-18 04:00:00


In [27]:
df = df.sort_values(by=['ticker', 'window_start'], ascending=True)
# Resample to 3-day candles per ticker
df['datetime'] = pd.to_datetime(df['window_start'], unit='ns')
df = df.sort_values(['ticker', 'datetime'])

# Set multi-index for resampling
df.set_index('datetime', inplace=True)

resampled = (
    df.groupby('ticker')
    .resample('3D', level='datetime')
    .agg(
        open=('open', 'first'),
        high=('high', 'max'),
        low=('low', 'min'),
        close=('close', 'last'),
        volume=('volume', 'sum')
    )
    .reset_index()
)

resampled



Unnamed: 0,ticker,datetime,open,high,low,close,volume
0,A,2025-08-01,113.4500,114.3200,111.190,113.5000,1888649
1,A,2025-08-04,113.1700,115.4300,112.010,113.2300,3672925
2,A,2025-08-07,114.5000,115.0000,113.160,114.6200,2287227
3,A,2025-08-10,115.0400,117.3700,113.730,117.3200,1999727
4,A,2025-08-13,118.0600,120.3950,117.860,119.2000,3148435
...,...,...,...,...,...,...,...
208730,ZZZ,2025-08-16,30.0700,30.1607,30.070,30.1607,1252
208731,ZZZ,2025-08-19,30.0000,30.0000,29.610,29.6225,1229
208732,ZZZ,2025-08-22,29.7140,30.2339,29.714,30.2339,1229
208733,ZZZ,2025-08-25,30.0900,30.0900,29.750,30.0329,1310


## Yahoo Finance

In [50]:
# Fetch data from Yahoo Finance
import yfinance as yf

# Define the ticker symvbol
ticker = yf.Ticker('AAPL')

# Get historical data
data = ticker.history(period='max')

# Display the first few rows of the data
print(data.head())

                               Open      High       Low     Close     Volume  \
Date                                                                           
1980-12-12 00:00:00-05:00  0.098485  0.098913  0.098485  0.098485  469033600   
1980-12-15 00:00:00-05:00  0.093775  0.093775  0.093347  0.093347  175884800   
1980-12-16 00:00:00-05:00  0.086924  0.086924  0.086495  0.086495  105728000   
1980-12-17 00:00:00-05:00  0.088636  0.089064  0.088636  0.088636   86441600   
1980-12-18 00:00:00-05:00  0.091206  0.091634  0.091206  0.091206   73449600   

                           Dividends  Stock Splits  
Date                                                
1980-12-12 00:00:00-05:00        0.0           0.0  
1980-12-15 00:00:00-05:00        0.0           0.0  
1980-12-16 00:00:00-05:00        0.0           0.0  
1980-12-17 00:00:00-05:00        0.0           0.0  
1980-12-18 00:00:00-05:00        0.0           0.0  


In [51]:
# Ticker Info
ticker.info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and p

In [53]:
# Get Market Cap
print(ticker.info["marketCap"])
# Get Sector
print(ticker.info["sector"])
# Get Industry
print(ticker.info["industry"])

3791126003712
Technology
Consumer Electronics
