# Library

In [1]:
import pandas as pd 
import numpy as np
import os
from datetime import datetime
import sys
import json

# Data Import

## Polygon S3

In [2]:
# Read gz file
import gzip

import glob

# Path to the day folder
day_folder = '../polygon_data/day/'

# Find all .csv.gz files in the folder
gz_files = glob.glob(os.path.join(day_folder, '*.csv.gz'))

# Read and concatenate all files
dfs = []
for file in gz_files:
    with gzip.open(file, 'rt') as f:
        dfs.append(pd.read_csv(f))

# Concatenate all DataFrames into one
df = pd.concat(dfs, ignore_index=True)

print(df.head())

  ticker   volume    open    close     high       low         window_start  \
0      A  1021103  119.12  118.150  119.600  118.1500  1755489600000000000   
1     AA  7364613   30.77   29.630   30.945   29.3900  1755489600000000000   
2    AAA     6697   25.00   24.995   25.000   24.9415  1755489600000000000   
3   AAAA     4800   25.83   25.860   25.860   25.8290  1755489600000000000   
4   AAAU  1052562   33.02   32.910   33.020   32.8800  1755489600000000000   

   transactions  
0         19840  
1         61715  
2            61  
3            30  
4          2370  


In [3]:
# Process the data
df['datetime'] = pd.to_datetime(df['window_start'], unit='ns')
df.head()

Unnamed: 0,ticker,volume,open,close,high,low,window_start,transactions,datetime
0,A,1021103,119.12,118.15,119.6,118.15,1755489600000000000,19840,2025-08-18 04:00:00
1,AA,7364613,30.77,29.63,30.945,29.39,1755489600000000000,61715,2025-08-18 04:00:00
2,AAA,6697,25.0,24.995,25.0,24.9415,1755489600000000000,61,2025-08-18 04:00:00
3,AAAA,4800,25.83,25.86,25.86,25.829,1755489600000000000,30,2025-08-18 04:00:00
4,AAAU,1052562,33.02,32.91,33.02,32.88,1755489600000000000,2370,2025-08-18 04:00:00


In [4]:
df = df.sort_values(by=['ticker', 'window_start'], ascending=True)
# Resample to 3-day candles per ticker
df['datetime'] = pd.to_datetime(df['window_start'], unit='ns')
df = df.sort_values(['ticker', 'datetime'])

# Set multi-index for resampling
df.set_index('datetime', inplace=True)

resampled = (
    df.groupby('ticker')
    .resample('3D', level='datetime')
    .agg(
        open=('open', 'first'),
        high=('high', 'max'),
        low=('low', 'min'),
        close=('close', 'last'),
        volume=('volume', 'sum')
    )
    .reset_index()
)

resampled

Unnamed: 0,ticker,datetime,open,high,low,close,volume
0,A,2025-08-01,113.4500,114.3200,111.190,113.5000,1888649
1,A,2025-08-04,113.1700,115.4300,112.010,113.2300,3672925
2,A,2025-08-07,114.5000,115.0000,113.160,114.6200,2287227
3,A,2025-08-10,115.0400,117.3700,113.730,117.3200,1999727
4,A,2025-08-13,118.0600,120.3950,117.860,119.2000,3148435
...,...,...,...,...,...,...,...
208730,ZZZ,2025-08-16,30.0700,30.1607,30.070,30.1607,1252
208731,ZZZ,2025-08-19,30.0000,30.0000,29.610,29.6225,1229
208732,ZZZ,2025-08-22,29.7140,30.2339,29.714,30.2339,1229
208733,ZZZ,2025-08-25,30.0900,30.0900,29.750,30.0329,1310


## Polygon API

In [12]:
import requests
import os

def fetch_meta(symbol: str, api_key: str):
    """
    Fetch metadata for a given symbol from Polygon.io
    
    Args:
        symbol (str): Stock symbol (e.g., "AAPL")
        api_key (str): Your Polygon.io API key
        
    Returns:
        dict or None: Metadata dictionary or None if failed
    """
    try:
        url = f"https://api.polygon.io/v3/reference/tickers/{symbol}"
        params = {'apiKey': api_key}

        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"❌ API request failed with status code {response.status_code}")
            return None

        response_json = response.json()
        return response_json.get('results', None)

    except Exception as e:
        print(f"❌ Error fetching metadata for {symbol}: {e}")
        return None

# Example usage
if __name__ == "__main__":
    result = fetch_meta("AMD", os.environ['POLYGON_API_KEY'])
    print(result)

{'ticker': 'AMD', 'name': 'Advanced Micro Devices', 'market': 'stocks', 'locale': 'us', 'primary_exchange': 'XNAS', 'type': 'CS', 'active': True, 'currency_name': 'usd', 'cik': '0000002488', 'composite_figi': 'BBG000BBQCY0', 'share_class_figi': 'BBG001S5NN36', 'market_cap': 330589487886.19, 'phone_number': '(408) 749-4000', 'address': {'address1': '2485 AUGUSTINE DRIVE', 'city': 'SANTA CLARA', 'state': 'CA', 'postal_code': '95054'}, 'description': "Advanced Micro Devices designs a variety of digital semiconductors for markets such as PCs, gaming consoles, data centers, industrial, and automotive applications. AMD's traditional strength was in central processing units and graphics processing units used in PCs and data centers. Additionally, the firm supplies the chips found in prominent game consoles such as the Sony PlayStation and Microsoft Xbox. In 2022, the firm acquired field-programmable gate array leader Xilinx to diversify its business and augment its opportunities in key end ma

In [11]:
import os
import requests

def get_market_status():
    """
    Get current market status
    Returns a dictionary containing market status information
    """
    try:
        url = f"https://api.polygon.io/v1/marketstatus/now?apiKey={os.environ['POLYGON_API_KEY']}"
        response = requests.get(url)
        result = response.json()
        
        # Parse the response into a dictionary using correct dict access
        market_status = {
            'after_hours': result.get('afterHours'),
            'currencies': {
                'crypto': result.get('currencies', {}).get('crypto'),
                'fx': result.get('currencies', {}).get('fx')
            },
            'early_hours': result.get('earlyHours'),
            'exchanges': {
                'nasdaq': result.get('exchanges', {}).get('nasdaq'),
                'nyse': result.get('exchanges', {}).get('nyse'),
                'otc': result.get('exchanges', {}).get('otc')
            },
            'market': result.get('market'),
            'server_time': result.get('serverTime')
        }
        
        return market_status
        
    except Exception as e:
        print(f"Error getting market status: {e}")
        return {}

get_market_status()

{'after_hours': False,
 'currencies': {'crypto': 'open', 'fx': 'closed'},
 'early_hours': False,
 'exchanges': {'nasdaq': 'closed', 'nyse': 'closed', 'otc': 'closed'},
 'market': 'closed',
 'server_time': '2025-10-04T12:27:32-04:00'}

## Yahoo Finance

In [5]:
# Fetch data from Yahoo Finance
import yfinance as yf

# Define the ticker symvbol
ticker = yf.Ticker('AAPL')

# Get historical data
data = ticker.history(period='max')

# Display the first few rows of the data
print(data.head())

                               Open      High       Low     Close     Volume  \
Date                                                                           
1980-12-12 00:00:00-05:00  0.098485  0.098913  0.098485  0.098485  469033600   
1980-12-15 00:00:00-05:00  0.093775  0.093775  0.093347  0.093347  175884800   
1980-12-16 00:00:00-05:00  0.086924  0.086924  0.086495  0.086495  105728000   
1980-12-17 00:00:00-05:00  0.088636  0.089064  0.088636  0.088636   86441600   
1980-12-18 00:00:00-05:00  0.091206  0.091634  0.091206  0.091206   73449600   

                           Dividends  Stock Splits  
Date                                                
1980-12-12 00:00:00-05:00        0.0           0.0  
1980-12-15 00:00:00-05:00        0.0           0.0  
1980-12-16 00:00:00-05:00        0.0           0.0  
1980-12-17 00:00:00-05:00        0.0           0.0  
1980-12-18 00:00:00-05:00        0.0           0.0  


In [6]:
# Ticker Info
ticker.info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple TV, Apple Watch, Beats products, and HomePod. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download applications and digital content, such as books, music, video, games, and p

In [7]:
# Get Market Cap
print(ticker.info["marketCap"])
# Get Sector
print(ticker.info["sector"])
# Get Industry
print(ticker.info["industry"])

3791126003712
Technology
Consumer Electronics


## FMP

In [22]:
import dotenv
import os

dotenv.load_dotenv()

FMP_API_KEY = os.getenv('FMP_API_KEY')

In [23]:
# HTTP Request for stock historical data
import requests

symbol = "AAPL"
api_key = FMP_API_KEY  # Make sure FMP_API_KEY is loaded from your .env
from_date = "2025-10-01"
to_date = "2025-10-01"
url = f"https://financialmodelingprep.com/stable/historical-price-eod/full?symbol={symbol}&from={from_date}&to={to_date}&apikey={api_key}"

try:
    response = requests.get(url)
    response.raise_for_status()  # Raises HTTPError for bad responses
    data = response.json()
    print(data)
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from FMP API: {e}")



[{'symbol': 'AAPL', 'date': '2025-10-01', 'open': 255.04, 'high': 258.79, 'low': 254.93, 'close': 255.45, 'volume': 48713940, 'change': 0.41, 'changePercent': 0.16076, 'vwap': 256.0525}]


In [33]:
# HTTP Request for stock meta data
import requests

symbol = "AAPL"
api_key = FMP_API_KEY  # Make sure FMP_API_KEY is loaded from your .env
from_date = "2025-10-01"
to_date = "2025-10-01"
url = f"https://financialmodelingprep.com/stable/profile?symbol={symbol}&apikey={api_key}"
try:
    
    response = requests.get(url)
    response.raise_for_status()  # Raises HTTPError for bad responses
    data = response.json()[0]

except requests.exceptions.RequestException as e:
    print(f"Error fetching data from FMP API: {e}")

market_cap = data['marketCap']
industry = data['industry']
sector = data['sector']

print("Market Cap: ", market_cap, "Industry: ", industry, "Sector: ", sector)

Market Cap:  3815909480700 Industry:  Consumer Electronics Sector:  Technology


In [25]:
# HTTP Request for available symbols
import requests

symbol = "AAPL"
api_key = FMP_API_KEY  # Make sure FMP_API_KEY is loaded from your .env
from_date = "2025-10-01"
to_date = "2025-10-01"
url = f"https://financialmodelingprep.com/stable/quote?symbol={symbol}&from={from_date}&to={to_date}&apikey={api_key}"
try:
    
    response = requests.get(url)
    response.raise_for_status()  # Raises HTTPError for bad responses
    data = response.json()
    print(data)
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from FMP API: {e}")



[{'symbol': 'AAPL', 'name': 'Apple Inc.', 'price': 257.13, 'changePercentage': 0.65766, 'change': 1.68, 'volume': 42459369, 'dayLow': 254.15, 'dayHigh': 258.18, 'yearHigh': 260.1, 'yearLow': 169.21, 'marketCap': 3815909480700, 'priceAvg50': 230.9796, 'priceAvg200': 222.0911, 'exchange': 'NASDAQ', 'open': 256.59, 'previousClose': 255.45, 'timestamp': 1759435201}]


# DuckDB

In [None]:
import duckdb
conn = duckdb.connect()

# Test resampling logic with your migrated data using read_parquet
try:
    conn.execute("SET s3_region='ca-west-1'")
    conn.execute(f"SET s3_access_key_id='{os.environ.get('AWS_ACCESS_KEY_ID')}'")
    conn.execute(f"SET s3_secret_access_key='{os.environ.get('AWS_SECRET_ACCESS_KEY')}'")


    # Now create the view as before
    conn.execute("""
        CREATE OR REPLACE VIEW s3_ohlcv AS
        SELECT * FROM read_parquet('s3://dev-condvest-datalake/silver/silver_3d/year=2025/month=10/data_3d_202510.parquet');
    """)
    
    # Check the data structure first
    result = conn.execute("""
        SELECT COUNT(*) as total_rows, 
            MIN(ts) as min_time, 
            MAX(ts) as max_time,
            COUNT(DISTINCT symbol) as unique_symbols
        FROM s3_ohlcv;
    """).fetchall()
    
    print("Data summary:")
    for row in result:
        print(f"Total rows: {row[0]}")
        print(f"Time range: {row[1]} to {row[2]}")
        print(f"Unique symbols: {row[3]}")
    
    
except Exception as e:
    print(f"❌ Error in resampling: {e}\n")
    print("Tip: Make sure you have set the correct AWS S3 region using DuckDB's SET s3_region statement.")



Data summary:
Total rows: 5350
Time range: 2025-10-09 05:11:14.805275 to 2025-10-09 05:22:45.687542
Unique symbols: 5350
