# Library

In [1]:
import pandas as pd 
import numpy as np
import os
from datetime import datetime
import sys
import json

# Data Import

## Polygon API

In [2]:
import requests
import os

def fetch_meta(symbol: str, api_key: str):
    """
    Fetch metadata for a given symbol from Polygon.io
    
    Args:
        symbol (str): Stock symbol (e.g., "AAPL")
        api_key (str): Your Polygon.io API key
        
    Returns:
        dict or None: Metadata dictionary or None if failed
    """
    try:
        url = f"https://api.polygon.io/v3/reference/tickers/{symbol}"
        params = {'apiKey': api_key}

        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"‚ùå API request failed with status code {response.status_code}")
            return None

        response_json = response.json()
        return response_json.get('results', None)

    except Exception as e:
        print(f"‚ùå Error fetching metadata for {symbol}: {e}")
        return None

# Example usage
if __name__ == "__main__":
    result = fetch_meta("AMD", os.environ['POLYGON_API_KEY'])
    print(result)

{'ticker': 'AMD', 'name': 'Advanced Micro Devices', 'market': 'stocks', 'locale': 'us', 'primary_exchange': 'XNAS', 'type': 'CS', 'active': True, 'currency_name': 'usd', 'cik': '0000002488', 'composite_figi': 'BBG000BBQCY0', 'share_class_figi': 'BBG001S5NN36', 'market_cap': 360497238202.2, 'phone_number': '(408) 749-4000', 'address': {'address1': '2485 AUGUSTINE DRIVE', 'city': 'SANTA CLARA', 'state': 'CA', 'postal_code': '95054'}, 'description': "Advanced Micro Devices designs a variety of digital semiconductors for markets such as PCs, gaming consoles, data centers (including artificial intelligence), industrial, and automotive applications. AMD's traditional strength was in central processing units and graphics processing units used in PCs and data centers. However, AMD is emerging as a prominent player in AI GPUs and related hardware. Additionally, the firm supplies the chips found in prominent game consoles such as the Sony PlayStation and Microsoft Xbox.", 'sic_code': '3674', 'si

In [3]:
import os
import requests

def get_market_status():
    """
    Get current market status
    Returns a dictionary containing market status information
    """
    try:
        url = f"https://api.polygon.io/v1/marketstatus/now?apiKey={os.environ['POLYGON_API_KEY']}"
        response = requests.get(url)
        result = response.json()
        
        # Parse the response into a dictionary using correct dict access
        market_status = {
            'after_hours': result.get('afterHours'),
            'currencies': {
                'crypto': result.get('currencies', {}).get('crypto'),
                'fx': result.get('currencies', {}).get('fx')
            },
            'early_hours': result.get('earlyHours'),
            'exchanges': {
                'nasdaq': result.get('exchanges', {}).get('nasdaq'),
                'nyse': result.get('exchanges', {}).get('nyse'),
                'otc': result.get('exchanges', {}).get('otc')
            },
            'market': result.get('market'),
            'server_time': result.get('serverTime')
        }
        
        return market_status
        
    except Exception as e:
        print(f"Error getting market status: {e}")
        return {}

get_market_status()

{'after_hours': False,
 'currencies': {'crypto': 'open', 'fx': 'closed'},
 'early_hours': False,
 'exchanges': {'nasdaq': 'closed', 'nyse': 'closed', 'otc': 'closed'},
 'market': 'closed',
 'server_time': '2025-12-13T00:27:10-05:00'}

In [11]:
import requests
import os
from datetime import timedelta, datetime

def fetch_ohlcv(symbol: str, api_key: str, multiplier: int, timespan: str, start_date: str, end_date: str):
    """
    Fetch ohlcv for a given symbol from Polygon.io
    
    Args:
        symbol (str): Stock symbol (e.g., "AAPL")
        api_key (str): Your Polygon.io API key
        
    Returns:
        dict or None: Metadata dictionary or None if failed
    """
    try:
        url = f"https://api.polygon.io/v2/aggs/ticker/{symbol}/range/{multiplier}/{timespan}/{start_date}/{end_date}"
        params = {'apiKey': api_key}

        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"‚ùå API request failed with status code {response.status_code}")
            return None

        response_json = response.json()
        return response_json.get('results', None)

    except Exception as e:
        print(f"‚ùå Error fetching metadata for {symbol}: {e}")
        return None

# Example usage
if __name__ == "__main__":
    today = datetime.now().strftime("%Y-%m-%d")
    _5_years_ago = (datetime.now() - timedelta(days=5*365)).strftime("%Y-%m-%d")
    result = fetch_ohlcv("AMD", os.environ['POLYGON_API_KEY'], 1, "day", _5_years_ago, today)
    df = pd.DataFrame(result)
    print(f"Max date: {datetime.fromtimestamp(df['t'].max() / 1000).strftime('%Y-%m-%d')} | Min date: {datetime.fromtimestamp(df['t'].min() / 1000).strftime('%Y-%m-%d')}")

Max date: 2025-12-11 | Min date: 2020-12-13


## Yahoo Finance

In [5]:
# Fetch data from Yahoo Finance
import yfinance as yf

# Define the ticker symvbol
ticker = yf.Ticker('AAPL')

# Get historical data
data = ticker.history(period='max')

# Display the first few rows of the data
print(data.head())

                               Open      High       Low     Close     Volume  \
Date                                                                           
1980-12-12 00:00:00-05:00  0.098389  0.098817  0.098389  0.098389  469033600   
1980-12-15 00:00:00-05:00  0.093684  0.093684  0.093256  0.093256  175884800   
1980-12-16 00:00:00-05:00  0.086839  0.086839  0.086412  0.086412  105728000   
1980-12-17 00:00:00-05:00  0.088550  0.088978  0.088550  0.088550   86441600   
1980-12-18 00:00:00-05:00  0.091118  0.091545  0.091118  0.091118   73449600   

                           Dividends  Stock Splits  
Date                                                
1980-12-12 00:00:00-05:00        0.0           0.0  
1980-12-15 00:00:00-05:00        0.0           0.0  
1980-12-16 00:00:00-05:00        0.0           0.0  
1980-12-17 00:00:00-05:00        0.0           0.0  
1980-12-18 00:00:00-05:00        0.0           0.0  


In [6]:
# Ticker Info
ticker.info

{'address1': 'One Apple Park Way',
 'city': 'Cupertino',
 'state': 'CA',
 'zip': '95014',
 'country': 'United States',
 'phone': '(408) 996-1010',
 'website': 'https://www.apple.com',
 'industry': 'Consumer Electronics',
 'industryKey': 'consumer-electronics',
 'industryDisp': 'Consumer Electronics',
 'sector': 'Technology',
 'sectorKey': 'technology',
 'sectorDisp': 'Technology',
 'longBusinessSummary': 'Apple Inc. designs, manufactures, and markets smartphones, personal computers, tablets, wearables, and accessories worldwide. The company offers iPhone, a line of smartphones; Mac, a line of personal computers; iPad, a line of multi-purpose tablets; and wearables, home, and accessories comprising AirPods, Apple Vision Pro, Apple TV, Apple Watch, Beats products, and HomePod, as well as Apple branded and third-party accessories. It also provides AppleCare support and cloud services; and operates various platforms, including the App Store that allow customers to discover and download app

In [7]:
# Get Market Cap
print(ticker.info["marketCap"])
# Get Sector
print(ticker.info["sector"])
# Get Industry
print(ticker.info["industry"])

4126073421824
Technology
Consumer Electronics


# DuckDB

## Resampled Data Fetching

In [None]:
import duckdb
conn = duckdb.connect()

# Test resampling logic with your migrated data using read_parquet
try:
    conn.execute("SET s3_region='ca-west-1'")
    conn.execute(f"SET s3_access_key_id='{os.environ.get('AWS_ACCESS_KEY_ID')}'")
    conn.execute(f"SET s3_secret_access_key='{os.environ.get('AWS_SECRET_ACCESS_KEY')}'")


    # Now create the view as before
    conn.execute("""
        CREATE VIEW s3_ohlcv AS
        SELECT * 
        FROM read_parquet('s3://dev-condvest-datalake/silver/silver_3d/year=2025/**/*.parquet')
        WHERE ts >= DATE '2025-01-01'
    """)
    
    # Show basic statistics for the s3_ohlcv view
    summary = conn.execute("""
        SELECT 
            COUNT(*) AS total_rows, 
            MIN(ts) AS min_time, 
            MAX(ts) AS max_time,
            MIN(open) AS min_open,
            MAX(open) AS max_open,
            MIN(high) AS min_high,
            MAX(high) AS max_high,
            MIN(low) AS min_low,
            MAX(low) AS max_low,
            MIN(close) AS min_close,
            MAX(close) AS max_close,
            COUNT(DISTINCT symbol) AS unique_symbols
        FROM s3_ohlcv
        WHERE symbol = 'AAPL'
    """).fetchone()

    if summary:
        print("AAPL Data summary:")
        print(f"Total rows: {summary[0]}")
        print(f"Time range: {summary[1]} to {summary[2]}")
        print(f"Unique symbols: {summary[10]}")
        print("Value ranges:")
        print(f"  Open: {summary[3]} to {summary[4]}")
        print(f"  High: {summary[5]} to {summary[6]}")
        print(f"  Low: {summary[7]} to {summary[8]}")
        print(f"  Close: {summary[9]} to {summary[10]}")
    else:
        print("No data found in s3_ohlcv for AAPL analysis.")
        
    
    
except Exception as e:
    print(f"‚ùå Error in resampling: {e}\n")
    print("Tip: Make sure you have set the correct AWS S3 region using DuckDB's SET s3_region statement.")



FDIX Data summary:
Total rows: 68
Time range: 2025-01-02 22:00:00-07:00 to 2025-12-11 05:00:00-07:00
Unique symbols: 278.85
Value ranges:
  Open: 176.77 to 279.095
  High: 198.46 to 280.38
  Low: 168.8 to 275.25
  Close: 187.92 to 278.85


# üîç Historical Backfill Verification

Verify which symbols have been backfilled with full historical data vs symbols that only have recent data.


In [12]:
# Connect to RDS PostgreSQL
import psycopg2
import pandas as pd
from datetime import datetime, timedelta

# RDS Connection (use your credentials)
rds_conn = psycopg2.connect(
    host=os.environ.get('RDS_HOST', 'dev-condvest-db.cfuwoy862i3r.ca-west-1.rds.amazonaws.com'),
    database=os.environ.get('RDS_DATABASE', 'condvest'),
    user=os.environ.get('RDS_USER', 'postgres'),
    password=os.environ.get('RDS_PASSWORD'),  # Set this in your environment
    port=5432
)
print("‚úÖ Connected to RDS PostgreSQL")


‚úÖ Connected to RDS PostgreSQL


In [13]:
# Query 1: Check all symbol types in symbol_metadata
query_types = """
SELECT 
    type, 
    COUNT(*) as count,
    SUM(CASE WHEN LOWER(active) = 'true' THEN 1 ELSE 0 END) as active_count
FROM symbol_metadata 
GROUP BY type 
ORDER BY count DESC;
"""
df_types = pd.read_sql(query_types, rds_conn)
print("üìä Symbol Types in symbol_metadata:")
print(df_types.to_string(index=False))
print(f"\nüìà Total symbols: {df_types['count'].sum()}")


  df_types = pd.read_sql(query_types, rds_conn)


üìä Symbol Types in symbol_metadata:
type  count  active_count
  CS   5351          5351
 ETF   4568          4568
 PFD    443           443
ADRC    395           395
 ETV     76            76
UNIT      1             1

üìà Total symbols: 10834


In [14]:
# Query 2: Find symbols with LIMITED history (< 7 days of OHLCV records)
# These symbols need historical backfill

query_limited_history = """
WITH symbol_record_counts AS (
    SELECT 
        symbol,
        COUNT(*) as record_count,
        MIN(DATE(timestamp)) as earliest_date,
        MAX(DATE(timestamp)) as latest_date
    FROM raw_ohlcv
    WHERE interval = '1d'
    GROUP BY symbol
)
SELECT 
    sm.symbol,
    sm.type,
    sm.name,
    COALESCE(src.record_count, 0) as ohlcv_records,
    src.earliest_date,
    src.latest_date,
    CASE 
        WHEN src.record_count IS NULL THEN 'NO_DATA'
        WHEN src.record_count < 7 THEN 'NEEDS_BACKFILL'
        WHEN src.record_count < 252 THEN 'PARTIAL_YEAR'
        WHEN src.record_count < 1260 THEN 'PARTIAL_5YR'
        ELSE 'FULL_HISTORY'
    END as status
FROM symbol_metadata sm
LEFT JOIN symbol_record_counts src ON sm.symbol = src.symbol
WHERE LOWER(sm.active) = 'true'
ORDER BY src.record_count ASC NULLS FIRST;
"""

df_history = pd.read_sql(query_limited_history, rds_conn)

# Summary by status
print("=" * 60)
print("üìä BACKFILL STATUS SUMMARY")
print("=" * 60)
status_summary = df_history.groupby('status').agg({
    'symbol': 'count',
    'ohlcv_records': 'mean'
}).rename(columns={'symbol': 'symbol_count', 'ohlcv_records': 'avg_records'})
print(status_summary.to_string())

# Count by type for symbols needing backfill
needs_backfill = df_history[df_history['status'].isin(['NO_DATA', 'NEEDS_BACKFILL'])]
print(f"\n‚ö†Ô∏è Symbols needing backfill: {len(needs_backfill)}")
if len(needs_backfill) > 0:
    print("\nBy type:")
    print(needs_backfill.groupby('type')['symbol'].count().to_string())


  df_history = pd.read_sql(query_limited_history, rds_conn)


üìä BACKFILL STATUS SUMMARY
                symbol_count  avg_records
status                                   
FULL_HISTORY            3780  1309.473016
NEEDS_BACKFILL          5020     1.004183
NO_DATA                  288     0.000000
PARTIAL_5YR             1291   859.616576
PARTIAL_YEAR             455   123.092308

‚ö†Ô∏è Symbols needing backfill: 5308

By type:
type
ADRC      13
CS       327
ETF     4452
ETV       76
PFD      439
UNIT       1


In [15]:
# Query 3: Show sample of symbols needing backfill (first 20)
print("=" * 60)
print("üìã Sample Symbols Needing Backfill (first 20)")
print("=" * 60)

if len(needs_backfill) > 0:
    sample = needs_backfill.head(20)[['symbol', 'type', 'name', 'ohlcv_records', 'status']]
    print(sample.to_string(index=False))
else:
    print("‚úÖ All symbols have sufficient historical data!")


üìã Sample Symbols Needing Backfill (first 20)
symbol type                                                                         name  ohlcv_records  status
  OAKI  ETF                                          Oakmark International Large Cap ETF              0 NO_DATA
  MBNE  ETF                                   State Street Nuveen Municipal Bond ESG ETF              0 NO_DATA
  RMME  ETF                                         Rareview Government Money Market ETF              0 NO_DATA
  SNAV  ETF                                                          Mohr Sector Nav ETF              0 NO_DATA
  KCSH  ETF                       KraneShares Sustainable Ultra Short Duration Index ETF              0 NO_DATA
  TENM  ETF                                  iShares Large Cap 10% Target Buffer Mar ETF              0 NO_DATA
   SIM ADRC                                                  Groupo Simec, S.A.B de C.V.              0 NO_DATA
 STRRP  PFD Star Equity Holdings, Inc. 10% Series A Cumu

In [None]:
# Query 4: Get list of symbols to backfill (for Lambda invocation)
symbols_to_backfill = needs_backfill['symbol'].tolist()

print(f"üìä Total symbols needing backfill: {len(symbols_to_backfill)}")

# If you want to trigger backfill for specific symbols, use this JSON payload:
if len(symbols_to_backfill) > 0 and len(symbols_to_backfill) <= 500:
    import json
    payload = {
        "historical_backfill": True,
        "years_back": 5,
        "symbols": symbols_to_backfill,
        "skip_market_check": True
    }
    print(f"\nüöÄ Lambda payload for backfill ({len(symbols_to_backfill)} symbols):")
    print(json.dumps(payload, indent=2)[:500] + "...")
else:
    print(f"\n‚ö†Ô∏è Too many symbols ({len(symbols_to_backfill)}) - run in batches or use auto-detection")


In [16]:
# Query 5: Check backfill status specifically for NEW types (ETF, ADRC, ETV, PFD)
# These are the types we just added support for

query_new_types = """
WITH symbol_record_counts AS (
    SELECT symbol, COUNT(*) as record_count
    FROM raw_ohlcv
    WHERE interval = '1d'
    GROUP BY symbol
)
SELECT 
    sm.type,
    COUNT(*) as total_symbols,
    SUM(CASE WHEN COALESCE(src.record_count, 0) < 7 THEN 1 ELSE 0 END) as needs_backfill,
    SUM(CASE WHEN COALESCE(src.record_count, 0) >= 1200 THEN 1 ELSE 0 END) as has_full_history,
    ROUND(AVG(COALESCE(src.record_count, 0)), 1) as avg_records
FROM symbol_metadata sm
LEFT JOIN symbol_record_counts src ON sm.symbol = src.symbol
WHERE LOWER(sm.active) = 'true'
  AND sm.type IN ('ETF', 'ADRC', 'ETV', 'PFD')
GROUP BY sm.type
ORDER BY needs_backfill DESC;
"""

df_new_types = pd.read_sql(query_new_types, rds_conn)
print("=" * 60)
print("üìä NEW ASSET TYPES BACKFILL STATUS (ETF, ADRC, ETV, PFD)")
print("=" * 60)
print(df_new_types.to_string(index=False))


  df_new_types = pd.read_sql(query_new_types, rds_conn)


üìä NEW ASSET TYPES BACKFILL STATUS (ETF, ADRC, ETV, PFD)
type  total_symbols  needs_backfill  has_full_history  avg_records
 ETF           4568            4452                15         13.5
 PFD            443             439                 1          8.5
 ETV             76              76                 0          1.0
ADRC            395              13               297       1117.0
