# Library

In [19]:
import os
import duckdb
import boto3
import pandas as pd
import psycopg2
from dotenv import load_dotenv
import json

load_dotenv()

True

# Data Import

In [20]:
# RDS Crendtials
RDS_HOST = os.environ.get('RDS_HOST')
RDS_PORT = os.environ.get('RDS_PORT')
RDS_DATABASE = os.environ.get('RDS_DATABASE')
RDS_USER = os.environ.get('RDS_USER')
RDS_PASSWORD = os.environ.get('RDS_PASSWORD')
# RDS Postgres Connection
try:
    rds_conn = psycopg2.connect(
        host=RDS_HOST,
        port=RDS_PORT,
        database=RDS_DATABASE,
        user=RDS_USER,
        password=RDS_PASSWORD,
        sslmode='require'
    )
except Exception as e:
    print(f"Error connecting to RDS: {e}")
    raise

# RDS Postgres Cursor
rds_cursor = rds_conn.cursor()

In [21]:
# Check available tables in the connected database
try:
    rds_cursor.execute("""
        SELECT table_schema, table_name 
        FROM information_schema.tables 
        WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('pg_catalog', 'information_schema')
        ORDER BY table_schema, table_name
    """)
    tables = rds_cursor.fetchall()
    for schema, table in tables:
        print(f"{schema}.{table}")
except Exception as e:
    print(f"Error fetching tables: {e}")
    raise

public.raw_ohlcv
public.silver_13d
public.silver_21d
public.silver_34d
public.silver_3d
public.silver_5d
public.silver_8d
public.symbol_metadata


# Data Processing Query

In [22]:
# Resampling with Postgres for only symbol 'AMD' and timing the processing
sql = f"""
    WITH numbered AS (
        SELECT
            symbol,
            DATE(timestamp) as date,
            open as open,
            high as high,
            low as low,
            close as close,
            volume,
            ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY DATE(timestamp)) AS rn
        FROM raw_ohlcv
        WHERE interval = '1d'
            AND symbol in ('AMD', 'AAPL', 'TSLA','NVDA')
    ),
    grp AS (
        SELECT
            symbol,
            date,
            open,
            high,
            low,
            close,
            volume,
            (rn - 1) / 3 AS grp_id
        FROM numbered
    ),
    aggregated AS (
        SELECT
            symbol,
            grp_id,
            MIN(date) AS start_date,
            MAX(high) AS high,
            MIN(low) AS low,
            SUM(volume) AS volume,
            (array_agg(open ORDER BY date))[1] AS open,
            (array_agg(close ORDER BY date DESC))[1] AS close
        FROM grp
        GROUP BY symbol, grp_id
        HAVING COUNT(*) = 3
    )
    SELECT
        symbol,
        start_date AS date,
        open,
        high,
        low,
        close,
        volume
    FROM aggregated
    ORDER BY symbol, date
"""

# Performance Evaluation

## Postgres

In [23]:
import time

start_time = time.time()
try:
    rds_cursor.execute(sql)
    postgres_resampled_data = rds_cursor.fetchall()
    end_time = time.time()
    print(f"Resampling for AMD took {end_time - start_time:.4f} seconds")
    postgres_resampled_data
except Exception as e:
    print(f"Error resampling data: {e}")
    raise

Resampling for AMD took 3.1968 seconds


## DuckDB

### Install Postgres Extension

In [24]:
# Initialize DuckDB Connection
duckdb_con = duckdb.connect(":default:")
# install postgres extension
duckdb_con.execute('INSTALL postgres')
# load postgres extension
duckdb_con.execute('LOAD postgres')

<duckdb.duckdb.DuckDBPyConnection at 0x107d0abf0>

In [25]:
# Connect to Postgres
# Construct a PostgreSQL URI
postgres_uri = (
    f"postgresql://{RDS_USER}:{RDS_PASSWORD}@{RDS_HOST}:{RDS_PORT}/{RDS_DATABASE}?sslmode=require"
)

# Attach Postgres to DuckDB using the URI
duckdb_con.execute(f"ATTACH '{postgres_uri}' AS postgres (TYPE POSTGRES);")

print("Successfully attached RDS PostgreSQL to DuckDB as 'postgres'.")

BinderException: Binder Error: Failed to attach database: database with name "postgres" already exists

In [None]:
# Execute SQL Query
start_time = time.time()
temp_dat = duckdb_con.execute(
    '''
    SELECT * 
    FROM postgres.public.raw_ohlcv 
    WHERE symbol in ('AMD', 'AAPL', 'TSLA','NVDA') 
    ORDER BY symbol, timestamp;
    '''
    )
end_time = time.time()
print(f"Time taken to fetch data: {end_time - start_time:.4f} seconds")