## Library Import

In [1]:
import duckdb
import polars as pl
import dotenv
import os

## Load Silver Data

In [2]:
# Open the Silver-layer DuckDB file and smoke-test the connection.
#
# Load variables from a local .env file (if one exists) so DUCKDB_FILE is
# available on a fresh kernel. Values already set in the environment win.
dotenv.load_dotenv()

try:
    db_path = os.getenv("DUCKDB_FILE")
    print(f"Attempting to connect to: {db_path}")

    # Fail with a clear message instead of handing None/"" to duckdb.connect.
    if not db_path:
        raise RuntimeError("DUCKDB_FILE environment variable is not set")

    # Connect with the exact path reported above (no second env lookup).
    con = duckdb.connect(db_path)

    # Test connection with a simple query
    result = con.execute("SELECT 1").fetchone()
    print("Connection Succesfful!")
    print(f"Test query result: {result}")

    # List all tables in the database
    tables = con.execute("SHOW TABLES").fetchall()
    print("\nAvailable tables:", tables)

except Exception as e:
    print(f"Connection Failed {str(e)}")
    # Re-raise so the notebook stops here: every later cell needs `con`, and
    # swallowing the error would only surface later as a NameError.
    raise

Attempting to connect to: /Users/yiukitcheung/Desktop/data_pipeline/storage/silver/resampled.db
Connection Succesfful!
Test query result: (1,)

Available tables: [('gold_data',), ('raw_data',), ('silver_1',), ('silver_13',), ('silver_2',), ('silver_21',), ('silver_3',), ('silver_34',), ('silver_5',), ('silver_55',), ('silver_8',)]


In [3]:
# Show every table available through the open connection
show_tables_query = con.execute("SHOW TABLES")
tables = show_tables_query.fetchall()
print("Available tables:", tables)

Available tables: [('gold_data',), ('raw_data',), ('silver_1',), ('silver_13',), ('silver_2',), ('silver_21',), ('silver_3',), ('silver_34',), ('silver_5',), ('silver_55',), ('silver_8',)]


In [4]:
# Inspect the column definitions of the silver_1 table
describe_query = con.execute("DESCRIBE silver_1")
table_info = describe_query.fetchall()
print("Table schema:", table_info)

Table schema: [('symbol', 'VARCHAR', 'YES', None, None, None), ('date', 'TIMESTAMP WITH TIME ZONE', 'YES', None, None, None), ('open', 'DOUBLE', 'YES', None, None, None), ('high', 'DOUBLE', 'YES', None, None, None), ('low', 'DOUBLE', 'YES', None, None, None), ('close', 'DOUBLE', 'YES', None, None, None), ('volume', 'DOUBLE', 'YES', None, None, None)]


In [5]:
# Peek at a handful of rows from silver_1 (at most 5)
sample_query = con.execute("SELECT * FROM silver_1 LIMIT 5")
sample_data = sample_query.fetchall()
print("Sample data:", sample_data)

Sample data: [('TANH', datetime.datetime(2024, 11, 8, 0, 0, tzinfo=<DstTzInfo 'America/Edmonton' MST-1 day, 17:00:00 STD>), 6.4000000954, 6.4000000954, 5.8800001144, 6.0399999619, 81243.0), ('TANH', datetime.datetime(2024, 11, 25, 0, 0, tzinfo=<DstTzInfo 'America/Edmonton' MST-1 day, 17:00:00 STD>), 5.9600000381, 6.4000000954, 5.8800001144, 5.9600000381, 24653.0), ('TANH', datetime.datetime(2025, 1, 24, 0, 0, tzinfo=<DstTzInfo 'America/Edmonton' MST-1 day, 17:00:00 STD>), 6.4400000572, 6.8000001907, 6.2399997711, 6.6799998283, 45125.0), ('VATE', datetime.datetime(2009, 9, 17, 0, 0, tzinfo=<DstTzInfo 'America/Edmonton' MDT-1 day, 18:00:00 DST>), -4.8022216485, -4.9022679329, -4.8022216485, -4.9022679329, 2280.0), ('VATE', datetime.datetime(2009, 11, 16, 0, 0, tzinfo=<DstTzInfo 'America/Edmonton' MST-1 day, 17:00:00 STD>), -4.5687808242, -4.6021295893, -4.5087529451, -4.595459938, 2640.0)]


In [6]:
# Get total row count.
# fetchone() returns a one-element tuple; `row_count` keeps that shape for
# anything that already relies on it, and only the scalar is printed so the
# output reads as a number rather than a tuple.
row_count = con.execute("SELECT COUNT(*) FROM silver_1").fetchone()
print("Total rows:", row_count[0])

Total rows: (6159779,)


In [None]:
# Get the 5 earliest rows of the table.
# LIMIT without ORDER BY returns an arbitrary set of rows, so sort explicitly;
# symbol acts as a tie-breaker for rows that share the same date.
first_5_rows = con.execute(
    "SELECT * FROM silver_1 ORDER BY date ASC, symbol ASC LIMIT 5"
).fetchall()
print("First 5 rows:", first_5_rows)
# Get the 5 latest rows of the table (same ordering, reversed)
last_5_rows = con.execute(
    "SELECT * FROM silver_1 ORDER BY date DESC, symbol DESC LIMIT 5"
).fetchall()
print("Last 5 rows:", last_5_rows)


In [None]:
# Per-symbol summary statistics of the `open` column:
# row count, mean, minimum and maximum for each symbol in silver_1.
stats = con.execute("""
    SELECT 
        symbol,
        COUNT(*) as count,
        AVG(open) as avg,
        MIN(open) as min,
        MAX(open) as max
    FROM silver_1
    GROUP BY symbol
    ORDER BY symbol
""").fetchall()
# ORDER BY keeps the row order stable between runs. `stats` still holds one
# tuple per symbol; only a short preview is printed to keep the output small.
print(f"Column statistics ({len(stats)} symbols, first 5 shown):", stats[:5])

In [None]:
# Turn the per-symbol stats (a list of row tuples) into a Polars DataFrame
# with an explicit dtype for every column.
# orient="row" states that each tuple is one row, so Polars does not have to
# guess the orientation of the nested sequence (and does not warn about it).
stats_df = pl.DataFrame(
    stats,
    schema={
        "symbol": pl.Utf8,
        "count": pl.Int64,
        "avg": pl.Float64,
        "min": pl.Float64,
        "max": pl.Float64,
    },
    orient="row",
)
print(stats_df)

In [10]:
# Close the DuckDB connection that was opened at the top of the notebook.
con.close()
