Crypto Data Loading with Snowpark

#------------------------------------------------------------------------------
# Data Engineering with Snowpark for Cryptocurrency Data
# Script:       crypto_data_loader.py
# Last Updated: February 27, 2025
#------------------------------------------------------------------------------

In [None]:
# Map each crypto ticker (also used as the table name) to the CSV file
# that holds its raw daily data in the stage.
CRYPTO_TABLES = {
    ticker: f"{ticker}_raw_daily.csv"
    for ticker in ("BTC", "DOGE", "ETH")
}

In [None]:
# SNOWFLAKE ADVANTAGE: Schema detection
# SNOWFLAKE ADVANTAGE: Data ingestion with COPY

def load_raw_table(session, table_name, file_name, schema="RAW_CRYPTO"):
    """
    Load one CSV file from the S3 stage into a Snowflake table.

    The file is read with the "header" and "infer_schema" reader options,
    the date column is cast with to_date and the price/volume columns with
    to_double, and the result overwrites ``{schema}.{table_name}``.

    Parameters:
    session: Snowpark session
    table_name: Table name (BTC, DOGE, ETH)
    file_name: File name in the stage (e.g., BTC_raw_daily.csv)
    schema: Schema name

    Returns:
    Row count of the target table after the load.
    """
    # Imported locally so the function also works when this file is imported
    # as a module; the script-level import only runs under __main__.
    import snowflake.snowpark.functions as F

    print(f"Loading {table_name} from {file_name}")
    session.use_schema(schema)

    # Define the FULLY QUALIFIED stage path to the file
    stage_path = f"@CRYPTO_DB.INTEGRATIONS.CRYPTO_RAW_STAGE/{file_name}"

    # Read the CSV file from the stage
    reader = session.read.option("header", True).option("infer_schema", True)
    df = reader.csv(stage_path)

    # Print column names to debug
    print(f"Columns in dataframe: {df.columns}")

    # Case-insensitive lookup from plain column name to the name Snowpark
    # reports. Snowpark wraps case-sensitive names in double quotes
    # (e.g. '"date"'), so the quotes are stripped before lower-casing;
    # otherwise the lookups below would never match such columns.
    columns_map = {name.strip('"').lower(): name for name in df.columns}

    # Convert date string to DATE type
    date_col = columns_map.get("date")
    if date_col is not None:
        df = df.with_column(date_col, F.to_date(df[date_col]))

    # Convert numeric columns to FLOAT
    for key in ("open", "high", "low", "close", "volume", "adjclose"):
        actual_col = columns_map.get(key)
        if actual_col is not None:
            df = df.with_column(actual_col, F.to_double(df[actual_col]))

    # Replace the target table with the loaded data
    target = f"{schema}.{table_name}"
    df.write.mode("overwrite").save_as_table(target)

    print(f"Successfully loaded {table_name}")

    # Return row count
    return session.table(target).count()

In [None]:
# SNOWFLAKE ADVANTAGE: Warehouse elasticity (dynamic scaling)
def load_all_crypto_tables(session, warehouse_name="CRYPTO_WH"):
    """
    Create the RAW_CRYPTO schema and tables, then load every CRYPTO_TABLES file.

    The warehouse is resized to LARGE before the work starts and set to
    XSMALL afterwards, including when the load raises.

    Parameters:
    session: Snowpark session
    warehouse_name: The name of the Snowflake warehouse to use

    Returns:
    Dict mapping each table name to the row count returned by load_raw_table.
    """
    def run(statement):
        # Submit one SQL statement and collect its result
        return session.sql(statement).collect()

    use_crypto_role = "USE ROLE CRYPTO_ROLE"

    # Switch role, then scale the warehouse up for faster loading
    run(use_crypto_role)
    run(f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE = LARGE WAIT_FOR_COMPLETION = TRUE")

    try:
        # Schema first, under CRYPTO_ROLE
        run(use_crypto_role)
        run("CREATE SCHEMA IF NOT EXISTS RAW_CRYPTO")

        # One table per ticker, re-asserting the role before each DDL
        for table_name in CRYPTO_TABLES:
            run(use_crypto_role)
            run(f"""
            CREATE TABLE IF NOT EXISTS RAW_CRYPTO.{table_name} (
                date DATE PRIMARY KEY,
                open FLOAT,
                high FLOAT,
                low FLOAT,
                close FLOAT,
                volume FLOAT,
                adjclose FLOAT
            )
            """)

        # Load each file and remember how many rows ended up in its table
        row_counts = {}
        for table_name, file_name in CRYPTO_TABLES.items():
            run(use_crypto_role)
            row_counts[table_name] = load_raw_table(session, table_name, file_name)

        return row_counts

    finally:
        # Scale the warehouse back down whether or not the load succeeded
        run(f"ALTER WAREHOUSE {warehouse_name} SET WAREHOUSE_SIZE = XSMALL")

In [None]:
def validate_crypto_tables(session, schema="RAW_CRYPTO"):
    """
    Print the column names and the date range of every loaded crypto table.

    Parameters:
    session: Snowpark session
    schema: Schema that holds the tables
    """
    print("Validating loaded tables:")

    # Iterate the shared CRYPTO_TABLES mapping so the validated tables stay in
    # sync with the ones that are loaded.
    for table_name in CRYPTO_TABLES:
        qualified_name = f"{schema}.{table_name}"

        # Get column names from the table
        columns = session.table(qualified_name).columns
        print(f"Actual columns in {table_name}: {columns}")

        # Find the date column (case insensitive). Snowpark wraps
        # case-sensitive names in double quotes (e.g. '"date"'), so compare
        # on the unquoted name.
        date_column = next(
            (name for name in columns if name.strip('"').upper() == "DATE"),
            None,
        )
        if date_column is None:
            print(f"{table_name}: no date column found, skipping date range check")
            continue

        # Quote the identifier only when it is not quoted already; wrapping an
        # already quoted name in another pair of quotes yields invalid SQL.
        if date_column.startswith('"') and date_column.endswith('"'):
            date_identifier = date_column
        else:
            date_identifier = f'"{date_column}"'

        date_range = session.sql(f"""
                SELECT
                    MIN({date_identifier}) as min_date,
                    MAX({date_identifier}) as max_date
                FROM {qualified_name}
            """).collect()[0]

        print(f"{table_name} data range: {date_range['MIN_DATE']} to {date_range['MAX_DATE']}")

In [None]:
# Main execution function
def main(session):
    """
    Run the whole load: switch role, load every table, then validate them.

    Parameters:
    session: Snowpark session
    """
    print("Starting cryptocurrency data loading process...")

    # Switch the session role before loading
    admin_role_stmt = session.sql("USE ROLE ACCOUNTADMIN")
    admin_role_stmt.collect()

    # Load every table and report the per-table row counts
    row_counts = load_all_crypto_tables(session)
    print(f"Data loading complete! Rows loaded: {row_counts}")

    # Check what was loaded
    validate_crypto_tables(session)
    print("Process completed successfully!")


# Script entry point (e.g. when executed from a Snowflake notebook)
if __name__ == "__main__":
    # These imports only run when the file is executed as a script; the
    # second one binds F at module level.
    from snowflake.snowpark import Session
    import snowflake.snowpark.functions as F

    # Run the loader inside a session obtained from the builder
    session_builder = Session.builder
    with session_builder.getOrCreate() as session:
        main(session)