In [2]:
# Cell 1: Setup and Configuration
import pandas as pd
import numpy as np
from supabase import create_client
import os
from dotenv import load_dotenv
import time

# Pull settings from the local dotenv file into the process environment
load_dotenv('.env.local')

# Both Supabase settings are mandatory; stop right away if either is absent
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

if not (url and key):
    raise ValueError(
        "Missing Supabase credentials. Please ensure SUPABASE_URL and SUPABASE_KEY "
        "are set in your .env.local file"
    )

print("Supabase credentials loaded successfully")

# Build the Supabase client that the later cells share
try:
    supabase = create_client(url, key)
    print("Successfully connected to Supabase")
except Exception as exc:
    # Report the problem, then let the original exception propagate
    print(f"Error connecting to Supabase: {str(exc)}")
    raise

# Load every processed CSV into a DataFrame, keyed by its table name
try:
    tables = {
        table_key: pd.read_csv(csv_path)
        for table_key, csv_path in (
            ('census_records', 'data/processed/census_records.csv'),
            ('locations', 'data/processed/locations.csv'),
            ('persons', 'data/processed/persons.csv'),
            ('personal_attributes', 'data/processed/personal_attributes.csv'),
            ('occupations', 'data/processed/occupations.csv'),
            ('families', 'data/processed/families.csv'),
            ('relationships', 'data/processed/relationships.csv'),
            ('property_status', 'data/processed/property_status.csv'),
            ('marital_status', 'data/processed/marital_status.csv'),
        )
    }
    print("Successfully loaded all CSV files")
except Exception as exc:
    # Report the problem, then let the original exception propagate
    print(f"Error loading CSV files: {str(exc)}")
    raise

ValueError: Missing Supabase credentials. Please ensure SUPABASE_URL and SUPABASE_KEY are set in your .env.local file

In [None]:
# Cell 2: Define upload function with error handling and batching
def upload_to_supabase(table_name, df, batch_size=100):
    """
    Upload a DataFrame to a Supabase table in batches with error handling.

    Missing values (NaN / NaT / pd.NA) are converted to None before the rows
    are sent, so they travel as nulls rather than NaN, which is not valid
    JSON. A batch whose insert raises is reported, dumped to
    ``failed_<table_name>_batch_<start_row>.txt`` and skipped; the remaining
    batches are still attempted.

    Args:
        table_name: Name of the target Supabase table.
        df: pandas DataFrame whose rows are inserted.
        batch_size: Maximum number of rows per insert request (must be > 0).

    Returns:
        Number of rows contained in the batches whose insert call completed
        without raising.

    Raises:
        ValueError: If batch_size is zero or negative.
    """
    # A negative step would make range() empty and silently upload nothing
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Uploading {table_name}...")
    total_rows = len(df)
    successful_uploads = 0

    # Nothing to send for an empty frame
    if total_rows == 0:
        return successful_uploads

    # Cast to object first so None survives instead of being coerced back to NaN,
    # then convert the frame to a list of row dictionaries
    records = df.astype(object).where(df.notna(), None).to_dict('records')

    # Process in batches
    for i in range(0, total_rows, batch_size):
        batch = records[i:i + batch_size]
        try:
            # Upload batch; the response is not needed, only success/failure
            supabase.table(table_name).insert(batch).execute()
            successful_uploads += len(batch)

            # Print progress
            progress = (i + len(batch)) / total_rows * 100
            print(f"Progress: {progress:.2f}% ({successful_uploads}/{total_rows} rows)")

            # Small delay to avoid rate limits
            time.sleep(0.1)

        except Exception as e:
            print(f"Error uploading batch starting at row {i}: {str(e)}")
            # Log failed batch (Python repr of the row dicts) for retry if needed;
            # explicit UTF-8 so non-ASCII values do not depend on the platform default
            with open(f'failed_{table_name}_batch_{i}.txt', 'w', encoding='utf-8') as f:
                f.write(str(batch))

    return successful_uploads

In [None]:
# Cell 2: Define upload function with error handling and batching
# NOTE(review): this notebook defines upload_to_supabase in more than one cell;
# when the cells run top to bottom, the last definition executed is the one used.
def upload_to_supabase(table_name, df, batch_size=100):
    """
    Upload a DataFrame to a Supabase table in batches with error handling.

    Missing values (NaN / NaT / pd.NA) are converted to None before the rows
    are sent, so they travel as nulls rather than NaN, which is not valid
    JSON. A batch whose insert raises is reported, dumped to
    ``failed_<table_name>_batch_<start_row>.txt`` and skipped; the remaining
    batches are still attempted.

    Args:
        table_name: Name of the target Supabase table.
        df: pandas DataFrame whose rows are inserted.
        batch_size: Maximum number of rows per insert request (must be > 0).

    Returns:
        Number of rows contained in the batches whose insert call completed
        without raising.

    Raises:
        ValueError: If batch_size is zero or negative.
    """
    # A negative step would make range() empty and silently upload nothing
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    print(f"Uploading {table_name}...")
    total_rows = len(df)
    successful_uploads = 0

    # Nothing to send for an empty frame
    if total_rows == 0:
        return successful_uploads

    # Cast to object first so None survives instead of being coerced back to NaN,
    # then convert the frame to a list of row dictionaries
    records = df.astype(object).where(df.notna(), None).to_dict('records')

    # Process in batches
    for i in range(0, total_rows, batch_size):
        batch = records[i:i + batch_size]
        try:
            # Upload batch; the response is not needed, only success/failure
            supabase.table(table_name).insert(batch).execute()
            successful_uploads += len(batch)

            # Print progress
            progress = (i + len(batch)) / total_rows * 100
            print(f"Progress: {progress:.2f}% ({successful_uploads}/{total_rows} rows)")

            # Small delay to avoid rate limits
            time.sleep(0.1)

        except Exception as e:
            print(f"Error uploading batch starting at row {i}: {str(e)}")
            # Log failed batch (Python repr of the row dicts) for retry if needed;
            # explicit UTF-8 so non-ASCII values do not depend on the platform default
            with open(f'failed_{table_name}_batch_{i}.txt', 'w', encoding='utf-8') as f:
                f.write(str(batch))

    return successful_uploads

In [None]:
# Cell 3: Upload order respecting foreign key constraints
# Parent tables are listed ahead of the tables that reference them
upload_order = [
    'census_records',       # standalone: no foreign keys
    'locations',            # references census_records
    'persons',              # standalone: no foreign keys
    'families',             # references census_records and locations
    'personal_attributes',  # references persons and census_records
    'occupations',          # references persons and census_records
    'relationships',        # references persons, families and census_records
    'property_status',      # references persons and census_records
    'marital_status'        # references persons and census_records
]

# Per-table row counts, filled in as each upload finishes
upload_stats = {}

# Walk the tables in dependency order
for table_name in upload_order:
    # Tables that were not loaded are skipped
    if table_name not in tables:
        continue

    df = tables[table_name]
    # created_at is populated by Supabase, so it is not sent
    if 'created_at' in df:
        df = df.drop(columns=['created_at'])

    print(f"\nProcessing {table_name}...")
    successful_rows = upload_to_supabase(table_name, df)
    upload_stats[table_name] = {
        'total_rows': len(df),
        'uploaded_rows': successful_rows
    }

In [None]:
# Cell 4: Print upload summary
# Report, for every table that was processed, how many rows were read from
# its CSV and how many of them were uploaded.
print("\nUpload Summary:")
print("-" * 50)
for table_name, stats in upload_stats.items():
    total_rows = stats['total_rows']
    uploaded_rows = stats['uploaded_rows']
    print(f"{table_name}:")
    print(f"  Total rows: {total_rows}")
    print(f"  Uploaded rows: {uploaded_rows}")
    if total_rows:
        success_rate = (uploaded_rows / total_rows) * 100
        print(f"  Success rate: {success_rate:.2f}%")
    else:
        # An empty table has no meaningful rate; dividing would raise ZeroDivisionError
        print("  Success rate: n/a (no rows to upload)")
    print("-" * 50)