In [14]:
# Cell 1: Setup and Configuration
import pandas as pd
import numpy as np
from supabase import create_client
import os
from dotenv import load_dotenv
import time
from pathlib import Path

# Create directories for processed data and error logs
os.makedirs('data/processed', exist_ok=True)
os.makedirs('logs/failed_uploads', exist_ok=True)

# Add to .gitignore if it doesn't exist
gitignore_path = '.gitignore'
gitignore_entries = """
# Processed data and logs
data/processed/
logs/
.env.local
"""

if not os.path.exists(gitignore_path):
    with open(gitignore_path, 'w') as f:
        f.write(gitignore_entries)
else:
    with open(gitignore_path, 'a') as f:
        f.write(gitignore_entries)

# Load environment variables
load_dotenv('.env.local')
url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_KEY")

if not url or not key:
    raise ValueError("Missing Supabase credentials. Please check .env.local file")

# Initialize Supabase client
try:
    supabase = create_client(url, key)
    print("Successfully connected to Supabase")
except Exception as e:
    print(f"Error connecting to Supabase: {str(e)}")
    raise

# Read processed CSV files with error handling
tables = {}
required_tables = [
    'census_records', 'locations', 'persons', 'personal_attributes',
    'occupations', 'families', 'relationships', 'property_status',
    'marital_status'
]

for table_name in required_tables:
    file_path = f'data/processed/{table_name}.csv'
    try:
        tables[table_name] = pd.read_csv(file_path)
        print(f"Successfully loaded {table_name}")
    except Exception as e:
        print(f"Error loading {table_name}: {str(e)}")
        raise

Successfully connected to Supabase
Successfully loaded census_records
Successfully loaded locations
Successfully loaded persons
Successfully loaded personal_attributes
Successfully loaded occupations
Successfully loaded families
Successfully loaded relationships
Successfully loaded property_status
Successfully loaded marital_status


In [15]:
# Cell 2: Define upload function
def upload_to_supabase(table_name, df, batch_size=100):
    """Upload dataframe to Supabase table in batches with error handling"""
    print(f"Uploading {table_name}...")
    total_rows = len(df)
    successful_uploads = 0
    
    # Convert DataFrame to list of dictionaries
    records = df.to_dict('records')
    
    # Process in batches
    for i in range(0, total_rows, batch_size):
        batch = records[i:i + batch_size]
        try:
            data, count = supabase.table(table_name).insert(batch).execute()
            successful_uploads += len(batch)
            
            # Print progress
            progress = (i + len(batch)) / total_rows * 100
            print(f"Progress: {progress:.2f}% ({successful_uploads}/{total_rows} rows)")
            time.sleep(0.1)  # Small delay to avoid rate limits
            
        except Exception as e:
            print(f"Error uploading batch starting at row {i}: {str(e)}")
            with open(f'failed_{table_name}_batch_{i}.txt', 'w') as f:
                f.write(str(batch))
    
    return successful_uploads

In [16]:
# Cell 3: Upload tables in correct order
upload_order = [
    'census_records',  # No foreign keys
    'locations',       # Depends on census_records
    'persons',        # No foreign keys
    'families',       # Depends on census_records and locations
    'personal_attributes',  # Depends on persons and census_records
    'occupations',    # Depends on persons and census_records
    'relationships',  # Depends on persons, families, and census_records
    'property_status',  # Depends on persons and census_records
    'marital_status'   # Depends on persons and census_records
]

# Track upload statistics
upload_stats = {}

# Upload tables in order
for table_name in upload_order:
    if table_name in tables:
        df = tables[table_name]
        if 'created_at' in df.columns:
            df = df.drop('created_at', axis=1)
        
        print(f"\nProcessing {table_name}...")
        successful_rows = upload_to_supabase(table_name, df)
        upload_stats[table_name] = {
            'total_rows': len(df),
            'uploaded_rows': successful_rows
        }


Processing census_records...
Uploading census_records...
Error uploading batch starting at row 0: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid input syntax for type json'}
Error uploading batch starting at row 100: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid input syntax for type json'}
Error uploading batch starting at row 200: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid input syntax for type json'}
Error uploading batch starting at row 300: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid input syntax for type json'}
Error uploading batch starting at row 400: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid input syntax for type json'}
Error uploading batch starting at row 500: {'code': '22P02', 'details': 'Token "NaN" is invalid.', 'hint': None, 'message': 'invalid i

In [17]:
# Cell 4: Print upload summary
2|print("\nUpload Summary:")
print("-" * 50)
for table_name, stats in upload_stats.items():
    success_rate = (stats['uploaded_rows'] / stats['total_rows']) * 100
    print(f"{table_name}:")
    print(f"  Total rows: {stats['total_rows']}")
    print(f"  Uploaded rows: {stats['uploaded_rows']}")
    print(f"  Success rate: {success_rate:.2f}%")
    print("-" * 50)


Upload Summary:


TypeError: unsupported operand type(s) for |: 'int' and 'NoneType'