In [147]:
# Cell 0: Schema Verification and Setup
import pandas as pd
import numpy as np
import json
import os

# Create processed directory if it doesn't exist
os.makedirs('data/processed', exist_ok=True)

# Define schema requirements
schema = {
    'census_records': {
        'required_columns': ['record_id', 'census_year', 'source_pk'],
        'integer_columns': ['record_id', 'census_year', 'source_pk'],
        'varchar_columns': ['ed', 'page_number']
    },
    'locations': {
        'required_columns': ['location_id', 'record_id'],
        'integer_columns': ['location_id', 'record_id'],
        'varchar_columns': ['street_name', 'house_num', 'build_num', 'dwelling_number', 'family_number']
    },
    'persons': {
        'required_columns': ['person_id', 'first_name', 'last_name'],
        'varchar_columns': ['person_id', 'first_name', 'last_name']
    },
    'personal_attributes': {
        'required_columns': ['attribute_id', 'person_id', 'record_id'],
        'integer_columns': ['attribute_id', 'record_id', 'age'],
        'varchar_columns': ['person_id', 'sex', 'race', 'place_birth']
    },
    'occupations': {
        'required_columns': ['occupation_id', 'person_id', 'record_id'],
        'integer_columns': ['occupation_id', 'record_id'],
        'varchar_columns': ['person_id', 'work', 'business']
    },
    'families': {
        'required_columns': ['family_id', 'record_id', 'location_id'],
        'integer_columns': ['record_id', 'location_id'],
        'varchar_columns': ['family_id', 'head_first_name', 'head_last_name']
    },
    'relationships': {
        'required_columns': ['relationship_id', 'person_id', 'family_id', 'record_id'],
        'integer_columns': ['relationship_id', 'record_id'],
        'varchar_columns': ['person_id', 'family_id', 'relation_to_head']
    },
    'property_status': {
        'required_columns': ['property_id', 'person_id', 'record_id'],
        'integer_columns': ['property_id', 'record_id'],
        'varchar_columns': ['person_id', 'owned_rented']
    },
    'marital_status': {
        'required_columns': ['marital_id', 'person_id', 'record_id'],
        'integer_columns': ['marital_id', 'record_id'],
        'varchar_columns': ['person_id', 'marital_status']
    }
}

In [148]:
# Cell 1: Read Census Data
print("Reading census data...")
# Read the extracted CSV files
census_dfs = {
    1900: pd.read_csv('data/extracted/lakeland_1900_census.csv'),
    1920: pd.read_csv('data/extracted/lakeland_1920_census.csv'),
    1930: pd.read_csv('data/extracted/lakeland_1930_census.csv'),
    1940: pd.read_csv('data/extracted/lakeland_1940_census.csv'),
    1950: pd.read_csv('data/extracted/lakeland_1950_census.csv')
}


Reading census data...


In [149]:
# Cell 2: Standardize Columns
def standardize_columns(df, year):
    """Standardize column names to match database schema"""
    df = df.copy()
    
    # Convert column names to lowercase and strip whitespace
    df.columns = df.columns.str.lower().str.strip()
    
    # Standard column mappings
    column_mapping = {
        'pk': 'source_pk',
        'dwelling number': 'dwelling_number',
        'dwelling': 'dwelling_number',
        'dwelling no': 'dwelling_number',
        'dwelling_no': 'dwelling_number',
        'house_number': 'house_num',
        'house number': 'house_num',
        'relation_head': 'relation_to_head',
        'relation to head': 'relation_to_head',
        'head_last': 'head_last_name',
        'head last': 'head_last_name',
        'head_first': 'head_first_name',
        'head first': 'head_first_name',
        'marital': 'marital_status',
        'birth_place': 'place_birth',
        'birth place': 'place_birth',
        'family': 'family_number',
        'family no': 'family_number',
        'family_no': 'family_number',
        'family number': 'family_number'
    }
    
    # Apply mappings
    df = df.rename(columns=column_mapping)
    df['census_year'] = year
    
    return df

# Create standardized dataframes
standardized_dfs = {}
for year, df in census_dfs.items():
    print(f"Standardizing {year} census...")
    standardized_dfs[year] = standardize_columns(df, year)

Standardizing 1900 census...
Standardizing 1920 census...
Standardizing 1930 census...
Standardizing 1940 census...
Standardizing 1950 census...


In [150]:
# Cell 3: Create census_records table
census_records_data = []
record_id_counter = 1

for year, df in standardized_dfs.items():
    census_records = pd.DataFrame({
        'record_id': range(record_id_counter, record_id_counter + len(df)),
        'census_year': df['census_year'].astype('Int64'),
        'source_pk': df['source_pk'].astype('Int64'),
        'ed': df['ed'] if 'ed' in df.columns else None,
        'page_number': df['page_number'] if 'page_number' in df.columns else None
    })
    record_id_counter += len(df)
    census_records_data.append(census_records)

census_records_table = pd.concat(census_records_data, ignore_index=True)

In [151]:
# Cell 4: Create persons table with unique IDs
def generate_person_id(year, index):
    return f"P{year}_{index:06d}"

persons_data = []
for year, df in standardized_dfs.items():
    persons = pd.DataFrame({
        'person_id': [generate_person_id(year, i) for i in range(len(df))],
        'first_name': df['first_name'],
        'last_name': df['last_name']
    })
    persons_data.append(persons)

persons_table = pd.concat(persons_data, ignore_index=True).drop_duplicates()

In [152]:
# Cell 5: Create locations table
locations_data = []
location_id_counter = 1

for year, df in standardized_dfs.items():
    # Create a base DataFrame with the required columns
    locations = pd.DataFrame({
        'location_id': range(location_id_counter, location_id_counter + len(df)),
        'record_id': range(record_id_counter - len(df), record_id_counter)
    })
    
    # Add optional columns if they exist, otherwise use None
    optional_columns = {
        'street_name': None,
        'house_num': None,
        'build_num': None,
        'dwelling_number': None,
        'family_number': None
    }
    
    for col, default in optional_columns.items():
        locations[col] = df[col] if col in df.columns else default
        
    # If dwelling_number is missing, create it from index
    if locations['dwelling_number'].isna().all():
        locations['dwelling_number'] = range(1, len(df) + 1)
        print(f"Warning: Created sequential dwelling_number for {year} census")
    
    # If family_number is missing, create it from index
    if locations['family_number'].isna().all():
        locations['family_number'] = range(1, len(df) + 1)
        print(f"Warning: Created sequential family_number for {year} census")
    
    location_id_counter += len(df)
    locations_data.append(locations)

locations_table = pd.concat(locations_data, ignore_index=True)

# Print sample of the locations table
print("\nLocations Table Sample:")
print(locations_table.head())


Locations Table Sample:
   location_id  record_id street_name  house_num build_num  dwelling_number  \
0            1        612        None        NaN      None             32.0   
1            2        613        None        NaN      None             32.0   
2            3        614        None        NaN      None             32.0   
3            4        615        None        NaN      None             32.0   
4            5        616        None        NaN      None             32.0   

   family_number  
0           32.0  
1           32.0  
2           32.0  
3           32.0  
4           32.0  


  locations_table = pd.concat(locations_data, ignore_index=True)


In [153]:
# Cell 6: Create families table
def generate_family_id(year, index):
    return f"F{year}_{index:06d}"

def ensure_family_columns(df, year):
    """Ensure required family columns exist and are properly formatted"""
    df = df.copy()
    
    # Create dwelling_number if missing
    if 'dwelling_number' not in df.columns:
        df['dwelling_number'] = range(1, len(df) + 1)
        print(f"Warning: Created sequential dwelling_number for {year}")
    
    # Create family_number if missing
    if 'family_number' not in df.columns:
        df['family_number'] = range(1, len(df) + 1)
        print(f"Warning: Created sequential family_number for {year}")
    
    # Convert to string and handle NaN values
    df['dwelling_number'] = df['dwelling_number'].fillna(0).astype(int).astype(str)
    df['family_number'] = df['family_number'].fillna(0).astype(int).astype(str)
    
    return df

families_data = []
family_id_maps = {}  # Store family ID mappings for each year

for year, df in standardized_dfs.items():
    try:
        print(f"\nProcessing year {year}...")
        
        # Ensure required columns exist and are properly formatted
        df = ensure_family_columns(df, year)
        
        # Group by dwelling and family number
        family_groups = df.groupby(['dwelling_number', 'family_number'])
        
        # Get the first record for each family group
        first_records = family_groups.first().reset_index()
        num_families = len(first_records)
        
        # Create family IDs and store mapping
        family_ids = [generate_family_id(year, i) for i in range(num_families)]
        family_id_maps[year] = dict(zip(
            zip(first_records['dwelling_number'], first_records['family_number']),
            family_ids
        ))
        
        # Create the families DataFrame
        families = pd.DataFrame({
            'family_id': family_ids,
            'record_id': range(record_id_counter - num_families, record_id_counter),
            'location_id': range(location_id_counter - num_families, location_id_counter),
            'head_first_name': first_records['head_first_name'] if 'head_first_name' in first_records.columns else None,
            'head_last_name': first_records['head_last_name'] if 'head_last_name' in first_records.columns else None
        })
        
        families_data.append(families)
        print(f"Created {num_families} family records for year {year}")
        
    except Exception as e:
        print(f"\nError processing year {year}")
        print("Available columns:", df.columns.tolist())
        print(f"Error details: {str(e)}")
        raise

families_table = pd.concat(families_data, ignore_index=True)

# Print sample and statistics
print("\nFamilies Table Sample:")
print(families_table.head())
print("\nFamily ID Statistics:")
print(f"Total number of families: {len(families_table)}")
print(f"Number of unique family IDs: {len(families_table['family_id'].unique())}")


Processing year 1900...
Created 50 family records for year 1900

Processing year 1920...
Created 33 family records for year 1920

Processing year 1930...
Created 10 family records for year 1930

Processing year 1940...
Created 61 family records for year 1940

Processing year 1950...
Created 314 family records for year 1950

Families Table Sample:
      family_id  record_id  location_id head_first_name head_last_name
0  F1900_000000        748          748         Ewell A           Dick
1  F1900_000001        749          749         William          Davis
2  F1900_000002        750          750         James F         Meegan
3  F1900_000003        751          751          Andrew           Hill
4  F1900_000004        752          752          Joseph         Tucker

Family ID Statistics:
Total number of families: 468
Number of unique family IDs: 468


In [154]:
# Cell 7: Create personal_attributes table
def clean_age(age):
    """Convert age to integer, handling fractions"""
    if pd.isna(age):
        return None
    try:
        return int(float(age))
    except (ValueError, TypeError):
        if isinstance(age, str) and '/' in age:
            num, denom = map(int, age.split('/'))
            return max(0, int(num / denom))
        return None

personal_attributes_data = []
attribute_id_counter = 1

for year, df in standardized_dfs.items():
    attributes = pd.DataFrame({
        'attribute_id': range(attribute_id_counter, attribute_id_counter + len(df)),
        'person_id': [generate_person_id(year, i) for i in range(len(df))],
        'record_id': range(record_id_counter - len(df), record_id_counter),
        'sex': df['sex'],
        'race': df['race'],
        'age': df['age'].apply(clean_age),
        'place_birth': df['place_birth'] if 'place_birth' in df.columns else None
    })
    attribute_id_counter += len(df)
    personal_attributes_data.append(attributes)

personal_attributes_table = pd.concat(personal_attributes_data, ignore_index=True)

In [155]:
# Cell 8: Create occupations table
occupations_data = []
occupation_id_counter = 1

for year, df in standardized_dfs.items():
    occupations = pd.DataFrame({
        'occupation_id': range(occupation_id_counter, occupation_id_counter + len(df)),
        'person_id': [generate_person_id(year, i) for i in range(len(df))],
        'record_id': range(record_id_counter - len(df), record_id_counter),
        'work': df['work'].replace({np.nan: None}),
        'business': df['business'].replace({np.nan: None}) if 'business' in df.columns else None
    })
    occupation_id_counter += len(df)
    occupations_data.append(occupations)

occupations_table = pd.concat(occupations_data, ignore_index=True)

In [156]:
# Cell 9: Create relationships table using the same family IDs
relationships_data = []
relationship_id_counter = 1

for year, df in standardized_dfs.items():
    try:
        # Ensure required columns exist
        if 'dwelling_number' not in df.columns:
            df['dwelling_number'] = range(1, len(df) + 1)
        if 'family_number' not in df.columns:
            df['family_number'] = range(1, len(df) + 1)
        
        # Get the family ID mapping for this year
        year_family_map = family_id_maps[year]
        
        # Create family IDs using the same mapping as families table
        family_ids = [year_family_map.get((d, f), generate_family_id(year, 0)) 
                     for d, f in zip(df['dwelling_number'], df['family_number'])]
        
        # Create relationships DataFrame
        relationships = pd.DataFrame({
            'relationship_id': range(relationship_id_counter, relationship_id_counter + len(df)),
            'person_id': [generate_person_id(year, i) for i in range(len(df))],
            'family_id': family_ids,
            'record_id': range(record_id_counter - len(df), record_id_counter),
            'relation_to_head': df['relation_to_head'].fillna('Unknown')
        })
        
        relationship_id_counter += len(df)
        relationships_data.append(relationships)
        print(f"Created {len(relationships)} relationship records for year {year}")
        
    except Exception as e:
        print(f"Error processing year {year}: {str(e)}")
        print("DataFrame columns:", df.columns.tolist())
        raise

relationships_table = pd.concat(relationships_data, ignore_index=True)

# Print samples to verify
print("\nFamilies Table Sample:")
print(families_table.head())
print("\nRelationships Table Sample:")
print(relationships_table.head())


Created 186 relationship records for year 1900
Created 186 relationship records for year 1920
Created 50 relationship records for year 1930
Created 61 relationship records for year 1940
Created 314 relationship records for year 1950

Families Table Sample:
      family_id  record_id  location_id head_first_name head_last_name
0  F1900_000000        748          748         Ewell A           Dick
1  F1900_000001        749          749         William          Davis
2  F1900_000002        750          750         James F         Meegan
3  F1900_000003        751          751          Andrew           Hill
4  F1900_000004        752          752          Joseph         Tucker

Relationships Table Sample:
   relationship_id     person_id     family_id  record_id relation_to_head
0                1  P1900_000000  F1900_000000        612             Head
1                2  P1900_000001  F1900_000000        613             Wife
2                3  P1900_000002  F1900_000000        614      

In [157]:
# Cell 10: Create property_status table
property_status_data = []
property_id_counter = 1

for year, df in standardized_dfs.items():
    if 'owned_rented' in df.columns:
        property_status = pd.DataFrame({
            'property_id': range(property_id_counter, property_id_counter + len(df)),
            'person_id': [generate_person_id(year, i) for i in range(len(df))],
            'record_id': range(record_id_counter - len(df), record_id_counter),
            'owned_rented': df['owned_rented'].replace({np.nan: None})
        })
        property_id_counter += len(df)
        property_status_data.append(property_status)

property_status_table = pd.concat(property_status_data, ignore_index=True)

In [158]:
# Cell 11: Create marital_status table
marital_status_data = []
marital_id_counter = 1

for year, df in standardized_dfs.items():
    if 'marital_status' in df.columns:
        marital = pd.DataFrame({
            'marital_id': range(marital_id_counter, marital_id_counter + len(df)),
            'person_id': [generate_person_id(year, i) for i in range(len(df))],
            'record_id': range(record_id_counter - len(df), record_id_counter),
            'marital_status': df['marital_status'].replace({np.nan: None})
        })
        marital_id_counter += len(df)
        marital_status_data.append(marital)

marital_status_table = pd.concat(marital_status_data, ignore_index=True)

In [159]:
# Cell 12: Verify foreign key relationships
def verify_foreign_keys():
    """Verify all foreign key relationships match the database schema"""
    # Get all unique IDs from primary tables
    record_ids = set(census_records_table['record_id'])
    person_ids = set(persons_table['person_id'])
    location_ids = set(locations_table['location_id'])
    family_ids = set(families_table['family_id'])
    
    # Debug family IDs
    print("\nDebugging family IDs:")
    print(f"Number of unique family IDs in families table: {len(family_ids)}")
    print(f"Number of unique family IDs in relationships table: {len(set(relationships_table['family_id']))}")
    
    # Find mismatched family IDs
    relationship_family_ids = set(relationships_table['family_id'])
    mismatched_ids = relationship_family_ids - family_ids
    if mismatched_ids:
        print("\nExample of mismatched family IDs:")
        print("First 5 mismatched IDs:", list(mismatched_ids)[:5])
        print("\nExample records from families table:")
        print(families_table[['family_id']].head())
        print("\nExample records from relationships table:")
        print(relationships_table[['family_id']].head())
        raise AssertionError(f"Found {len(mismatched_ids)} family IDs in relationships table that don't exist in families table")

    # Verify locations foreign keys
    assert all(rid in record_ids for rid in locations_table['record_id']), \
        "Invalid record_id in locations table"

    # Verify families foreign keys
    assert all(rid in record_ids for rid in families_table['record_id']), \
        "Invalid record_id in families table"
    assert all(lid in location_ids for lid in families_table['location_id']), \
        "Invalid location_id in families table"

    # Verify personal_attributes foreign keys
    assert all(pid in person_ids for pid in personal_attributes_table['person_id']), \
        "Invalid person_id in personal_attributes table"
    assert all(rid in record_ids for rid in personal_attributes_table['record_id']), \
        "Invalid record_id in personal_attributes table"

    # Verify occupations foreign keys
    assert all(pid in person_ids for pid in occupations_table['person_id']), \
        "Invalid person_id in occupations table"
    assert all(rid in record_ids for rid in occupations_table['record_id']), \
        "Invalid record_id in occupations table"

    # Verify relationships foreign keys
    assert all(pid in person_ids for pid in relationships_table['person_id']), \
        "Invalid person_id in relationships table"
    assert all(fid in family_ids for fid in relationships_table['family_id']), \
        "Invalid family_id in relationships table"
    assert all(rid in record_ids for rid in relationships_table['record_id']), \
        "Invalid record_id in relationships table"

    print("All foreign key relationships verified!")

# Debug family IDs
print("\nDebugging family IDs:")
print("\nFamilies table sample:")
print(families_table[['family_id']].head())
print("\nRelationships table sample:")
print(relationships_table[['family_id']].head())

print("\nFamily ID counts:")
print(f"Number of unique family IDs in families table: {len(set(families_table['family_id']))}")
print(f"Number of unique family IDs in relationships table: {len(set(relationships_table['family_id']))}")

# Find some mismatched IDs
family_ids = set(families_table['family_id'])
relationship_family_ids = set(relationships_table['family_id'])
mismatched_ids = relationship_family_ids - family_ids
if mismatched_ids:
    print("\nFirst 5 family IDs that exist in relationships but not in families:")
    print(list(mismatched_ids)[:5])
    
# Verify foreign keys
verify_foreign_keys()



Debugging family IDs:

Families table sample:
      family_id
0  F1900_000000
1  F1900_000001
2  F1900_000002
3  F1900_000003
4  F1900_000004

Relationships table sample:
      family_id
0  F1900_000000
1  F1900_000000
2  F1900_000000
3  F1900_000000
4  F1900_000000

Family ID counts:
Number of unique family IDs in families table: 468
Number of unique family IDs in relationships table: 5

Debugging family IDs:
Number of unique family IDs in families table: 468
Number of unique family IDs in relationships table: 5
All foreign key relationships verified!


In [160]:
# Cell 13: Data Type Conversion
def convert_to_string(value):
    """Convert value to string, handling NaN and None"""
    if pd.isna(value):
        return None
    # Remove .0 from float strings
    str_val = str(value).rstrip('.0')
    return str_val if str_val != 'nan' else None

def convert_to_integer(value):
    """Convert value to integer, handling NaN and None"""
    if pd.isna(value):
        return None
    try:
        # First convert to float to handle any decimal values
        float_val = float(value)
        # Then convert to int, rounding down
        return int(float_val)
    except (ValueError, TypeError):
        return None

def convert_to_postgres_types():
    """Convert DataFrame columns to match PostgreSQL schema types"""
    global locations_table, census_records_table, personal_attributes_table
    global persons_table, occupations_table, families_table
    global relationships_table, property_status_table, marital_status_table
    
    # Convert locations table string columns
    string_cols = ['street_name', 'house_num', 'build_num', 'dwelling_number', 'family_number']
    for col in string_cols:
        if col in locations_table.columns:
            locations_table[col] = locations_table[col].apply(convert_to_string)
    
    # Convert census_records integer columns
    census_records_table['census_year'] = census_records_table['census_year'].apply(convert_to_integer)
    census_records_table['source_pk'] = census_records_table['source_pk'].apply(convert_to_integer)
    
    # Convert census_records string columns
    for col in ['ed', 'page_number']:
        if col in census_records_table.columns:
            census_records_table[col] = census_records_table[col].apply(convert_to_string)
    
    # Convert personal_attributes age to integer
    if 'age' in personal_attributes_table.columns:
        personal_attributes_table['age'] = personal_attributes_table['age'].apply(convert_to_integer)
    
    # Ensure all record_id fields are integers
    for table in [locations_table, personal_attributes_table, occupations_table, 
                 families_table, relationships_table, property_status_table, 
                 marital_status_table]:
        if 'record_id' in table.columns:
            table['record_id'] = table['record_id'].apply(convert_to_integer)
    
    # Convert IDENTITY columns to integers
    id_columns = {
        'locations_table': ['location_id'],
        'personal_attributes_table': ['attribute_id'],
        'occupations_table': ['occupation_id'],
        'relationships_table': ['relationship_id'],
        'property_status_table': ['property_id'],
        'marital_status_table': ['marital_id']
    }
    
    for table_name, columns in id_columns.items():
        df = eval(table_name)
        for col in columns:
            if col in df.columns:
                df[col] = df[col].apply(convert_to_integer)
    
    # Convert all varchar columns
    varchar_columns = {
        'persons_table': ['person_id', 'first_name', 'last_name'],
        'personal_attributes_table': ['person_id', 'sex', 'race', 'place_birth'],
        'occupations_table': ['person_id', 'work', 'business'],
        'families_table': ['family_id', 'head_first_name', 'head_last_name'],
        'relationships_table': ['person_id', 'family_id', 'relation_to_head'],
        'property_status_table': ['person_id', 'owned_rented'],
        'marital_status_table': ['person_id', 'marital_status']
    }
    
    for table_name, columns in varchar_columns.items():
        df = eval(table_name)
        for col in columns:
            if col in df.columns:
                df[col] = df[col].apply(convert_to_string)

# Run conversion
print("Converting data types to match PostgreSQL schema...")
convert_to_postgres_types()

# Verify conversions
print("\nVerifying data types after conversion:")

# Check locations table string columns
print("\nLocations table string columns:")
for col in ['house_num', 'dwelling_number', 'family_number']:
    if col in locations_table.columns:
        print(f"\n{col}:")
        print("Sample values:", locations_table[col].head().tolist())
        print("Data type:", locations_table[col].dtype)

# Check personal_attributes age column
print("\nPersonal attributes age column:")
if 'age' in personal_attributes_table.columns:
    print("Sample values:", personal_attributes_table['age'].head().tolist())
    print("Data type:", personal_attributes_table['age'].dtype)

Converting data types to match PostgreSQL schema...

Verifying data types after conversion:

Locations table string columns:

house_num:
Sample values: [None, None, None, None, None]
Data type: object

dwelling_number:
Sample values: ['32', '32', '32', '32', '32']
Data type: object

family_number:
Sample values: ['32', '32', '32', '32', '32']
Data type: object

Personal attributes age column:
Sample values: [58.0, 56.0, 22.0, 20.0, 15.0]
Data type: float64


In [161]:
# Cell 14: Export Tables
# Export tables to CSV
print("Exporting tables to CSV...")
tables_to_export = {
    'census_records': census_records_table,
    'locations': locations_table,
    'persons': persons_table,
    'personal_attributes': personal_attributes_table,
    'occupations': occupations_table,
    'families': families_table,
    'relationships': relationships_table,
    'property_status': property_status_table,
    'marital_status': marital_status_table
}

for table_name, df in tables_to_export.items():
    output_path = f'data/processed/{table_name}.csv'
    df.to_csv(output_path, index=False, na_rep='NULL')
    print(f"Exported {table_name} to {output_path}")
    print(f"\n{table_name.title()} Table Sample:")
    print(df.head())

print("\nAll tables exported successfully!")



Exporting tables to CSV...
Exported census_records to data/processed/census_records.csv

Census_Records Table Sample:
   record_id  census_year  source_pk    ed page_number
0          1         1900          1  None        None
1          2         1900          2  None        None
2          3         1900          3  None        None
3          4         1900          4  None        None
4          5         1900          5  None        None
Exported locations to data/processed/locations.csv

Locations Table Sample:
   location_id  record_id street_name house_num build_num dwelling_number  \
0            1        612        None      None      None              32   
1            2        613        None      None      None              32   
2            3        614        None      None      None              32   
3            4        615        None      None      None              32   
4            5        616        None      None      None              32   

  family_numb

In [162]:
# Cell 15: Print summary statistics
print("\nData Transform Summary:")
print("-" * 50)
for table_name, df in tables.items():
    print(f"\n{table_name}:")
    print(f"  Rows: {len(df)}")
    print(f"  Columns: {', '.join(df.columns)}")
    print(f"  Sample:")
    print(df.head(2))
    print("-" * 50)


Data Transform Summary:
--------------------------------------------------

census_records:
  Rows: 797
  Columns: record_id, census_year, source_pk, ed, page_number
  Sample:
   record_id  census_year  source_pk    ed page_number
0          1         1900          1  None        None
1          2         1900          2  None        None
--------------------------------------------------

locations:
  Rows: 797
  Columns: location_id, record_id, street_name, house_num, build_num, dwelling_number, family_number
  Sample:
   location_id  record_id street_name  house_num build_num  dwelling_number  \
0            1        612        None        NaN      None             32.0   
1            2        613        None        NaN      None             32.0   

   family_number  
0           32.0  
1           32.0  
--------------------------------------------------

persons:
  Rows: 797
  Columns: person_id, first_name, last_name
  Sample:
      person_id first_name      last_name
0  P1900