In [37]:
# Cell 1: Import libraries and read data
import pandas as pd
import numpy as np


In [38]:
# Read census files
df_1900 = pd.read_csv('data/lakeland_1900_census.csv')
df_1920 = pd.read_csv('data/lakeland_1920_census.csv')
df_1930 = pd.read_csv('data/lakeland_1930_census.csv')
df_1940 = pd.read_csv('data/lakeland_1940_census.csv')
df_1950 = pd.read_csv('data/lakeland_1950_census.csv')

census_dfs = {
    1900: df_1900,
    1920: df_1920,
    1930: df_1930,
    1940: df_1940,
    1950: df_1950
}


In [39]:
# Cell 2: Define column standardization function
def standardize_columns(df, year):
    """Standardize column names across census years"""
    column_mapping = {
        'pk': 'source_pk',
        'dwelling number': 'dwelling_number',
        'dwelling': 'dwelling_number',
        'house_num': 'house_number',
        'relation_head': 'relation_to_head',
        'head_last': 'head_last_name',
        'head_first': 'head_first_name',
        'marital': 'marital_status',
        'place_birth': 'birth_place',
        'family': 'family_number'  # Added this mapping
    }
    
    df = df.copy()
    df = df.rename(columns=column_mapping)
    df['census_year'] = year
    
    # Print columns before and after standardization
    print(f"\nYear {year}:")
    print("Original columns:", df.columns.tolist())
    return df

# Standardize all dataframes
standardized_dfs = {
    year: standardize_columns(df, year) 
    for year, df in census_dfs.items()
}


1900 Census Columns: ['source_pk', 'dwelling_number', 'family', 'last_name', 'first_name', 'head_last_name', 'head_first_name', 'relation_to_head', 'race', 'sex', 'age', 'marital_status', 'birth_place', 'work', 'owned_rented', 'census_year']

1920 Census Columns: ['source_pk', 'dwelling_number', 'family', 'last_name', 'first_name', 'relation_to_head', 'head_last_name', 'head_first_name', 'sex', 'race', 'marital_status', 'age', 'birth_place', 'work', 'business', 'owned_rented', 'census_year']

1930 Census Columns: ['source_pk', 'dwelling_number', 'family', 'street_name', 'last_name', 'first_name', 'relation_to_head', 'sex', 'race', 'marital_status', 'age', 'birth_place', 'work', 'business', 'owned_rented', 'census_year']

1940 Census Columns: ['source_pk', 'ed', 'house_number', 'street_name', 'last_name', 'first_name', 'relation_to_head', 'head_last_name', 'head_first_name', 'sex', 'race', 'marital_status', 'age', 'birth_place', 'work', 'business', 'owned_rented', 'census_year']

1950 

In [40]:
# Cell 3: Generate IDs for persons and families
def generate_ids(df, prefix):
    """Generate string IDs with prefix"""
    return [f"{prefix}_{i:06d}" for i in range(len(df))]

# Add IDs to each dataframe
for year, df in standardized_dfs.items():
    df['person_id'] = generate_ids(df, f"P{year}")
    df['family_id'] = generate_ids(
        df.groupby(['dwelling_number', 'family_number']).ngroup(), 
        f"F{year}"
    )

# Print sample to verify
print("Sample from 1900:")
print(standardized_dfs[1900][['person_id', 'family_id', 'first_name', 'last_name']].head())

KeyError: 'family_number'

In [None]:
# Cell 4: Create census_records table
census_records_data = []

for year, df in standardized_dfs.items():
    census_records = pd.DataFrame({
        'census_year': df['census_year'],
        'source_pk': df['source_pk'],
        'ed': df['ed'] if 'ed' in df.columns else None,
        'page_number': None  # Add if available
    })
    census_records['record_id'] = range(1, len(census_records) + 1)
    census_records_data.append(census_records)

census_records_table = pd.concat(census_records_data, ignore_index=True)
print("Census Records Table Sample:")
print(census_records_table.head())

In [None]:
# Cell 5: Create locations table
locations_data = []

for year, df in standardized_dfs.items():
    locations = pd.DataFrame({
        'street_name': df['street_name'] if 'street_name' in df.columns else None,
        'house_num': df['house_number'] if 'house_number' in df.columns else None,
        'build_num': df['build_num'] if 'build_num' in df.columns else None,
        'dwelling_number': df['dwelling_number'],
        'family_number': df['family_number']
    })
    locations['location_id'] = range(1, len(locations) + 1)
    locations_data.append(locations)

locations_table = pd.concat(locations_data, ignore_index=True)
print("Locations Table Sample:")
print(locations_table.head())

In [None]:
# Cell 6: Create persons table
persons_data = []

for year, df in standardized_dfs.items():
    persons = pd.DataFrame({
        'person_id': df['person_id'],
        'first_name': df['first_name'],
        'last_name': df['last_name']
    }).drop_duplicates()
    persons_data.append(persons)

persons_table = pd.concat(persons_data, ignore_index=True).drop_duplicates()
print("Persons Table Sample:")
print(persons_table.head())

In [None]:
# Cell 7: Create personal_attributes table
personal_attributes_data = []

for year, df in standardized_dfs.items():
    personal_attributes = pd.DataFrame({
        'person_id': df['person_id'],
        'sex': df['sex'],
        'race': df['race'],
        'age': df['age'],
        'place_birth': df['birth_place'] if 'birth_place' in df.columns else None
    })
    personal_attributes['attribute_id'] = range(1, len(personal_attributes) + 1)
    personal_attributes_data.append(personal_attributes)

personal_attributes_table = pd.concat(personal_attributes_data, ignore_index=True)
print("Personal Attributes Table Sample:")
print(personal_attributes_table.head())

In [None]:
# Cell 8: Create remaining tables (occupations, families, relationships, etc.)
# Occupations
occupations_data = []
for year, df in standardized_dfs.items():
    occupations = pd.DataFrame({
        'person_id': df['person_id'],
        'work': df['work'],
        'business': df['business'] if 'business' in df.columns else None
    })
    occupations['occupation_id'] = range(1, len(occupations) + 1)
    occupations_data.append(occupations)

occupations_table = pd.concat(occupations_data, ignore_index=True)

# Families
families_data = []
for year, df in standardized_dfs.items():
    families = pd.DataFrame({
        'family_id': df['family_id'],
        'head_first_name': df['head_first_name'],
        'head_last_name': df['head_last_name']
    }).drop_duplicates()
    families_data.append(families)

families_table = pd.concat(families_data, ignore_index=True).drop_duplicates()

# Print samples
print("Occupations Table Sample:")
print(occupations_table.head())
print("\nFamilies Table Sample:")
print(families_table.head())

In [None]:
# Cell 9: Export all tables to CSV
# Create processed directory if it doesn't exist
import os
os.makedirs('data/processed', exist_ok=True)

# Dictionary of all tables
tables = {
    'census_records': census_records_table,
    'locations': locations_table,
    'persons': persons_table,
    'personal_attributes': personal_attributes_table,
    'occupations': occupations_table,
    'families': families_table
}

# Export each table
for table_name, df in tables.items():
    output_path = f'data/processed/{table_name}.csv'
    df.to_csv(output_path, index=False)
    print(f"Exported {table_name} to {output_path}")

In [None]:
# Cell 10: Verify data integrity
# Print summary statistics for each table
for table_name, df in tables.items():
    print(f"\n{table_name} Summary:")
    print(f"Number of rows: {len(df)}")
    print(f"Number of unique person_ids: {len(df['person_id'].unique()) if 'person_id' in df.columns else 'N/A'}")
    print(f"Columns: {df.columns.tolist()}")
    print("-" * 50)