In [11]:
import pandas as pd
import numpy as np



In [12]:
# Read census files
df_1900 = pd.read_csv('data/lakeland_1900_census.csv')
df_1920 = pd.read_csv('data/lakeland_1920_census.csv')
df_1930 = pd.read_csv('data/lakeland_1930_census.csv')
df_1940 = pd.read_csv('data/lakeland_1940_census.csv')
df_1950 = pd.read_csv('data/lakeland_1950_census.csv')
df_1965 = pd.read_csv('data/lakeland_1965_voter.csv')

# Print sample of each DataFrame
print("\nSample of each census year:")
for year, df in [('1900', df_1900), ('1920', df_1920), 
                 ('1930', df_1930), ('1940', df_1940),
                 ('1950', df_1950), ('1965', df_1965)]:
    print(f"\n{year} Census Columns:", df.columns.tolist())




Sample of each census year:

1900 Census Columns: ['pk', 'dwelling', 'family', 'last_name', 'first_name', 'head_last', 'head_first', 'relation_head', 'race', 'sex', 'age', 'marital', 'place_birth', 'work', 'owned_rented']

1920 Census Columns: ['pk', 'dwelling number', 'family', 'last_name', 'first_name', 'relation_head', 'head_last', 'head_first', 'sex', 'race', 'marital', 'age', 'place_birth', 'work', 'business', 'owned_rented']

1930 Census Columns: ['pk', 'dwelling number', 'family', 'street_name', 'last_name', 'first_name', 'relation_head', 'sex', 'race', 'marital', 'age', 'place_birth', 'work', 'business', 'owned_rented']

1940 Census Columns: ['pk', 'ed', 'house_num', 'street_name', 'last_name', 'first_name', 'relation_head', 'head_last', 'head_first', 'sex', 'race', 'marital', 'age', 'place_birth', 'work', 'business', 'owned_rented']

1950 Census Columns: ['pk', 'ed', 'house_num', 'build_num', 'street_name', 'last_name', 'first_name', 'relation_head', 'head_last', 'head_first'

In [13]:
def standardize_columns(df, year):
    df = df.copy()
    
    # Add census year
    df['census_year'] = year
    
    # Rename columns to match database schema
    column_mapping = {
        'pk': 'source_pk',
        'first_name': 'first_name',
        'last_name': 'last_name',
        'dwelling': 'dwelling',
        'family': 'family',
        'relation_head': 'relation_to_hoh',
        'head_first': 'hoh_first_name',
        'head_last': 'hoh_last_name',
        'sex': 'sex',
        'race': 'race',
        'age': 'age',
        'marital': 'marital_status',
        'work': 'work',
        'business': 'business',
        'owned_rented': 'owned_rented'
    }
    
    # Rename existing columns
    df = df.rename(columns={k: v for k, v in column_mapping.items() if k in df.columns})
    
    # Add missing columns with null values
    for col in column_mapping.values():
        if col not in df.columns:
            df[col] = None
            
    return df

In [15]:
# ... existing code ...
df_1900 = standardize_columns(df_1900, 1900)
df_1920 = standardize_columns(df_1920, 1920)
df_1930 = standardize_columns(df_1930, 1930)
df_1940 = standardize_columns(df_1940, 1940)
df_1950 = standardize_columns(df_1950, 1950)
df_1965 = standardize_columns(df_1965, 1965)

In [16]:
combined_df = pd.concat([
    df_1900, df_1920, df_1930, df_1940, df_1950, df_1965
], ignore_index=True)

In [17]:
race_mapping = {
    'W': 'White',
    'B': 'Black',
    'N': 'Black',
    'Neg': 'Black'
}
combined_df['race'] = combined_df['race'].map(race_mapping).fillna(combined_df['race'])

In [18]:
marital_mapping = {
    'M': 'Married',
    'S': 'Single',
    'W': 'Widowed',
    'Wd': 'Widowed',
    'D': 'Divorced',
    'Mar': 'Married',
    'Sep': 'Separated',
    'Nev': 'Never Married'
}
combined_df['marital_status'] = combined_df['marital_status'].map(marital_mapping).fillna(combined_df['marital_status'])

In [19]:
final_columns = [
    'source_pk', 'census_year', 'person_id', 'record_id', 'attribute_id', 'family_id',
    'first_name', 'last_name', 'dwelling', 'family', 'relation_to_hoh',
    'hoh_first_name', 'hoh_last_name', 'sex', 'race', 'age',
    'marital_status', 'work', 'business', 'owned_rented'
]

In [21]:
# Add the missing ID columns
combined_df['person_id'] = np.arange(len(combined_df))
combined_df['record_id'] = np.arange(len(combined_df))
combined_df['attribute_id'] = np.arange(len(combined_df))
combined_df['family_id'] = combined_df.groupby(['census_year', 'dwelling', 'family']).ngroup()

# Now select the final columns
final_df = combined_df[final_columns]
final_df

In [22]:
final_df

Unnamed: 0,source_pk,census_year,person_id,record_id,attribute_id,family_id,first_name,last_name,dwelling,family,relation_to_hoh,hoh_first_name,hoh_last_name,sex,race,age,marital_status,work,business,owned_rented
0,1,1900,0,0,0,0.0,Ezra,Vanvalkenburg,32,32.0,Head,Ezra,Vanvalkenburg,M,White,58,Married,Merchant,,O
1,2,1900,1,1,1,0.0,Hattie,Vanvalkenburg,32,32.0,Wife,Ezra,Vanvalkenburg,F,White,56,Married,,,
2,3,1900,2,2,2,0.0,William,Vanvalkenburg,32,32.0,Son,Ezra,Vanvalkenburg,M,White,22,Single,Bakes Bread,,
3,4,1900,3,3,3,0.0,Jay,Vanvalkenburg,32,32.0,Son,Ezra,Vanvalkenburg,M,White,20,Single,Baker,,
4,5,1900,4,4,4,0.0,Jessie,Vanvalkenburg,32,32.0,Son,Ezra,Vanvalkenburg,F,White,15,Single,At School,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,184,1965,980,980,980,,James A.,Weems,,,,,,,,,,,,
981,185,1965,981,981,981,,James W.,Weems,,,,,,,,,,,,
982,186,1965,982,982,982,,Mary E.,Weems,,,,,,,,,,,,
983,187,1965,983,983,983,,Mary M.,Weems,,,,,,,,,,,,


In [23]:
final_df.to_csv('lakeland_combined_census.csv', index=False)

In [24]:
combined_df = pd.read_csv('data/lakeland_combined_census.csv')
combined_df

Unnamed: 0,source_pk,census_year,person_id,record_id,attribute_id,family_id,first_name,last_name,dwelling,family,relation_to_hoh,hoh_first_name,hoh_last_name,sex,race,age,marital_status,work,business,owned_rented
0,1,1900,0,0,0,0.0,Ezra,Vanvalkenburg,32.0,32.0,Head,Ezra,Vanvalkenburg,M,White,58,Married,Merchant,,O
1,2,1900,1,1,1,0.0,Hattie,Vanvalkenburg,32.0,32.0,Wife,Ezra,Vanvalkenburg,F,White,56,Married,,,
2,3,1900,2,2,2,0.0,William,Vanvalkenburg,32.0,32.0,Son,Ezra,Vanvalkenburg,M,White,22,Single,Bakes Bread,,
3,4,1900,3,3,3,0.0,Jay,Vanvalkenburg,32.0,32.0,Son,Ezra,Vanvalkenburg,M,White,20,Single,Baker,,
4,5,1900,4,4,4,0.0,Jessie,Vanvalkenburg,32.0,32.0,Son,Ezra,Vanvalkenburg,F,White,15,Single,At School,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
980,184,1965,980,980,980,,James A.,Weems,,,,,,,,,,,,
981,185,1965,981,981,981,,James W.,Weems,,,,,,,,,,,,
982,186,1965,982,982,982,,Mary E.,Weems,,,,,,,,,,,,
983,187,1965,983,983,983,,Mary M.,Weems,,,,,,,,,,,,


In [25]:
# Find duplicates based on first and last name
duplicates = combined_df[combined_df.duplicated(subset=['first_name', 'last_name'], keep=False)]

# Sort by first and last name to group duplicates together
duplicates = duplicates.sort_values(['first_name', 'last_name'])

duplicates


Unnamed: 0,source_pk,census_year,person_id,record_id,attribute_id,family_id,first_name,last_name,dwelling,family,relation_to_hoh,hoh_first_name,hoh_last_name,sex,race,age,marital_status,work,business,owned_rented
475,54,1940,475,475,475,,Agnes,Forrest,,,Head,Agnes,Forrest,F,Black,63,Widowed,Domestic,Private Home,O
633,151,1950,633,633,633,,Agnes,Forrest,,,Grandmother,Thomas,Randall,F,Black,75,Widowed,,,
537,55,1950,537,537,537,,Agnes,Gross,,,Wife,George H,Gross,F,Black,42,Married,Manager Of Cafeteria,Elementary School,
882,86,1965,882,882,882,,Agnes,Gross,,,,,,,,,,,,
308,123,1920,308,308,308,,Alice,Thomas,,104.0,Daughter,Ellen M,Lee,F,Black,11,Single,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,109,1950,591,591,591,,William,Gross,,,Head,William,Gross,M,Black,72,Widowed,,,
418,47,1930,418,418,418,,William P,Brown,,,Nephew In Law,,,M,Black,22,Married,Chauffeur,Private Family,
420,49,1930,420,420,420,,William P,Brown,,,Grand Nephew,,,M,Black,4/12,Single,,,
531,49,1950,531,531,531,,Willie,Laney,,,Head,Willie,Laney,M,Black,41,Married,Carpenter Worker,Private Construction Company,


In [26]:

# Convert age to numeric, setting non-numeric values to NaN
duplicates['age'] = pd.to_numeric(duplicates['age'], errors='coerce')
duplicates


Unnamed: 0,source_pk,census_year,person_id,record_id,attribute_id,family_id,first_name,last_name,dwelling,family,relation_to_hoh,hoh_first_name,hoh_last_name,sex,race,age,marital_status,work,business,owned_rented
475,54,1940,475,475,475,,Agnes,Forrest,,,Head,Agnes,Forrest,F,Black,63.0,Widowed,Domestic,Private Home,O
633,151,1950,633,633,633,,Agnes,Forrest,,,Grandmother,Thomas,Randall,F,Black,75.0,Widowed,,,
537,55,1950,537,537,537,,Agnes,Gross,,,Wife,George H,Gross,F,Black,42.0,Married,Manager Of Cafeteria,Elementary School,
882,86,1965,882,882,882,,Agnes,Gross,,,,,,,,,,,,
308,123,1920,308,308,308,,Alice,Thomas,,104.0,Daughter,Ellen M,Lee,F,Black,11.0,Single,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
591,109,1950,591,591,591,,William,Gross,,,Head,William,Gross,M,Black,72.0,Widowed,,,
418,47,1930,418,418,418,,William P,Brown,,,Nephew In Law,,,M,Black,22.0,Married,Chauffeur,Private Family,
420,49,1930,420,420,420,,William P,Brown,,,Grand Nephew,,,M,Black,,Single,,,
531,49,1950,531,531,531,,Willie,Laney,,,Head,Willie,Laney,M,Black,41.0,Married,Carpenter Worker,Private Construction Company,


In [27]:
duplicates.to_csv('lakeland_duplicates.csv', index=False)


In [30]:
duplicates
# Find duplicates where first name, last name AND age are the same
exact_duplicates = duplicates[duplicates.duplicated(subset=['first_name', 'last_name', 'age'], keep=False)]

# Sort by first and last name to group duplicates together
exact_duplicates = exact_duplicates.sort_values(['first_name', 'last_name', 'age'])

exact_duplicates



Unnamed: 0,source_pk,census_year,person_id,record_id,attribute_id,family_id,first_name,last_name,dwelling,family,relation_to_hoh,hoh_first_name,hoh_last_name,sex,race,age,marital_status,work,business,owned_rented
