# ACS Data Cleaning and Exploration

## Importing Required Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pandas_gbq
from google.cloud import bigquery
from varname import nameof
%load_ext google.cloud.bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '../zori-data-extr-be793d5c3325.json'

# Set your default project here
pandas_gbq.context.project = 'bigquery-public-data'
pandas_gbq.context.dialect = 'standard'

## Using Google BigQuery to Download ACS Data

In [2]:
%%bigquery --use_rest_api ACS_2018
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr`

Query complete after 0.03s: 100%|██████████| 2/2 [00:00<00:00, 463.48query/s]                         
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 5694.27rows/s]


In [3]:
%%bigquery --use_rest_api ACS_2017
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2017_5yr`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 471.48query/s]                         
Downloading: 100%|██████████| 33120/33120 [00:09<00:00, 3658.63rows/s]


In [4]:
%%bigquery --use_rest_api ACS_2016
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2016_5yr`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 518.01query/s]                         
Downloading: 100%|██████████| 33120/33120 [00:06<00:00, 5361.62rows/s]


In [5]:
%%bigquery --use_rest_api ACS_2015
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2015_5yr`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 486.10query/s]                         
Downloading: 100%|██████████| 33120/33120 [00:10<00:00, 3151.05rows/s]


In [6]:
%%bigquery --use_rest_api ACS_2014
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2014_5yr`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 530.92query/s]                         
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 5996.65rows/s] 


## ACS Data Cleaning and Pre-Processing

In [7]:
pd.set_option('display.max_columns', None)
#Looking at the shape of each dataframe
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 252)
(33120, 252)
(33120, 247)
(33120, 252) 



Based on the data shown above, a number of columns are missing from the 2018 data set. Since the 2018 data set is the latest, all additional columns seen in prior year surveys will be dropped for consistency.

In [8]:
def clean_columns(df_year1, df_year2):
    """
    This function cleans the second dataframe to only include columns of the first dataframe.
    
    Arguments:
    
    df_year1 : (Pandas dataframe) dataframe containing the columns of interest
    df_year2 : (Pandas dataframe) dataframe containing the columns of interest + additional
    
    Returns
    df_year2 : (Pandas dataframe) modified version of the df_year2 dataframe from inputs
    
    """
    diff = np.setdiff1d(df_year2.columns, df_year1.columns)
    print(f'Columns from {nameof(df_year2)} non included in {nameof(df_year1)}:\n\nTotal of {len(diff)}\n\n{diff}\n\n')
    print('Removing columns...')
    
    #Dropping the columns from df_year2 that are not present in the df_year1
    df_year2 = df_year2.drop(columns = diff, inplace = True)
    
    print('Process complete\n\n')
    
    return df_year2

In [9]:
#Using the above defined function to clean columns
clean_columns(ACS_2018, ACS_2017)
clean_columns(ACS_2018, ACS_2016)
clean_columns(ACS_2018, ACS_2015)
clean_columns(ACS_2018, ACS_2014)

Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 13

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins'

In [10]:
#Looking at the shape of each dataframe to confirm column cleaning has been done appropriately
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 240)
(33120, 240)
(33120, 234)
(33120, 240) 



In [11]:
diff = np.setdiff1d(ACS_2018.columns, ACS_2015.columns)
diff

array(['pop_15_and_over', 'pop_divorced', 'pop_never_married',
       'pop_now_married', 'pop_separated', 'pop_widowed'], dtype=object)

As can be seen above for year 2015 there are less columns than in any other year. Before dropping the columns in all other survey years, looking at missingness of those 6 columns would be best.

In [206]:
#Adding a year column to each survey dataframe
ACS_2018['year'] = 2018
ACS_2017['year'] = 2017
ACS_2016['year'] = 2016
ACS_2015['year'] = 2015
ACS_2014['year'] = 2014

#Combining all ACS dataframes to one global ACS dataframe
ACS = pd.concat([ACS_2018, ACS_2017, ACS_2016, ACS_2015, ACS_2014], ignore_index = True)
ACS = ACS.reset_index().drop(columns = 'index')

### Keeping the Focus on Top 10 Zillow Metro Areas

The project is looking at predicting 2019 data as a base model (no Covid-19 effect) by using zip codes from the top 10 metro areas in the currently available ZRI index data. 

In [207]:
#Reading in the Focus Zipcode file
focus_zip = pd.read_csv('../Data/focus_zipcode.csv')

#Creating a list of pertinent zip codes in str format
focus_zip_l = focus_zip.RegionName.apply(lambda x: str(x)).to_list()

#Keeping rows that have focus zipcodes in created ACS dataframes
ACS = ACS.loc[ACS['geo_id'].isin(focus_zip_l)]

### Dropping Income Columns

Partly Parrots will be looking at IRS data for everything income related, as such all income related columns will be dropped.

In [208]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'income')))]

### Looking at Missingness

In [209]:
def missingness(df):
    """
    This function looks at the number of missing values in a dataframe
    
    Arguments:
    
    df          : (Pandas dataframe) dataframe of interest
    
    Returns
    
    missing_col : (dictionary) column name and number of missing values 
    
    """
    all_cols_miss = df.isna().mean().sort_values(ascending = False)

    missing_col_list = []
    missing_col = {}

    for i in range(all_cols_miss[all_cols_miss > 0].shape[0]):
        missing_col_list.append([all_cols_miss[all_cols_miss > 0].index[i], \
                                round(all_cols_miss[all_cols_miss > 0][i]*100, 2)])

    missing_col.update(missing_col_list)
    return missing_col

In [210]:
#Looking at missing values in data
missingness(ACS)

{'pop_never_married': 80.0,
 'pop_now_married': 80.0,
 'pop_separated': 80.0,
 'pop_widowed': 80.0,
 'pop_divorced': 80.0,
 'pop_15_and_over': 80.0,
 'pop_5_years_over': 60.0,
 'speak_spanish_at_home': 60.0,
 'speak_only_english_at_home': 60.0,
 'speak_spanish_at_home_low_english': 60.0,
 'aggregate_travel_time_to_work': 1.36,
 'median_rent': 0.46,
 'renter_occupied_housing_units_paying_cash_median_gross_rent': 0.4,
 'owner_occupied_housing_units_lower_value_quartile': 0.14,
 'median_year_structure_built': 0.1,
 'owner_occupied_housing_units_median_value': 0.07,
 'owner_occupied_housing_units_upper_value_quartile': 0.06}

##### Taking a closer look into the high missingness columns<br>
##### Marital Status

In [238]:
ACS.loc[ACS['pop_now_married'].isna()]

Unnamed: 0,geo_id,do_date,total_pop,households,median_age,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,gini_index,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,median_rent,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,year,male_x_age,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,female_pop_check,female_x_age,female_under_18,female_18_to_24,female_25_to_49,female_50_to_66,female_above_67
539,11959,2014-01-01,784,327,51.9,697,774,774,,,628,593,367,,,,,,0,18,30,33,0,0,6,33,751,11,0,0,1,0,0,96,40,784,66,0.5107,1571,1630,776000,1.0469e+06,2e+06,327,76,1244,0,0,1535,0,18,0,18,0,0,0,0,0,30,40,1975,212,82,245,10,0,0,167,29,1400,0,19,0,21,0,0,0,22,11,3,251,56,125,9,0,55,39,39,0,0,0,16,6,6,34,33,33,40,8,16,8,42,50,264,0,66,10,10,62,174,38,43,,11,2,249,13,0,236,9,39,174,60,25,161,47,10,63,174,269,235,111,11,56,57,5,2,10,7,148,336,0,336,292,330,0,336,4,14,11,71,74,11,15,11,5,31,56,19,14,179,15,23,106,13,179,106,0,70,34,194,32,,,,2018,23.0,65,24,71,109,71,421.0,0.0,102.0,0.0,87.0,116.0,86.0
866,48320,2014-01-01,4839,2406,41.5,4264,4802,4703,,,4107,3696,3013,,,,,,111,201,44,295,5,10,20,295,4544,0,0,18,0,0,5,295,346,4828,559,0.4864,2591,935,102100,175500,269500,2406,767,185,26,44,2004,44,15,148,55,191,16,54,64,73,0,178,1959,855,1216,1190,138,6,0,834,223,808,33,127,72,26,82,106,55,148,104,14,1639,16,1159,578,99,264,217,163,54,0,0,47,0,0,196,400,283,385,275,426,175,205,285,2630,81,110,59,102,1185,797,243,79,76080,0,0,2497,92,0,2405,0,210,889,622,215,475,842,230,736,889,1204,574,1267,16,163,103,8,9,119,246,664,2777,167,2944,1163,2740,0,2944,10,343,168,533,210,64,266,158,158,408,324,45,90,1266,158,255,620,478,1266,620,145,155,202,829,181,,,,2018,163.0,417,208,872,469,258,2452.0,0.0,417.0,101.0,977.0,442.0,318.0
877,19074,2014-01-01,5890,2055,39.8,5175,5785,5652,,,4939,4038,3356,,,,,,49,100,269,104,1,0,241,104,5786,30,0,39,0,0,0,359,389,5853,362,0.3378,2238,958,140800,165600,197000,2055,501,183,59,21,1125,640,143,92,37,166,35,0,0,0,0,400,1951,975,517,1538,187,0,0,1191,430,844,28,53,42,58,31,75,54,120,33,7,1554,0,1088,245,81,290,81,67,7,0,7,209,17,17,335,382,485,402,138,495,439,185,343,3204,106,33,65,133,659,844,301,118,84650,278,92,2786,424,0,2362,5,405,535,1685,270,159,517,241,1813,535,690,257,1192,70,121,20,13,59,388,150,821,3290,294,3584,1355,3237,0,3584,0,340,380,889,213,38,219,178,108,326,293,260,46,1039,323,445,880,603,1039,880,126,320,360,1273,316,,,,2018,161.0,611,268,1009,617,165,3059.0,0.0,580.0,393.0,983.0,610.0,287.0
881,48215,2014-01-01,11979,4895,39.3,858,11779,11330,,,9101,7837,6062,,,,,,37,10740,26,163,0,0,192,163,11816,0,0,649,726,53,0,80,166,11917,5009,0.5847,7379,743,30000,56400,118500,4895,2769,2484,131,0,4037,1610,179,144,88,261,245,815,0,26,154,1013,1948,837,2471,2424,2242,1,3,3189,2143,567,378,955,230,176,215,225,176,163,172,79,2126,18,748,232,1020,1097,54,0,27,11,16,1043,110,62,301,241,579,830,328,460,242,267,282,3530,111,189,548,1582,2070,961,221,61,98185,257,257,3075,571,0,2504,119,428,661,2228,331,421,1648,1587,2701,661,904,481,2407,60,94,127,87,363,599,359,1689,3746,606,4352,4749,3719,0,4352,0,594,109,1062,212,93,409,146,164,273,336,336,12,852,155,789,725,1225,852,725,759,594,643,2652,384,,,,2018,406.0,1886,414,1559,1011,551,6152.0,0.0,1303.0,539.0,1813.0,1042.0,963.0
892,60457,2014-01-01,14018,4957,39.2,10845,13859,13724,,,11247,9609,7436,,,,,,1295,436,87,2515,23,0,102,2515,11503,15,9,34,62,171,93,929,674,13946,1286,0.4274,5480,942,165400,223700,281200,4957,1381,523,127,41,3459,96,22,129,737,766,249,22,0,0,0,123,1969,2753,1330,3627,467,9,0,3202,361,859,70,338,138,137,145,93,133,99,194,34,3576,0,2114,869,47,963,747,511,198,38,0,216,117,76,277,658,709,1083,301,1041,533,996,714,6312,4,107,22,191,1451,2101,806,408,207890,305,0,5966,387,108,5579,88,895,1687,2703,712,570,1412,1223,2958,1687,1969,722,3019,195,413,107,97,163,521,508,2004,6711,457,7168,4079,6419,0,7168,0,461,502,1037,503,106,1089,439,193,768,806,643,164,1945,897,1177,1575,1117,1945,1575,694,797,824,3857,850,,,,2018,384.0,1539,824,2492,1256,724,6799.0,0.0,1663.0,383.0,2135.0,1180.0,1018.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131167,11960,20112015,597.000000000,301.000000000,58.600000000,552.000000000,597.000000000,597.000000000,597.000000000,,569.000000000,540.000000000,321.000000000,,,,,,9.000000000,0E-9,0E-9,45.000000000,0E-9,0E-9,0E-9,45.000000000,552.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,49.000000000,56.000000000,597.000000000,20.000000000,0.371600000,973.000000000,,525900.000000000,718400.000000000,1012500.000000000,301.000000000,33.000000000,672.000000000,8.000000000,38.000000000,973.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,37.000000000,1975.000000000,182.000000000,89.000000000,212.000000000,0E-9,0E-9,4.000000000,32.000000000,15.000000000,,9.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,12.000000000,0E-9,12.000000000,268.000000000,66.000000000,83.000000000,11.000000000,3.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,102.000000000,56.000000000,35.000000000,12.000000000,20.000000000,44.000000000,0E-9,42.000000000,23.000000000,334.000000000,28.000000000,13.000000000,0E-9,0E-9,97.000000000,155.000000000,37.000000000,12.000000000,,15.000000000,0E-9,291.000000000,31.000000000,0E-9,260.000000000,0E-9,69.000000000,122.000000000,76.000000000,24.000000000,136.000000000,65.000000000,0E-9,89.000000000,122.000000000,208.000000000,171.000000000,158.000000000,12.000000000,55.000000000,20.000000000,0E-9,0E-9,18.000000000,0E-9,105.000000000,352.000000000,20.000000000,372.000000000,197.000000000,347.000000000,0E-9,372.000000000,0E-9,31.000000000,33.000000000,85.000000000,6.000000000,12.000000000,40.000000000,9.000000000,9.000000000,15.000000000,95.000000000,17.000000000,0E-9,230.000000000,39.000000000,28.000000000,40.000000000,15.000000000,230.000000000,40.000000000,7.000000000,3.000000000,22.000000000,53.000000000,21.000000000,567.000000000,4.000000000,0E-9,2015,42.0,27,0,62,44,104,318.0,0.0,5.0,25.0,83.0,72.0,104.0
131346,18435,20112015,660.000000000,284.000000000,50.700000000,554.000000000,660.000000000,654.000000000,654.000000000,,567.000000000,480.000000000,291.000000000,,,,,,27.000000000,0E-9,0E-9,77.000000000,0E-9,29.000000000,0E-9,77.000000000,583.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,32.000000000,63.000000000,650.000000000,12.000000000,0.407700000,1253.000000000,,201700.000000000,249600.000000000,351900.000000000,284.000000000,27.000000000,969.000000000,0E-9,0E-9,1248.000000000,5.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,1989.000000000,177.000000000,107.000000000,177.000000000,50.000000000,0E-9,0E-9,121.000000000,0E-9,,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,27.000000000,0E-9,0E-9,0E-9,257.000000000,0E-9,175.000000000,30.000000000,0E-9,16.000000000,16.000000000,0E-9,0E-9,16.000000000,0E-9,0E-9,0E-9,0E-9,139.000000000,46.000000000,16.000000000,40.000000000,0E-9,11.000000000,5.000000000,48.000000000,22.000000000,327.000000000,0E-9,39.000000000,0E-9,0E-9,81.000000000,83.000000000,87.000000000,33.000000000,,0E-9,0E-9,327.000000000,49.000000000,0E-9,278.000000000,0E-9,72.000000000,81.000000000,65.000000000,72.000000000,48.000000000,66.000000000,51.000000000,83.000000000,81.000000000,94.000000000,55.000000000,210.000000000,0E-9,25.000000000,17.000000000,0E-9,0E-9,24.000000000,37.000000000,103.000000000,376.000000000,6.000000000,382.000000000,185.000000000,366.000000000,0E-9,382.000000000,0E-9,65.000000000,116.000000000,50.000000000,0E-9,63.000000000,0E-9,0E-9,4.000000000,31.000000000,47.000000000,0E-9,0E-9,141.000000000,100.000000000,12.000000000,41.000000000,82.000000000,141.000000000,41.000000000,35.000000000,26.000000000,44.000000000,115.000000000,0E-9,609.000000000,7.000000000,0E-9,2015,12.0,86,37,60,86,58,321.0,0.0,35.0,22.0,76.0,75.0,79.0
131572,77362,20112015,5012.000000000,1500.000000000,33.500000000,3474.000000000,4943.000000000,4943.000000000,4721.000000000,,3675.000000000,2828.000000000,2303.000000000,,,,,,274.000000000,6.000000000,0E-9,1341.000000000,16.000000000,0E-9,175.000000000,1341.000000000,3671.000000000,0E-9,0E-9,0E-9,0E-9,30.000000000,0E-9,429.000000000,262.000000000,4993.000000000,1055.000000000,0.359700000,1553.000000000,,100400.000000000,131600.000000000,175300.000000000,1500.000000000,196.000000000,53.000000000,53.000000000,0E-9,1208.000000000,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,0E-9,345.000000000,0E-9,93.000000000,0E-9,2000.000000000,1093.000000000,322.000000000,1178.000000000,274.000000000,0E-9,0E-9,1700.000000000,22.000000000,,39.000000000,0E-9,35.000000000,0E-9,53.000000000,0E-9,0E-9,0E-9,69.000000000,0E-9,1304.000000000,0E-9,964.000000000,282.000000000,256.000000000,315.000000000,251.000000000,44.000000000,207.000000000,0E-9,0E-9,64.000000000,19.000000000,19.000000000,147.000000000,168.000000000,328.000000000,153.000000000,208.000000000,343.000000000,115.000000000,512.000000000,170.000000000,2144.000000000,16.000000000,19.000000000,103.000000000,127.000000000,464.000000000,481.000000000,372.000000000,56.000000000,76345.000000000,0E-9,0E-9,2027.000000000,70.000000000,0E-9,1957.000000000,19.000000000,219.000000000,299.000000000,840.000000000,391.000000000,39.000000000,414.000000000,459.000000000,987.000000000,299.000000000,232.000000000,59.000000000,1024.000000000,51.000000000,56.000000000,42.000000000,30.000000000,118.000000000,352.000000000,125.000000000,774.000000000,2163.000000000,54.000000000,2217.000000000,1458.000000000,2163.000000000,0E-9,2217.000000000,195.000000000,0E-9,187.000000000,596.000000000,0E-9,62.000000000,198.000000000,82.000000000,0E-9,195.000000000,153.000000000,402.000000000,93.000000000,569.000000000,290.000000000,664.000000000,418.000000000,222.000000000,569.000000000,418.000000000,229.000000000,535.000000000,625.000000000,1840.000000000,369.000000000,3954.000000000,731.000000000,242.000000000,2015,149.0,885,276,717,415,237,2333.0,0.0,815.0,208.0,781.0,199.0,206.0
131775,10930,20112015,9021.000000000,2797.000000000,39.700000000,7080.000000000,8958.000000000,8653.000000000,8371.000000000,,6967.000000000,5759.000000000,4673.000000000,,,,,,604.000000000,317.000000000,370.000000000,1171.000000000,0E-9,41.000000000,42.000000000,1171.000000000,7850.000000000,21.000000000,44.000000000,74.000000000,8.000000000,224.000000000,2.000000000,727.000000000,419.000000000,9021.000000000,438.000000000,0.332900000,3104.000000000,,249600.000000000,321000.000000000,399100.000000000,2797.000000000,235.000000000,307.000000000,26.000000000,0E-9,2317.000000000,422.000000000,99.000000000,0E-9,229.000000000,0E-9,0E-9,7.000000000,30.000000000,0E-9,14.000000000,68.000000000,1979.000000000,1952.000000000,544.000000000,2253.000000000,84.000000000,9.000000000,3.000000000,2447.000000000,274.000000000,1401.000000000,46.000000000,21.000000000,0E-9,24.000000000,0E-9,35.000000000,22.000000000,19.000000000,0E-9,68.000000000,2562.000000000,52.000000000,1882.000000000,257.000000000,7.000000000,906.000000000,879.000000000,619.000000000,251.000000000,9.000000000,0E-9,27.000000000,0E-9,0E-9,350.000000000,497.000000000,356.000000000,306.000000000,229.000000000,661.000000000,293.000000000,1113.000000000,546.000000000,4351.000000000,14.000000000,284.000000000,89.000000000,82.000000000,524.000000000,1375.000000000,468.000000000,348.000000000,163840.000000000,275.000000000,139.000000000,4027.000000000,362.000000000,0E-9,3665.000000000,17.000000000,680.000000000,1383.000000000,966.000000000,399.000000000,894.000000000,717.000000000,316.000000000,1011.000000000,1383.000000000,2414.000000000,1253.000000000,1796.000000000,180.000000000,356.000000000,362.000000000,9.000000000,9.000000000,216.000000000,372.000000000,1504.000000000,4650.000000000,339.000000000,5024.000000000,1943.000000000,4635.000000000,35.000000000,4989.000000000,8.000000000,305.000000000,285.000000000,1034.000000000,353.000000000,133.000000000,247.000000000,193.000000000,168.000000000,711.000000000,612.000000000,410.000000000,191.000000000,1896.000000000,272.000000000,446.000000000,1530.000000000,506.000000000,1896.000000000,1530.000000000,461.000000000,470.000000000,703.000000000,2657.000000000,505.000000000,6520.000000000,752.000000000,105.000000000,2015,185.0,1425,279,1420,867,422,4423.0,0.0,1022.0,536.0,1385.0,867.0,479.0


Based on the results above, it seems like the data pertaining to marital status was either not reported by the individuals completing the survey or was not collected by the survey department. With an 80% reported missingness, with no way of comfortably imputing, the columns pertaining to marital status will be dropped.

In [240]:
#Dropping marital status columns
ACS = ACS.drop(columns = ['pop_never_married', 'pop_now_married', 'pop_separated', 'pop_widowed', 'pop_divorced'])

### Population Age Groups

To further reduce the number of columns in the ACS data set, columns reporting various population groups will be further grouped, with the total values removed from the data set.<br><br>
Before doing so, total values will be checked with the "dummified" groupings.

#### Checking Male Population Grouping and Totals

In [212]:
ACS['male_pop'] = ACS['male_pop'].astype(float)

#Creating a list of all columns containing the word 'male'
male_cols = [x for x in ACS.columns if x.startswith('male_')]

#Creating a list of male population grouped by age
male_ages = male_cols[1:male_cols.index('male_male_households')]

#Converting male population grouped by age columns to int 
ACS[male_ages] = ACS[male_ages].astype(int)
ACS['male_pop_check'] = ACS[male_ages].sum(axis = 1)
ACS['male_pop_check'] = ACS['male_pop_check'].astype(float)

#Comparing total male population from ACS with sum of male population grouped by age
print(f'Total rows in ACS: {ACS.shape[0]}')
print(f'Total rows with unmatched male population: {ACS.loc[ACS1.male_pop != ACS.male_pop_check].shape[0]}\n\n')

#Creating an 'unknown' male age column to later drop the total male population column
ACS['male_x_age'] = ACS['male_pop'] - ACS['male_pop_check']
ACS['male_x_age'] = ACS['male_x_age'].astype(float)

#Reporting percentage of males that haven't reported their age
perc_male_x_age = round(ACS['male_x_age'].sum() / ACS['male_pop'].sum() *100,2)
print(f'Total percentage of males that didn\'t have an age group: {perc_male_x_age}%')

Total rows in ACS: 11165
Total rows with unmatched male population: 11164


Total percentage of males that didn't report their age: 5.24%


Age groups will further be combined into the following:<br>
- Males under 18
- Males from 18 to 24
- Males from 25 to 49
- Males from 50 to 66
- Males above 67

In [216]:
#Combining age groups
ACS['male_under_18'] = ACS['male_under_5'] + ACS['male_5_to_9'] + ACS['male_10_to_14'] + ACS['male_15_to_17']
ACS['male_18_to_24'] = ACS['male_18_to_19'] + ACS['male_20'] + ACS['male_21'] + ACS['male_22_to_24']
ACS['male_25_to_49'] = ACS['male_25_to_29'] + ACS['male_30_to_34'] + ACS['male_35_to_39'] + ACS['male_40_to_44'] +\
                        ACS['male_45_to_49']
ACS['male_50_to_66'] = ACS['male_50_to_54'] + ACS['male_55_to_59'] + ACS['male_65_to_66']
ACS['male_above_67'] = ACS['male_67_to_69'] + ACS['male_70_to_74'] + ACS['male_75_to_79'] + ACS['male_80_to_84'] +\
                        ACS['male_85_and_over']

In [218]:
#Dropping all other columns related to male population by age group
ACS = ACS.drop(columns = male_ages)
ACS = ACS.drop(columns = ['male_pop_check', 'male_pop'])

#### Checking Female Population Grouping and Totals

In [230]:
ACS['female_pop'] = ACS['female_pop'].astype(float)

#Creating a list of all columns containing the word 'male'
female_cols = [x for x in ACS.columns if x.startswith('female_')]

#Creating a list of male population grouped by age
female_ages = female_cols[1:female_cols.index('female_female_households')]

#Converting male population grouped by age columns to int 
ACS[female_ages] = ACS[female_ages].astype(float)
ACS['female_pop_check'] = ACS[female_ages].sum(axis = 1)
ACS['female_pop_check'] = ACS['female_pop_check'].astype(float)

#Comparing total male population from ACS with sum of male population grouped by age
print(f'Total rows in ACS: {ACS.shape[0]}')
print(f'Total rows with unmatched female population: {ACS.loc[ACS.female_pop != ACS.female_pop_check].shape[0]}\n\n')

#Creating an 'unknown' female age column to later drop the total male population column
ACS['female_x_age'] = ACS['female_pop'] - ACS['female_pop_check']
ACS['female_x_age'] = ACS['female_x_age'].astype(float)

#Reporting percentage of males that haven't reported their age
perc_female_x_age = round(ACS['female_x_age'].sum() / ACS['female_pop'].sum() *100,2)
print(f'Total percentage of females that didn\'t have an age group: {perc_female_x_age}%')

Total rows in ACS: 11165
Total rows with unmatched female population: 0


Total percentage of females that didn't have an age group: 0.0%


Seems like women always have a reported age! Very different from the male population.<br><br>
Age groups will further be combined into the following:<br>
- Females under 18
- Females from 18 to 24
- Females from 25 to 49
- Females from 50 to 66
- Females above 67

In [233]:
#Combining age groups
ACS['female_under_18'] = ACS['female_under_5'] + ACS['female_5_to_9'] + ACS['female_10_to_14'] + ACS['female_15_to_17']
ACS['female_18_to_24'] = ACS['female_18_to_19'] + ACS['female_20'] + ACS['female_21'] + ACS['female_22_to_24']
ACS['female_25_to_49'] = ACS['female_25_to_29'] + ACS['female_30_to_34'] + ACS['female_35_to_39'] + ACS['female_40_to_44'] +\
                        ACS['female_45_to_49']
ACS['female_50_to_66'] = ACS['female_50_to_54'] + ACS['female_55_to_59'] + ACS['female_65_to_66']
ACS['female_above_67'] = ACS['female_67_to_69'] + ACS['female_70_to_74'] + ACS['female_75_to_79'] + ACS['female_80_to_84'] +\
                        ACS['female_85_and_over']

In [234]:
#Dropping all other columns related to male population by age group
ACS = ACS.drop(columns = female_ages)
ACS = ACS.drop(columns = 'female_pop')