# ACS Data Cleaning - All Zip Codes

## Importing Required Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pandas_gbq
import statistics
from google.cloud import bigquery
from varname import nameof
from collections import Counter
%load_ext google.cloud.bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '../zori-data-extr-be793d5c3325.json'

# Set your default project here
pandas_gbq.context.project = 'bigquery-public-data'
pandas_gbq.context.dialect = 'standard'

## Using Google BigQuery to Download ACS Data

In [2]:
%%bigquery --use_rest_api ACS_2018
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr`

Query complete after 0.05s: 100%|██████████| 1/1 [00:00<00:00, 215.06query/s]
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 6603.19rows/s] 


In [3]:
%%bigquery --use_rest_api ACS_2017
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2017_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 178.25query/s] 
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 6481.99rows/s] 


In [4]:
%%bigquery --use_rest_api ACS_2016
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2016_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 353.00query/s]
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 6518.59rows/s] 


In [5]:
%%bigquery --use_rest_api ACS_2015
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2015_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 586.21query/s] 
Downloading: 100%|██████████| 33120/33120 [00:08<00:00, 3856.02rows/s]


In [6]:
%%bigquery --use_rest_api ACS_2014
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2014_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 234.41query/s]
Downloading: 100%|██████████| 33120/33120 [00:05<00:00, 6277.52rows/s] 


## ACS Data Pre-Processing

In [7]:
pd.set_option('display.max_columns', None)
#Looking at the shape of each dataframe
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 252)
(33120, 252)
(33120, 247)
(33120, 252) 



Based on the data shown above, a number of columns are missing from the 2018 data set. Since the 2018 data set is the latest, all additional columns seen in prior year surveys will be dropped for consistency.

In [8]:
def clean_columns(df_year1, df_year2):
    """
    This function cleans the second dataframe to only include columns of the first dataframe.
    
    Arguments
    
    df_year1 : (Pandas dataframe) dataframe containing the columns of interest
    df_year2 : (Pandas dataframe) dataframe containing the columns of interest + additional
    
    Returns
    
    df_year2 : (Pandas dataframe) modified version of the df_year2 dataframe from inputs
    
    """
    diff = np.setdiff1d(df_year2.columns, df_year1.columns)
    print(f'Columns from {nameof(df_year2)} non included in {nameof(df_year1)}:\n\nTotal of {len(diff)}\n\n{diff}\n\n')
    print('Removing columns...')
    
    #Dropping the columns from df_year2 that are not present in the df_year1
    df_year2 = df_year2.drop(columns = diff, inplace = True)
    
    print('Process complete\n\n')
    
    return df_year2

In [9]:
#Using the above defined function to clean columns
clean_columns(ACS_2018, ACS_2017)
clean_columns(ACS_2018, ACS_2016)
clean_columns(ACS_2018, ACS_2015)
clean_columns(ACS_2018, ACS_2014)

Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 13

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins'

In [10]:
#Looking at the shape of each dataframe to confirm column cleaning has been done appropriately
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 240)
(33120, 240)
(33120, 234)
(33120, 240) 



In [11]:
diff = np.setdiff1d(ACS_2018.columns, ACS_2015.columns)
diff

array(['pop_15_and_over', 'pop_divorced', 'pop_never_married',
       'pop_now_married', 'pop_separated', 'pop_widowed'], dtype=object)

As can be seen above for year 2015 there are less columns than in any other year. Before dropping the columns in all other survey years, looking at missingness of those 6 columns would be best.

In [12]:
#Adding a year column to each survey dataframe
ACS_2018['year'] = 2018
ACS_2017['year'] = 2017
ACS_2016['year'] = 2016
ACS_2015['year'] = 2015
ACS_2014['year'] = 2014

#Combining all ACS dataframes to one global ACS dataframe
ACS = pd.concat([ACS_2018, ACS_2017, ACS_2016, ACS_2015, ACS_2014], ignore_index = True)
ACS.reset_index().drop(columns = 'index', inplace = True)

In [13]:
ACS.shape

(165600, 241)

### ACS Operational Data

Columns such as *gini_index* and *do_date* are ACS operational columns that are not required for this project. As such, they will be dropped.

In [14]:
#Dropping ACS operational columns
ACS.drop(columns = ['gini_index', 'do_date'], inplace = True)

### Income Columns

Partly Parrots will be looking at IRS data for everything income related, as such all income related columns will be dropped.

In [15]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'income')))]

In [16]:
ACS.shape

(165600, 220)

### Looking at Missingness

In [17]:
def missingness(df):
    """
    This function looks at the number of missing values in a dataframe
    
    Arguments
    
    df          : (Pandas dataframe) dataframe of interest
    
    Returns
    
    missing_col : (dictionary) column name and number of missing values 
    
    """
    all_cols_miss = df.isna().mean().sort_values(ascending = False)

    missing_col_list = []
    missing_col = {}

    for i in range(all_cols_miss[all_cols_miss > 0].shape[0]):
        missing_col_list.append([all_cols_miss[all_cols_miss > 0].index[i], \
                                round(all_cols_miss[all_cols_miss > 0][i]*100, 2)])

    missing_col.update(missing_col_list)
    return missing_col

In [24]:
#Looking at missing values in data
m1 = missingness(ACS)
m1

{'pop_15_and_over': 80.0,
 'pop_never_married': 80.0,
 'pop_now_married': 80.0,
 'pop_separated': 80.0,
 'pop_widowed': 80.0,
 'pop_divorced': 80.0,
 'speak_spanish_at_home': 59.99,
 'speak_only_english_at_home': 59.99,
 'pop_5_years_over': 59.99,
 'speak_spanish_at_home_low_english': 59.99,
 'aggregate_travel_time_to_work': 38.96,
 'median_rent': 17.76,
 'renter_occupied_housing_units_paying_cash_median_gross_rent': 17.31,
 'owner_occupied_housing_units_lower_value_quartile': 8.03,
 'owner_occupied_housing_units_upper_value_quartile': 6.78,
 'owner_occupied_housing_units_median_value': 6.76,
 'median_year_structure_built': 2.76,
 'median_age': 1.61,
 'not_us_citizen_pop': 0.4,
 'some_college_and_associates_degree': 0.4,
 'graduate_professional_degree': 0.4,
 'population_1_year_and_over': 0.4,
 'bachelors_degree_2': 0.4,
 'high_school_including_ged': 0.4,
 'less_than_high_school_graduate': 0.4,
 'different_house_year_ago_different_city': 0.4,
 'different_house_year_ago_same_city': 0.4}

A lot more missing values, although luckily for us, most of the values are less than 1% missingness. Below we will check whether all of the rows with 0.02% missingness are repeated in the same spots.

In [26]:
m1.median_age.value()

AttributeError: 'dict' object has no attribute 'median_age'

In [None]:
def get_keys(dictionary, value):
    keys = []
    items = dictionary.items()
    
    for item  in items:
        if item[1] == value:
            keys.append(item[0])
            
    return  keys

In [20]:
ACS.loc[ACS['commuters_16_over'].isna()].shape

Unnamed: 0,geo_id,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,male_22_to_24,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_61,female_62_to_64,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,median_rent,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,year
0,87537,2510,856,1283,1227,42.1,58,75,87,52,13,7,10,16,38,84,9,61,43,133,15,55,40,40,69,8,25,115,229,93,29,0,0,18,56,19,133,100,46,9,60,42,58,51,17,75,25,37,4,11,329,,2361,,,,1652,,,,,,,82,6,0,2134,32,0,9,2134,376,0,0,0,0,133,355,40,5,,,1443,623.0,104700.0,144200.0,246900.0,856,188,587,86,2,599,3,3,36,0,0,0,0,802,0,2,55,1979.0,288,326,530,,1,0,738,150,428.0,,,,,,,,,,,668,3,349,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,143,168,198,438,62,307,,,,,,,0,52,0,41,7,37,399,536,,,,,,,,,,,,,,,,,,,,,,,,,,,,304,171,80,593,20,,,,2018
1,87017,346,112,150,196,50.9,0,0,12,10,9,0,0,0,0,14,0,0,35,0,30,10,0,0,0,0,4,0,0,15,5,32,0,0,8,0,0,11,14,0,9,23,0,9,0,0,0,0,70,0,33,,346,,,,255,,,,,,,61,0,52,261,0,0,0,261,85,4,0,0,0,31,56,0,0,,,240,1045.0,114200.0,170800.0,199700.0,112,23,128,6,5,111,0,0,0,0,0,0,0,129,0,0,0,1976.0,45,36,76,,0,0,42,20,,,,,,,,,,,,89,0,31,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,20,161,28,6,0,,,,,,,0,6,0,0,11,60,14,91,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,27,15,96,40,,,,2018
2,87528,3505,727,1758,1747,27.9,260,194,159,121,38,5,0,112,202,56,106,66,37,111,89,12,11,85,10,9,0,151,165,189,82,45,1,57,44,106,110,103,35,173,181,104,22,41,21,44,55,13,5,0,84,,3311,,,,1882,,,,,,,34,16,0,775,2566,10,54,775,2730,0,0,0,7,10,37,0,8,,,1024,501.0,43800.0,92700.0,163800.0,727,227,297,18,0,684,2,64,19,5,0,0,0,250,23,26,0,1984.0,236,274,453,,0,0,1321,568,364.0,,,,,,,,,,,500,0,66,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,41,142,115,678,86,78,511,,,,,,,22,37,0,3,32,149,69,312,,,,,,,,,,,,,,,,,,,,,,,,,,,,293,266,298,1237,138,,,,2018
3,87533,133,58,49,84,25.8,0,12,8,4,8,0,0,0,3,3,3,5,0,0,3,0,0,0,0,0,0,6,22,0,3,0,0,0,2,2,6,14,9,0,0,7,0,3,3,0,0,0,3,4,7,,133,,,,68,,,,,,,0,0,0,126,0,0,0,126,7,0,0,0,0,0,3,0,0,,,68,738.0,13400.0,17900.0,,58,19,10,0,0,8,0,0,0,0,0,0,0,60,0,0,2,1993.0,8,36,22,,0,0,55,37,347.0,,,,,,,,,,,39,0,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,6,9,12,15,3,13,,,,,,,0,3,0,0,0,0,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,24,8,7,58,3,,,,2018
4,87511,2896,787,1177,1719,36.0,142,63,44,123,0,92,0,0,28,2,49,122,36,196,44,5,64,29,0,48,0,135,255,65,98,3,128,2,125,52,86,52,77,46,90,259,0,20,68,29,5,4,120,0,324,,2756,,,,1621,,,,,,,102,37,0,2532,0,0,0,2532,364,0,0,0,0,200,134,32,0,,,1295,540.0,81400.0,134800.0,172100.0,787,227,508,0,46,445,4,0,0,0,0,0,0,846,0,48,4,1990.0,440,155,632,,0,0,925,269,,,,,,,,,,,,560,0,335,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7,165,112,374,313,60,428,,,,,,,50,45,0,0,49,141,81,366,,,,,,,,,,,,,,,,,,,,,,,,,,,,263,107,163,1008,279,,,,2018
5,87520,1237,518,633,604,56.1,30,71,47,22,29,0,0,0,0,19,11,15,87,26,51,16,36,24,42,4,11,0,30,0,22,0,0,0,0,0,117,23,0,14,23,133,33,60,22,27,24,36,21,19,299,,1207,,,,986,,,,,,,0,0,0,863,61,14,0,863,374,0,0,0,0,90,34,23,99,,,1275,720.0,132400.0,240000.0,328600.0,518,66,757,9,38,892,15,9,52,82,0,0,0,225,45,10,32,1982.0,214,246,272,,0,0,222,62,518.0,,,,,,,,,,,452,11,205,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,73,161,254,114,154,65,,,,,,,10,66,45,0,23,112,0,256,,,,,,,,,,,,,,,,,,,,,,,,,,,,83,33,73,230,0,,,,2018
6,87532,18756,6429,9280,9476,39.3,602,704,660,347,319,227,101,282,510,606,465,588,422,624,783,273,224,340,228,213,161,875,552,592,329,313,77,100,234,528,430,638,439,455,659,638,278,481,391,314,380,239,248,286,2605,,17844,,,,12442,,,,,,,839,104,164,14465,1272,39,107,14465,4291,6,22,0,8,824,1072,145,193,,,8522,687.0,109400.0,174500.0,260900.0,6429,2127,2093,266,95,4685,97,273,123,135,15,66,14,3095,30,91,329,1981.0,2427,2626,3803,,0,19,4661,2192,536.0,,,,,,,,,,,4302,14,1775,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,96,1097,1587,3145,921,755,2170,,,,,,,212,264,167,188,281,757,561,2430,,,,,,,,,,,,,,,,,,,,,,,,,,,,1018,1011,877,4517,882,,,,2018
7,87013,5350,1542,2698,2652,36.5,154,288,296,124,72,42,38,102,151,182,125,238,158,178,167,43,33,69,71,82,10,167,172,144,167,51,29,31,79,194,119,172,210,137,142,164,71,50,49,80,158,217,23,26,270,,5154,,,,3394,,,,,,,7,0,0,762,4270,0,48,762,4588,0,0,0,0,66,73,19,6,,,2181,554.0,17500.0,63300.0,145900.0,1542,465,639,27,28,1449,41,32,7,0,0,0,0,650,5,22,27,1985.0,640,479,1063,,0,3,1512,501,378.0,,,,,,,,,,,1077,0,148,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15,157,163,951,68,80,432,,,,,,,20,36,24,40,186,188,84,578,,,,,,,,,,,,,,,,,,,,,,,,,,,,416,355,372,1341,63,,,,2018
8,87521,835,370,382,453,48.5,7,12,7,40,3,0,0,4,6,13,14,13,44,49,37,3,36,12,11,5,51,7,17,8,33,0,5,0,7,13,18,7,109,38,22,39,2,5,41,6,8,53,5,10,17,,824,,,,685,,,,,,,2,0,0,770,47,0,0,770,65,0,0,0,0,82,49,0,3,,,507,713.0,,,173100.0,370,57,137,0,0,334,0,1,1,0,0,0,0,171,4,0,47,1969.0,108,182,188,,0,0,131,27,456.0,,,,,,,,,,,313,0,70,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,50,24,145,21,18,143,,,,,,,7,6,3,0,19,65,45,145,,,,,,,,,,,,,,,,,,,,,,,,,,,,19,14,77,124,1,,,,2018
9,87566,3612,806,1988,1624,34.6,115,92,143,63,39,33,17,131,187,257,97,146,87,207,152,10,55,31,21,19,32,74,93,98,84,56,17,9,94,132,107,59,62,151,130,76,54,51,22,50,42,50,82,31,252,,3522,,,,2454,,,,,,,79,47,17,1891,1346,0,57,1891,1721,1,0,0,16,188,95,20,4,,,1179,650.0,114200.0,236900.0,2000000.0,806,93,373,9,52,737,18,0,29,6,1,0,0,388,0,12,49,1978.0,281,324,482,,0,8,762,248,500.0,,,,,,,,,,,713,0,125,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,339,116,140,846,178,80,597,,,,,,,15,8,18,13,58,225,163,500,,,,,,,,,,,,,,,,,,,,,,,,,,,,165,173,204,776,106,,,,2018


In [21]:
def clean_missingness_rows(col_names, df):
    """
    This function drops rows with NaN values from a specific list of columns
    
    Arguments
    
    col_names : (list) list of columns with riws containing NaN values
    df        : (Pandas dataframe) dataframe of interest
    
    Returns
    
    df        : (Pandas dataframe) cleaned version of inputted dataframe
    
    """
    df.dropna(axis = 0, subset = col_names, inplace = True)
    for col in df[col_names]:
        missing_rows = col.count('Nan')
        if missing_rows > 0:
            raise ValueError('Cleaning incomplete!')
        else:
            print(f'Rows with missingness in {col}: {missing_rows}\nCleaning complete.\n\n')

In [22]:
clean_missingness_rows(['commute_45_59_mins'], ACS)

Rows with missingness in commute_45_59_mins: 0
Cleaning complete.




In [23]:
missingness(ACS)

{'pop_15_and_over': 80.0,
 'pop_never_married': 80.0,
 'pop_now_married': 80.0,
 'pop_separated': 80.0,
 'pop_widowed': 80.0,
 'pop_divorced': 80.0,
 'speak_spanish_at_home': 59.99,
 'speak_only_english_at_home': 59.99,
 'pop_5_years_over': 59.99,
 'speak_spanish_at_home_low_english': 59.99,
 'aggregate_travel_time_to_work': 38.96,
 'median_rent': 17.76,
 'renter_occupied_housing_units_paying_cash_median_gross_rent': 17.31,
 'owner_occupied_housing_units_lower_value_quartile': 8.03,
 'owner_occupied_housing_units_upper_value_quartile': 6.78,
 'owner_occupied_housing_units_median_value': 6.76,
 'median_year_structure_built': 2.76,
 'median_age': 1.61,
 'not_us_citizen_pop': 0.4,
 'some_college_and_associates_degree': 0.4,
 'graduate_professional_degree': 0.4,
 'population_1_year_and_over': 0.4,
 'bachelors_degree_2': 0.4,
 'high_school_including_ged': 0.4,
 'less_than_high_school_graduate': 0.4,
 'different_house_year_ago_different_city': 0.4,
 'different_house_year_ago_same_city': 0.4}

##### Taking a closer look into low missingness rows<br>
##### Aggregate Travel Time to Work

*Typically during census data collection, questions might change from year-to-year, leading to missingness in data througohut a longer time period (years). As such, checking missingness by year is crucial to have a better understanding of the data collection process. As the percentage for missing data in aggregate travel time to work is small, this check will not be required (less than 10% of a years worth of questions).*

In [None]:
ACS.loc[ACS['aggregate_travel_time_to_work'].isna()].head(3)

In [None]:
ACS.loc[~ACS['aggregate_travel_time_to_work'].isna()].head(3)

Looking at how the aggregate time to work is calculated, the missing values in this column can be imputed by multiplying the number of commuters in a certain time frame with the average commute time.

In [None]:
commute_l = ACS.filter(regex = 'commute_').columns.to_list()

#Extracting time segments from column titles
commute_times = list(map(lambda x: x.split(sep = '_'), commute_l))

#Calculating average commute times for the time segments
commute_times = list(map(lambda y: statistics.mean(y), 
                         list(map(lambda x: [int(word) for word in x if word.isdigit()], commute_times))))

print(f'Average commute times: {commute_times}')

#Creating a dataframe focusing on missing data in aggregate travel time to work
commute_df = ACS.loc[ACS['aggregate_travel_time_to_work'].isna()][commute_l].astype(float)

#Only looking at commute time columns
commute_df.iloc[:, :] *= commute_times

#Calculating the aggregate time to work
commute_df['aggregate_travel_time_to_work'] = commute_df.sum(axis = 1)

#Dropping all columns but aggregate time to work
commute_df.drop(columns = commute_l, inplace = True)

#Replacing missing values with imputed values
ACS.loc[ACS['aggregate_travel_time_to_work'].isna(), 'aggregate_travel_time_to_work'] = commute_df

missing_rows = ACS.loc[ACS['aggregate_travel_time_to_work'].isna()].shape[0]

#Checking for missingness
print(f'Rows with missingness in agg travel time to work: {missing_rows}')

#Resetting index of dataframe
ACS.reset_index(inplace = True)
ACS.drop(columns = 'index', inplace = True)

##### Median Rent

*Checking in what year missingness occurred will not be required for this portion as the percentage of missingness is very small.*