# ACS Data Cleaning - All Zip Codes

## Importing Required Libraries

In [1]:
import os
import numpy as np
import pandas as pd
import pandas_gbq
import statistics
from google.cloud import bigquery
from varname import nameof
from collections import Counter
%load_ext google.cloud.bigquery

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '../zori-data-extr-be793d5c3325.json'

# Set your default project here
pandas_gbq.context.project = 'bigquery-public-data'
pandas_gbq.context.dialect = 'standard'

## Using Google BigQuery to Download ACS Data

In [2]:
%%bigquery --use_rest_api ACS_2018
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2018_5yr`

Query complete after 0.03s: 100%|██████████| 1/1 [00:00<00:00, 458.74query/s]
Downloading: 100%|██████████| 33120/33120 [00:06<00:00, 4825.32rows/s]


In [3]:
%%bigquery --use_rest_api ACS_2017
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2017_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 406.50query/s]
Downloading: 100%|██████████| 33120/33120 [00:08<00:00, 4019.30rows/s]


In [4]:
%%bigquery --use_rest_api ACS_2016
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2016_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 262.28query/s]
Downloading: 100%|██████████| 33120/33120 [00:06<00:00, 5207.00rows/s]


In [5]:
%%bigquery --use_rest_api ACS_2015
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2015_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 178.44query/s]
Downloading: 100%|██████████| 33120/33120 [00:13<00:00, 2438.86rows/s]


In [6]:
%%bigquery --use_rest_api ACS_2014
SELECT *
FROM `bigquery-public-data.census_bureau_acs.zip_codes_2014_5yr`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 84.40query/s] 
Downloading: 100%|██████████| 33120/33120 [00:07<00:00, 4285.71rows/s]


## ACS Data Pre-Processing

In [7]:
pd.set_option('display.max_columns', None)
#Looking at the shape of each dataframe
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 252)
(33120, 252)
(33120, 247)
(33120, 252) 



Based on the data shown above, a number of columns are missing from the 2018 data set. Since the 2018 data set is the latest, all additional columns seen in prior year surveys will be dropped for consistency.

In [8]:
def clean_columns(df_year1, df_year2):
    """
    This function cleans the second dataframe to only include columns of the first dataframe.
    
    Arguments
    
    df_year1 : (Pandas dataframe) dataframe containing the columns of interest
    df_year2 : (Pandas dataframe) dataframe containing the columns of interest + additional
    
    Returns
    
    df_year2 : (Pandas dataframe) modified version of the df_year2 dataframe from inputs
    
    """
    diff = np.setdiff1d(df_year2.columns, df_year1.columns)
    print(f'Columns from {nameof(df_year2)} non included in {nameof(df_year1)}:\n\nTotal of {len(diff)}\n\n{diff}\n\n')
    print('Removing columns...')
    
    #Dropping the columns from df_year2 that are not present in the df_year1
    df_year2 = df_year2.drop(columns = diff, inplace = True)
    
    print('Process complete\n\n')
    
    return df_year2

In [9]:
#Using the above defined function to clean columns
clean_columns(ACS_2018, ACS_2017)
clean_columns(ACS_2018, ACS_2016)
clean_columns(ACS_2018, ACS_2015)
clean_columns(ACS_2018, ACS_2014)

Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 12

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins' 'commute_90_more_mins'
 'households_retirement_income' 'male_60_61' 'male_62_64'
 'white_including_hispanic']


Removing columns...
Process complete


Columns from df_year2 non included in df_year1:

Total of 13

['amerindian_including_hispanic' 'asian_including_hispanic'
 'black_including_hispanic' 'commute_35_39_mins' 'commute_40_44_mins'
 'commute_5_9_mins' 'commute_60_89_mins'

In [10]:
#Looking at the shape of each dataframe to confirm column cleaning has been done appropriately
print(ACS_2018.shape)
print(ACS_2017.shape)
print(ACS_2016.shape)
print(ACS_2015.shape)
print(ACS_2014.shape, '\n')

(33120, 240)
(33120, 240)
(33120, 240)
(33120, 234)
(33120, 240) 



In [11]:
diff = np.setdiff1d(ACS_2018.columns, ACS_2015.columns)
diff

array(['pop_15_and_over', 'pop_divorced', 'pop_never_married',
       'pop_now_married', 'pop_separated', 'pop_widowed'], dtype=object)

As can be seen above for year 2015 there are less columns than in any other year. Before dropping the columns in all other survey years, looking at missingness of those 6 columns would be best.

In [12]:
#Adding a year column to each survey dataframe
ACS_2018['year'] = 2018
ACS_2017['year'] = 2017
ACS_2016['year'] = 2016
ACS_2015['year'] = 2015
ACS_2014['year'] = 2014

#Combining all ACS dataframes to one global ACS dataframe
ACS = pd.concat([ACS_2018, ACS_2017, ACS_2016, ACS_2015, ACS_2014], ignore_index = True)
ACS.reset_index().drop(columns = 'index', inplace = True)

In [13]:
ACS.shape

(165600, 241)

### ACS Operational Data

Columns such as *gini_index* and *do_date* are ACS operational columns that are not required for this project. As such, they will be dropped.

In [14]:
#Dropping ACS operational columns
ACS.drop(columns = ['gini_index', 'do_date'], inplace = True)

### Income Columns

Partly Parrots will be looking at IRS data for everything income related, as such all income related columns will be dropped.

In [15]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'income')))]

In [16]:
ACS.shape

(165600, 220)

### Looking at Missingness

In [17]:
def missingness(df):
    """
    This function looks at the number of missing values in a dataframe
    
    Arguments
    
    df          : (Pandas dataframe) dataframe of interest
    
    Returns
    
    missing_col : (dictionary) column name and number of missing values 
    
    """
    all_cols_miss = df.isna().mean().sort_values(ascending = False)

    missing_col_list = []
    missing_col = {}

    for i in range(all_cols_miss[all_cols_miss > 0].shape[0]):
        missing_col_list.append([all_cols_miss[all_cols_miss > 0].index[i], \
                                round(all_cols_miss[all_cols_miss > 0][i]*100, 2)])

    missing_col.update(missing_col_list)
    return missing_col

In [18]:
#Looking at missing values in data
missingness(ACS)

{'pop_divorced': 80.0,
 'pop_never_married': 80.0,
 'pop_15_and_over': 80.0,
 'pop_widowed': 80.0,
 'pop_separated': 80.0,
 'pop_now_married': 80.0,
 'speak_spanish_at_home_low_english': 60.0,
 'pop_5_years_over': 60.0,
 'speak_only_english_at_home': 60.0,
 'speak_spanish_at_home': 60.0,
 'aggregate_travel_time_to_work': 38.97,
 'median_rent': 17.78,
 'renter_occupied_housing_units_paying_cash_median_gross_rent': 17.32,
 'owner_occupied_housing_units_lower_value_quartile': 8.03,
 'owner_occupied_housing_units_upper_value_quartile': 6.78,
 'owner_occupied_housing_units_median_value': 6.76,
 'median_year_structure_built': 2.76,
 'median_age': 1.61,
 'graduate_professional_degree': 0.42,
 'different_house_year_ago_different_city': 0.42,
 'bachelors_degree_2': 0.42,
 'high_school_including_ged': 0.42,
 'less_than_high_school_graduate': 0.42,
 'different_house_year_ago_same_city': 0.42,
 'some_college_and_associates_degree': 0.42,
 'population_1_year_and_over': 0.42,
 'not_us_citizen_pop': 

A lot more missing values, although luckily for us, most of the values are less than 1% missingness. Below we will check whether all of the rows with 0.02% missingness are repeated in the same spots.

In [19]:
def get_keys(dictionary, val):
    """
    This function gets the keys with the same specified value
    
    Arguments
    
    dictionary : (dict) dictionary with total missingness % as values and column names as keys - use
                        previously defined missingness function for this
    val        : (str)  value of interest
    
    Returns
    
    keys       : (list) list of keys with the same specified value
    
    """    
    keys = []
    items = dictionary.items()
    
    for item  in items:
        if item[1] == val:
            keys.append(item[0])
            
    return  keys

In [20]:
def check_miss_rows(dictionary, row):
    """
    This function checks for missingness in rows with the exact same missingness as the requested row
    
    Arguments
    
    dictionary : (dict) dictionary with total missingness % as values and column names as keys - use
                        previously defined missingness function for this
    row        : (str) row name of interest - must be in quotations!
    
    Other Functions Used - THESE MUST BE RUN PRIOR TO RUNNING THIS FUNCTION
    missingness
    get_keys
    
    """
    val = missingness(dictionary).get(row)
    keys = get_keys(missingness(dictionary), val)
    
    #Checking missingness percentage in other columns
    print(f'Number of columns with the same missingness percentage as {row}: {len(keys)}\n')
    
    #Selecting the keys from the dataframe and checking the unique values of missingness
    print(f'Unique values of missingess for selected columns: {ACS[keys].isna().sum().unique()}\n')
    
    #Checking whether the missingness occurs in the same rows
    ACS[keys].isna().sum(axis = 1).unique()
    print(f'Each row has this count of missing values: {ACS[keys].isna().sum(axis = 1).unique()}')

The same number of rows at the same index are affected by missingness. As such, these rows will be dropped.

In [21]:
def clean_missingness_rows(col_names, df):
    """
    This function drops rows with NaN values from a specific list of columns
    
    Arguments
    
    col_names : (list) list of columns with riws containing NaN values
    df        : (Pandas dataframe) dataframe of interest
    
    Returns
    
    df        : (Pandas dataframe) cleaned version of inputted dataframe
    
    """
    df.dropna(axis = 0, subset = col_names, inplace = True)
    for col in df[col_names]:
        missing_rows = col.count('Nan')
        if missing_rows > 0:
            raise ValueError('Cleaning incomplete!')
        else:
            print(f'Rows with missingness in {col}: {missing_rows}\nCleaning complete.\n\n')

In [22]:
clean_missingness_rows(['commuters_16_over'], ACS)

Rows with missingness in commuters_16_over: 0
Cleaning complete.




In [23]:
check_miss_rows(ACS, 'not_us_citizen_pop')

Number of columns with the same missingness percentage as not_us_citizen_pop: 9

Unique values of missingess for selected columns: [655]

Each row has this count of missing values: [0 9]


The same number of rows at the same index are affected by missingness. As such, these rows will be dropped.

In [24]:
clean_missingness_rows(['not_us_citizen_pop'], ACS)

Rows with missingness in not_us_citizen_pop: 0
Cleaning complete.




##### Taking a closer look into low missingness rows<br>
##### Aggregate Travel Time to Work

*Typically during census data collection, questions might change from year-to-year, leading to missingness in data througohut a longer time period (years). As such, checking missingness by year is crucial to have a better understanding of the data collection process. As the percentage for missing data in aggregate travel time to work is small, this check will not be required (less than 10% of a years worth of questions).*

In [25]:
ACS.loc[ACS['aggregate_travel_time_to_work'].isna()].head(3)

Unnamed: 0,geo_id,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,male_22_to_24,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_61,female_62_to_64,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,median_rent,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,year
10,55111,20,20,15,5,62.5,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,5,20,20,,,20,20,15,,,,,,0,15,0,0,0,0,0,0,20,0,0,0,5,0,0,5,0,20,5,20,625,,,,20,20,0,0,0,0,0,0,0,0,5,0,15,0,15,0,0,18,0,20,0,5,0,0,0,0,575,5,10,0,0,5,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,15,0,5,0,0,,0,0,0,0,0,0,0,10,0,0,0,0,10,0,0,0,0,0,20,5,0,0,0,0,0,5,10,0,0,0,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,2018
11,96759,360,118,204,156,75.6,0,0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,44,0,39,39,43,0,0,38,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,37,39,42,0,0,360,360,,,283,283,0,,,,,,85,0,161,0,0,0,199,0,360,0,0,0,0,0,0,0,0,360,85,118,1109,,,,118,118,0,0,0,81,0,37,0,0,0,0,0,0,0,0,0,0,118,0,118,0,0,0,77,0,944,0,42,0,37,0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,42,39,37,0,0,,0,0,0,0,0,0,0,0,0,78,0,0,0,205,78,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,283,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,77,0,77,0,,,,2018
12,40041,385,282,84,301,81.2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,15,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,22,57,74,32,111,366,385,385,,,385,385,17,,,,,,0,9,0,0,0,0,10,0,385,0,0,0,4,0,0,0,8,311,65,321,1150,,,371100.0,282,216,39,29,0,17,0,0,0,13,91,102,81,17,11,73,0,0,28,254,28,9,0,0,0,0,942,0,135,25,13,15,0,0,0,11,17,66,0,0,73,63,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,92,173,17,0,0,,0,0,0,0,0,0,74,15,83,91,32,45,85,23,91,83,8,56,132,0,0,8,0,0,0,4,12,9,0,9,376,0,0,9,0,0,0,9,0,0,0,0,0,0,0,0,0,9,0,0,0,0,9,0,0,0,0,0,0,,,,2018


In [26]:
ACS.loc[~ACS['aggregate_travel_time_to_work'].isna()].head(3)

Unnamed: 0,geo_id,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,male_22_to_24,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_61,female_62_to_64,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,median_rent,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,year
17,99638,26,12,15,11,33.5,2,1,2,0,0,0,0,0,4,1,0,0,0,0,0,0,1,2,2,0,0,0,1,0,1,0,0,0,1,0,1,2,0,0,1,4,0,0,0,0,0,0,0,0,6,26,26,,,19,18,13,,,,,,0,0,0,0,17,0,3,0,26,0,0,0,0,0,0,0,0,26,4,20,950,,20000.0,27500.0,12,6,8,0,0,20,0,0,0,0,0,0,0,0,0,0,0,1984,8,4,8,1,0,0,7,0,,2,1,0,0,0,2,0,0,1,0,6,0,3,5,0,2,2,0,2,0,0,0,0,0,10,0,0,0,0,0,0,0,0,10,4,2,9,10,2,0,0,0,40,0,0,1,0,0,1,0,0,2,14,0,0,0,2,14,2,2,0,0,0,0,0,0,0,0,0,0,12,0,12,7,12,0,12,2,1,0,1,0,0,0,0,8,0,0,0,0,4,3,2,1,2,4,1,1,3,1,6,1,,,,2018
130,98286,185,90,100,85,63.1,1,1,9,5,0,0,0,7,4,4,2,0,6,3,1,3,7,14,8,2,8,0,0,0,5,0,2,0,3,2,2,0,2,3,4,11,7,2,12,15,8,2,0,5,170,185,184,,,171,152,68,,,,,,2,8,4,1,0,0,2,1,184,3,0,0,0,0,0,6,16,185,8,243,820,465900.0,741100.0,1187500.0,90,21,153,4,1,239,0,0,0,0,0,0,0,4,1,4,9,1980,52,34,56,2,0,0,21,2,760.0,7,5,1,0,0,0,3,0,3,2,69,14,27,18,0,1,1,1,0,0,0,0,0,0,11,11,6,0,0,0,0,5,6,39,6,19,1,3,33,31,10,13,1040,6,0,25,5,0,20,9,8,32,8,6,30,34,7,8,32,36,57,48,0,2,13,0,2,2,6,25,60,0,60,111,58,0,60,3,2,10,8,9,0,5,3,6,3,7,4,0,31,6,3,9,11,31,9,2,10,8,26,6,,,,2018
133,31547,1420,101,1181,239,21.1,22,19,20,0,182,303,224,254,87,35,23,12,0,0,0,0,0,0,0,0,0,55,57,32,0,0,0,0,7,26,16,43,3,0,0,0,0,0,0,0,0,0,0,0,1143,1396,1364,,,1215,245,245,,,,,,26,97,0,166,0,0,14,166,1254,0,0,0,0,0,0,0,0,395,10,151,1411,,,,101,101,50,50,0,54,21,16,44,16,0,0,0,0,0,8,0,1987,74,6,95,0,0,0,205,0,1337.0,0,10,5,0,16,6,45,19,0,0,0,0,0,614,0,93,93,20,28,0,45,0,0,0,311,412,96,241,0,0,0,0,0,1060,853,0,0,0,6,90,5,0,11720,0,0,207,15,0,192,1025,6,53,55,23,18,66,24,55,53,71,18,95,0,0,0,0,0,0,0,0,49,13,1104,111,1060,1042,62,0,6,0,12,6,0,0,0,15,0,0,0,10,27,0,10,6,6,27,6,61,33,19,170,5,,,,2018


Looking at how the aggregate time to work is calculated, the missing values in this column can be imputed by multiplying the number of commuters in a certain time frame with the average commute time.

In [27]:
commute_l = ACS.filter(regex = 'commute_').columns.to_list()

#Extracting time segments from column titles
commute_times = list(map(lambda x: x.split(sep = '_'), commute_l))

#Calculating average commute times for the time segments
commute_times = list(map(lambda y: statistics.mean(y), 
                         list(map(lambda x: [int(word) for word in x if word.isdigit()], commute_times))))

print(f'Average commute times: {commute_times}')

#Creating a dataframe focusing on missing data in aggregate travel time to work
commute_df = ACS.loc[ACS['aggregate_travel_time_to_work'].isna()][commute_l].astype(float)

#Only looking at commute time columns
commute_df.iloc[:, :] *= commute_times

#Calculating the aggregate time to work
commute_df['aggregate_travel_time_to_work'] = commute_df.sum(axis = 1)

#Dropping all columns but aggregate time to work
commute_df.drop(columns = commute_l, inplace = True)

#Replacing missing values with imputed values
ACS.loc[ACS['aggregate_travel_time_to_work'].isna(), 'aggregate_travel_time_to_work'] = commute_df

missing_rows = ACS.loc[ACS['aggregate_travel_time_to_work'].isna()].shape[0]

#Checking for missingness
print(f'Rows with missingness in agg travel time to work: {missing_rows}')

#Resetting index of dataframe
ACS.reset_index(inplace = True)
ACS.drop(columns = 'index', inplace = True)

Average commute times: [10, 12, 17, 22, 27, 32, 39.5, 60, 52]
Rows with missingness in agg travel time to work: 0


##### Median Rent

*Checking in what year missingness occurred will not be required for this portion as the percentage of missingness is very small.* <br>
Since rent data will be coming from Zillow, this column will be dropped.

In [28]:
ACS.drop(columns = 'median_rent', inplace = True)

##### Other

*Checking in what year missingness occurred will not be required for this portion as the percentage of missingness is very small.*<br><br>
For columns *renter_occupied_housing_units_paying_cash_median_gross_rent*, *owner_occupied_housing_units_lower_value_quartile*, *median_year_structure_built*, *owner_occupied_housing_units_median_value*, and *owner_occupied_housing_units_upper_value_quartile* it will not be feasible to impute these values as they are single value and not derived from other columns based on Partly Parrots's analysis. As such, these rows will be dropped.

In [29]:
clean_missingness_rows(['renter_occupied_housing_units_paying_cash_median_gross_rent',
                              'owner_occupied_housing_units_lower_value_quartile', 
                              'median_year_structure_built', 'owner_occupied_housing_units_median_value',
                              'owner_occupied_housing_units_upper_value_quartile'], ACS)

Rows with missingness in renter_occupied_housing_units_paying_cash_median_gross_rent: 0
Cleaning complete.


Rows with missingness in owner_occupied_housing_units_lower_value_quartile: 0
Cleaning complete.


Rows with missingness in median_year_structure_built: 0
Cleaning complete.


Rows with missingness in owner_occupied_housing_units_median_value: 0
Cleaning complete.


Rows with missingness in owner_occupied_housing_units_upper_value_quartile: 0
Cleaning complete.




##### Taking a closer look into the high missingness columns<br>
##### Marital Status

In [30]:
ACS.loc[ACS['pop_now_married'].isna()]

Unnamed: 0,geo_id,total_pop,households,male_pop,female_pop,median_age,male_under_5,male_5_to_9,male_10_to_14,male_15_to_17,male_18_to_19,male_20,male_21,male_22_to_24,male_25_to_29,male_30_to_34,male_35_to_39,male_40_to_44,male_45_to_49,male_50_to_54,male_55_to_59,male_65_to_66,male_67_to_69,male_70_to_74,male_75_to_79,male_80_to_84,male_85_and_over,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_61,female_62_to_64,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,year
4,51341,25,21,19,6,60.6,0,0,0,0,0,0,0,0,0,0,0,0,0,7,3,1,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,1,25,25,25,,,25,25,18,,,,,,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,4,0,0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,0,0,2,0,0,0,4,0,0,1,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,12,1,0,1,1,7,6,0,7,152,0,0,11,0,0,11,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,12,0,12,13,12,0,12,0,0,0,0,0,0,0,5,0,0,5,2,0,1,2,0,5,4,1,5,0,0,0,0,0,,,,2018
5,38578,158,62,56,102,79.3,0,0,0,0,0,0,0,2,0,0,0,0,5,0,3,0,10,3,2,20,8,6,0,0,0,2,0,0,0,0,0,0,0,0,9,5,0,0,0,13,13,6,19,29,158,158,152,,,152,148,25,,,,,,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,38,0,21,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,6,0,0,3,0,0,5,5,0,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,12,0,0,0,5,30,27,0,0,204,0,0,12,0,0,12,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,12,0,12,140,12,0,12,0,0,0,2,0,0,1,2,0,0,0,5,2,2,0,8,0,2,2,0,0,0,0,0,0,,,,2018
8,88353,152,84,43,109,68.3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,7,14,5,0,6,0,0,0,0,0,0,0,0,0,30,0,0,0,10,17,2,0,4,17,8,8,13,0,30,152,152,,,152,152,65,,,,,,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,204,6,0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,0,23,0,5,0,5,0,5,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,12,0,2,0,5,37,42,0,0,389,0,0,12,0,0,12,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,14,0,14,138,14,0,14,2,7,0,0,0,0,0,5,0,0,0,0,0,9,0,0,0,5,9,0,0,0,0,0,0,,,,2018
9,89039,140,101,62,78,70.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,11,10,14,0,4,0,0,0,0,0,0,0,0,5,0,0,0,0,0,3,5,3,0,12,30,7,9,4,131,140,140,,,140,140,32,,,,,,0,4,0,5,0,0,0,5,135,0,0,0,0,0,0,0,16,140,25,169,633,66500,85000,139900,101,12,68,0,7,42,0,0,0,0,0,0,0,127,0,0,0,1987,14,77,24,5,0,0,0,0,0,0,0,0,4,0,8,0,0,0,89,0,16,5,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,4,0,9,0,0,0,4,34,53,3,7,300,0,0,9,0,0,9,0,5,0,48,5,12,43,14,61,0,12,12,53,0,0,12,0,0,4,0,16,14,0,14,126,9,0,14,0,5,0,5,0,0,0,0,0,0,0,0,4,0,0,4,0,10,0,0,0,0,0,0,0,,,,2018
17,74761,72,36,36,36,45.5,0,0,5,2,0,0,0,0,0,6,3,0,7,2,0,0,2,2,0,2,2,4,2,4,0,0,0,0,2,2,3,1,1,2,2,1,0,0,0,2,0,1,6,3,66,71,68,,,55,53,33,,,,,,0,0,0,0,5,0,1,0,72,0,0,0,0,0,0,9,3,72,31,46,488,31900,73300,118800,36,10,10,2,0,35,0,0,0,0,0,0,0,11,2,0,2,1981,16,17,19,13,0,0,17,4,6,0,0,0,0,0,0,2,2,0,26,0,11,7,0,4,3,3,0,0,0,1,0,0,2,2,0,4,0,0,2,4,4,18,0,0,0,4,18,6,5,3,659,0,0,16,0,0,16,0,4,6,19,3,1,5,15,19,6,6,1,12,0,4,0,0,2,1,5,12,18,0,18,37,18,0,18,0,0,0,3,2,0,4,0,4,2,3,0,0,4,4,0,6,4,4,6,4,7,2,16,3,,,,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126236,75231,37979.000000000,15314.000000000,19713.000000000,18266.000000000,30.300000000,2122.000000000,1677.000000000,980.000000000,620.000000000,375.000000000,243.000000000,301.000000000,1184.000000000,2299.000000000,1978.000000000,1537.000000000,1357.000000000,1256.000000000,1097.000000000,722.000000000,288.000000000,230.000000000,322.000000000,228.000000000,109.000000000,153.000000000,2161.000000000,1460.000000000,983.000000000,496.000000000,350.000000000,263.000000000,196.000000000,1073.000000000,2010.000000000,1719.000000000,1403.000000000,851.000000000,980.000000000,1049.000000000,1069.000000000,232.000000000,420.000000000,105.000000000,275.000000000,367.000000000,255.000000000,201.000000000,348.000000000,10612.000000000,37174.000000000,35482.000000000,33696.000000000,,28228.000000000,23495.000000000,20614.000000000,,,,,,10883.000000000,9635.000000000,2386.000000000,14232.000000000,54.000000000,67.000000000,993.000000000,14232.000000000,23747.000000000,163.000000000,52.000000000,777.000000000,393.000000000,693.000000000,166.000000000,666.000000000,703.000000000,37382.000000000,13000.000000000,18490.000000000,722.000000000,103500.000000000,262700.000000000,373900.000000000,15314.000000000,12240.000000000,3176.000000000,2137.000000000,101.000000000,2665.000000000,514.000000000,341.000000000,1003.000000000,4861.000000000,4337.000000000,2341.000000000,2389.000000000,39.000000000,15.000000000,101.000000000,51.000000000,1977.000000000,4843.000000000,7021.000000000,8293.000000000,3630.000000000,103.000000000,14.000000000,10499.000000000,3490.000000000,574.000000000,3265.000000000,1010.000000000,864.000000000,883.000000000,1472.000000000,1363.000000000,1266.000000000,1137.000000000,406.000000000,3074.000000000,9.000000000,1849.000000000,2276.000000000,5075.000000000,4842.000000000,2722.000000000,977.000000000,1722.000000000,15.000000000,8.000000000,2120.000000000,557.000000000,370.000000000,1050.000000000,1730.000000000,2377.000000000,3312.000000000,1372.000000000,3196.000000000,982.000000000,1329.000000000,1607.000000000,16955.000000000,343.000000000,674.000000000,2084.000000000,2980.000000000,7625.000000000,4044.000000000,492.000000000,173.000000000,467585.000000000,1998.000000000,1380.000000000,14026.000000000,2382.000000000,154.000000000,11644.000000000,465.000000000,1102.000000000,4437.000000000,4483.000000000,870.000000000,1692.000000000,3335.000000000,6184.000000000,5080.000000000,4437.000000000,5692.000000000,2487.000000000,5307.000000000,202.000000000,551.000000000,629.000000000,466.000000000,459.000000000,747.000000000,656.000000000,3710.000000000,17868.000000000,1428.000000000,19354.000000000,8874.000000000,17629.000000000,58.000000000,19296.000000000,132.000000000,3236.000000000,1761.000000000,2613.000000000,1253.000000000,263.000000000,1232.000000000,969.000000000,282.000000000,1907.000000000,3135.000000000,707.000000000,378.000000000,5208.000000000,1884.000000000,2019.000000000,3786.000000000,4971.000000000,5208.000000000,3786.000000000,2451.000000000,1516.000000000,1426.000000000,8863.000000000,1509.000000000,17361.000000000,11501.000000000,6130.000000000,2015
126237,33458,52303.000000000,19951.000000000,25529.000000000,26774.000000000,40.800000000,1537.000000000,1654.000000000,1622.000000000,985.000000000,505.000000000,163.000000000,164.000000000,655.000000000,1763.000000000,2053.000000000,1588.000000000,2137.000000000,1887.000000000,2010.000000000,1853.000000000,535.000000000,720.000000000,762.000000000,411.000000000,395.000000000,399.000000000,1720.000000000,1507.000000000,1872.000000000,743.000000000,711.000000000,164.000000000,89.000000000,554.000000000,1796.000000000,1676.000000000,1957.000000000,1721.000000000,2136.000000000,1941.000000000,2226.000000000,698.000000000,1104.000000000,472.000000000,796.000000000,918.000000000,576.000000000,687.000000000,710.000000000,39303.000000000,51766.000000000,50422.000000000,49046.000000000,,41821.000000000,37658.000000000,30277.000000000,,,,,,4760.000000000,1631.000000000,1677.000000000,8929.000000000,0E-9,92.000000000,671.000000000,8929.000000000,43374.000000000,28.000000000,153.000000000,111.000000000,52.000000000,724.000000000,362.000000000,2988.000000000,3020.000000000,51914.000000000,4904.000000000,22314.000000000,1405.000000000,171300.000000000,276900.000000000,441700.000000000,19951.000000000,6210.000000000,2363.000000000,168.000000000,53.000000000,13028.000000000,3679.000000000,529.000000000,1615.000000000,1875.000000000,891.000000000,306.000000000,275.000000000,116.000000000,148.000000000,591.000000000,69.000000000,1991.000000000,10417.000000000,6577.000000000,13374.000000000,1089.000000000,24.000000000,18.000000000,11640.000000000,2101.000000000,247.000000000,1823.000000000,649.000000000,433.000000000,431.000000000,733.000000000,645.000000000,741.000000000,425.000000000,83.000000000,13741.000000000,241.000000000,9581.000000000,4770.000000000,1586.000000000,3746.000000000,3121.000000000,1658.000000000,1370.000000000,59.000000000,34.000000000,625.000000000,162.000000000,162.000000000,3108.000000000,4410.000000000,4152.000000000,4561.000000000,2619.000000000,3645.000000000,1030.000000000,1191.000000000,804.000000000,25520.000000000,300.000000000,1894.000000000,1283.000000000,1023.000000000,6730.000000000,9723.000000000,1852.000000000,623.000000000,586735.000000000,28.000000000,15.000000000,23881.000000000,2077.000000000,0E-9,21804.000000000,379.000000000,3465.000000000,9946.000000000,7550.000000000,1908.000000000,3803.000000000,5050.000000000,3488.000000000,8186.000000000,9946.000000000,13160.000000000,5615.000000000,10423.000000000,689.000000000,2108.000000000,1217.000000000,236.000000000,312.000000000,1538.000000000,1381.000000000,7481.000000000,27804.000000000,1512.000000000,29334.000000000,12487.000000000,27414.000000000,18.000000000,29316.000000000,71.000000000,3772.000000000,2174.000000000,5443.000000000,2545.000000000,368.000000000,1669.000000000,1246.000000000,507.000000000,2869.000000000,5029.000000000,1453.000000000,658.000000000,11930.000000000,2380.000000000,1705.000000000,6665.000000000,5124.000000000,11930.000000000,6665.000000000,2451.000000000,2946.000000000,2479.000000000,11967.000000000,2091.000000000,39787.000000000,6544.000000000,3507.000000000,2015
126238,60047,42330.000000000,13866.000000000,21125.000000000,21205.000000000,42.500000000,985.000000000,1665.000000000,1712.000000000,1384.000000000,477.000000000,300.000000000,153.000000000,803.000000000,749.000000000,792.000000000,1055.000000000,1496.000000000,1808.000000000,2277.000000000,1885.000000000,468.000000000,360.000000000,617.000000000,206.000000000,199.000000000,212.000000000,993.000000000,1342.000000000,1779.000000000,1067.000000000,631.000000000,168.000000000,146.000000000,534.000000000,652.000000000,944.000000000,1324.000000000,1571.000000000,2021.000000000,2325.000000000,1939.000000000,712.000000000,685.000000000,442.000000000,581.000000000,461.000000000,311.000000000,229.000000000,348.000000000,34011.000000000,41985.000000000,41190.000000000,40352.000000000,,32996.000000000,28191.000000000,23757.000000000,,,,,,2321.000000000,651.000000000,3963.000000000,2955.000000000,8.000000000,51.000000000,676.000000000,2955.000000000,39375.000000000,272.000000000,277.000000000,33.000000000,29.000000000,207.000000000,157.000000000,3531.000000000,2944.000000000,41668.000000000,1477.000000000,14446.000000000,1242.000000000,290100.000000000,408400.000000000,605800.000000000,13866.000000000,1043.000000000,580.000000000,21.000000000,159.000000000,13199.000000000,561.000000000,0E-9,82.000000000,69.000000000,155.000000000,119.000000000,157.000000000,83.000000000,9.000000000,102.000000000,279.000000000,1985.000000000,10534.000000000,2083.000000000,11783.000000000,523.000000000,0E-9,45.000000000,10927.000000000,797.000000000,99.000000000,160.000000000,22.000000000,60.000000000,39.000000000,134.000000000,265.000000000,166.000000000,92.000000000,6.000000000,12823.000000000,330.000000000,9775.000000000,2287.000000000,234.000000000,2524.000000000,2251.000000000,1426.000000000,723.000000000,95.000000000,7.000000000,273.000000000,144.000000000,144.000000000,2136.000000000,1400.000000000,1843.000000000,2168.000000000,1337.000000000,2960.000000000,3033.000000000,2816.000000000,2596.000000000,20289.000000000,96.000000000,1668.000000000,227.000000000,203.000000000,2101.000000000,6993.000000000,3412.000000000,1157.000000000,674655.000000000,996.000000000,33.000000000,18696.000000000,1027.000000000,34.000000000,17669.000000000,654.000000000,1727.000000000,10165.000000000,3682.000000000,1303.000000000,5438.000000000,2834.000000000,1252.000000000,4069.000000000,10165.000000000,15132.000000000,6841.000000000,5864.000000000,351.000000000,2871.000000000,2001.000000000,64.000000000,243.000000000,920.000000000,1042.000000000,7492.000000000,22216.000000000,1550.000000000,23766.000000000,9230.000000000,21957.000000000,0E-9,23766.000000000,34.000000000,1544.000000000,1079.000000000,4215.000000000,2347.000000000,687.000000000,3210.000000000,716.000000000,270.000000000,2545.000000000,3400.000000000,877.000000000,1292.000000000,12051.000000000,847.000000000,1274.000000000,5844.000000000,2200.000000000,12051.000000000,5844.000000000,2451.000000000,2722.000000000,3432.000000000,12010.000000000,1817.000000000,31934.000000000,2089.000000000,837.000000000,2015
126239,70560,42450.000000000,14798.000000000,20357.000000000,22093.000000000,34.700000000,1599.000000000,1668.000000000,1465.000000000,809.000000000,459.000000000,289.000000000,261.000000000,824.000000000,1372.000000000,1520.000000000,1353.000000000,1189.000000000,1297.000000000,1457.000000000,1191.000000000,464.000000000,319.000000000,687.000000000,513.000000000,191.000000000,162.000000000,1943.000000000,1774.000000000,1307.000000000,1045.000000000,620.000000000,305.000000000,449.000000000,999.000000000,1160.000000000,1495.000000000,1038.000000000,1342.000000000,1234.000000000,1582.000000000,1628.000000000,423.000000000,642.000000000,388.000000000,428.000000000,890.000000000,531.000000000,361.000000000,509.000000000,22486.000000000,41543.000000000,40397.000000000,38908.000000000,,32273.000000000,26634.000000000,21191.000000000,,,,,,1329.000000000,15623.000000000,1357.000000000,1921.000000000,56.000000000,45.000000000,962.000000000,1921.000000000,40529.000000000,84.000000000,132.000000000,891.000000000,604.000000000,26.000000000,50.000000000,1691.000000000,1614.000000000,41450.000000000,9829.000000000,17127.000000000,656.000000000,42900.000000000,84700.000000000,144200.000000000,14798.000000000,5221.000000000,2329.000000000,645.000000000,106.000000000,10334.000000000,94.000000000,439.000000000,668.000000000,637.000000000,290.000000000,203.000000000,360.000000000,4040.000000000,0E-9,246.000000000,934.000000000,1977.000000000,6138.000000000,4592.000000000,10206.000000000,3607.000000000,86.000000000,27.000000000,11610.000000000,5202.000000000,1015.000000000,1023.000000000,458.000000000,229.000000000,397.000000000,341.000000000,534.000000000,451.000000000,436.000000000,337.000000000,9577.000000000,10.000000000,4200.000000000,4435.000000000,1862.000000000,4070.000000000,1386.000000000,752.000000000,599.000000000,15.000000000,20.000000000,2684.000000000,498.000000000,468.000000000,2982.000000000,3512.000000000,3341.000000000,2010.000000000,476.000000000,1834.000000000,888.000000000,1053.000000000,532.000000000,16628.000000000,385.000000000,213.000000000,818.000000000,1745.000000000,5208.000000000,5398.000000000,1670.000000000,777.000000000,379895.000000000,132.000000000,132.000000000,15593.000000000,2151.000000000,0E-9,13442.000000000,880.000000000,1078.000000000,1811.000000000,9047.000000000,1011.000000000,788.000000000,3104.000000000,7243.000000000,11336.000000000,1811.000000000,2199.000000000,1051.000000000,5193.000000000,56.000000000,251.000000000,105.000000000,825.000000000,694.000000000,2624.000000000,658.000000000,5213.000000000,17128.000000000,2344.000000000,19479.000000000,12794.000000000,16841.000000000,7.000000000,19472.000000000,2872.000000000,1331.000000000,1228.000000000,3141.000000000,856.000000000,184.000000000,1499.000000000,1077.000000000,526.000000000,2120.000000000,938.000000000,781.000000000,575.000000000,3800.000000000,3139.000000000,2649.000000000,4405.000000000,3135.000000000,3800.000000000,4405.000000000,2451.000000000,2417.000000000,2300.000000000,10290.000000000,1165.000000000,33056.000000000,1575.000000000,1022.000000000,2015


In [31]:
ACS.loc[~ACS['pop_now_married'].isna()]['year'].unique()

array([2014])

Based on the results above, it seems like the data pertaining to marital status was either not reported by the individuals completing the survey or was not collected by the survey department, with the latter being the most probable example. With an 80% reported missingness, with no way of comfortably imputing, the columns pertaining to marital status will be dropped.

In [32]:
#Dropping marital status columns
ACS.drop(columns = ['pop_never_married', 'pop_now_married', 'pop_separated', 'pop_widowed', 'pop_divorced'], 
         inplace = True)

##### Languages Spoken at Home

In [33]:
ACS.loc[~ACS['speak_spanish_at_home'].isna()]['year'].unique()

array([2015, 2014])

In [34]:
ACS.loc[~ACS['speak_only_english_at_home'].isna()]['year'].unique()

array([2015, 2014])

In [35]:
ACS.loc[~ACS['speak_only_english_at_home'].isna()]['year'].unique()

array([2015, 2014])

In [36]:
ACS.loc[~ACS['speak_spanish_at_home_low_english'].isna()]['year'].unique()

array([2015, 2014])

Based on the above results, it can be concluded that the only in 2014 and 2015 data about languages spoken at home was collected. With a 60% reported missingness, with no way of comfortably imputing, the columns pertaining to languages spoken at home will be dropped.

In [37]:
ACS.drop(columns = ['speak_spanish_at_home', 'speak_only_english_at_home', 'speak_only_english_at_home',
                   'speak_spanish_at_home_low_english'], inplace = True)

### Population Age Groups

To further reduce the number of columns in the ACS data set, columns reporting various population groups will be further grouped, with the total values removed from the data set.<br><br>
Before doing so, total values will be checked with the "dummified" groupings.<br>
#### Checking Male Population Grouping and Totals
Age groups will further be combined into the following:<br>
- Males under 18
- Males from 18 to 24
- Males from 25 to 49
- Males from 50 to 66
- Males above 67

In [38]:
#Combining age groups
ACS['male_under_18'] = ACS['male_under_5'] + ACS['male_5_to_9'] + ACS['male_10_to_14'] + ACS['male_15_to_17']
ACS['male_18_to_24'] = ACS['male_18_to_19'] + ACS['male_20'] + ACS['male_21'] + ACS['male_22_to_24']
ACS['male_25_to_49'] = ACS['male_25_to_29'] + ACS['male_30_to_34'] + ACS['male_35_to_39'] + ACS['male_40_to_44'] +\
                        ACS['male_45_to_49']
ACS['male_50_to_66'] = ACS['male_50_to_54'] + ACS['male_55_to_59'] + ACS['male_65_to_66']
ACS['male_above_67'] = ACS['male_67_to_69'] + ACS['male_70_to_74'] + ACS['male_75_to_79'] + ACS['male_80_to_84'] +\
                        ACS['male_85_and_over']

In [39]:
def clean_group_distribution(df, col_start, check_on, bound_low = None, bound_high = None, new_group = None):
    """
    This function cleans up "dummified" values of a set of columns by checking whether the 
    aggregate of the columns in question add up to the total in the original dataframe. Should 
    it not add up, a new column is generated with the difference between the sum and total.
    
    Arguments
    
    df         : (Pandas dataframe) dataframe of interest
    col_start  : (str) string indicating what each dummified column begins with - must be in quotations! 
    check_on   : (Pandas dataframe col) column to check against the dummified columns
    bound_low  : (int) value at which dummified column list begins (default = None)
    bound_high : (str) name of first column after dummified columns of interest (default = None) - 
                       must be in quotations!
    new_group  : (int) If 1 is inputted, the function assumes a new grouping has been introduced for the 
                       dummified columns in question (default = None)
    
    Returns
    
    df         : (Pandas dataframe) cleaned version of inputted dataframe
    
    """
    check_on = check_on.astype(float)
    
    #Creating a list of columns of interest
    cols_interest = [x for x in df.columns if x.startswith(col_start)]
    
    #Additional cleaning if required (specified by bounds low and high)
    if bound_low:
        cols_interest = cols_interest[bound_low:cols_interest.index(bound_high)]
    
    #Creating a check column
    df[cols_interest] = df[cols_interest].astype(float)
    df['check'] = df[cols_interest].sum(axis = 1)
    df['check'] = df['check'].astype(float)
    
    #Comparing total of check_on with check values
    print(f'Total rows in requested dataframe: {df.shape[0]}')
    print(f'Total rows with unmatched employed population: {df.loc[check_on != df.check].shape[0]}\n\n')
    
    #Creating the remainder of specified group as column
    df['x'] = abs(check_on - df['check'])
    df['x'] = df['x'].astype(float)
    
    #Reporting percentage of values that haven't been appropriately reported
    perc_x = round(df['x'].sum() / check_on.sum() * 100, 2)
    print(f'Total percentage of values that haven\'t been appropriately reported: {perc_x}%')
    
    #Dropping intermediary operation columns
    df.drop(columns = ['check', check_on.name], inplace = True)
    
    #Dropping splitting columns
    if new_group == 1:
        df.drop(columns = cols_interest, inplace = True)
    
    if perc_x == 0:
        df.drop(columns = 'x', inplace = True)
        print('Creating difference column in not necessary.')
        
    #Renaming remainder column
    df.rename(columns = {'x': col_start+'x'}, inplace = True)
    
    return df.head(3)

In [40]:
clean_group_distribution(ACS, 'male_', ACS['male_pop'], 1, 'male_male_households', 1)

Total rows in requested dataframe: 132049
Total rows with unmatched employed population: 131230


Total percentage of values that haven't been appropriately reported: 5.75%


Unnamed: 0,geo_id,total_pop,households,female_pop,median_age,female_under_5,female_5_to_9,female_10_to_14,female_15_to_17,female_18_to_19,female_20,female_21,female_22_to_24,female_25_to_29,female_30_to_34,female_35_to_39,female_40_to_44,female_45_to_49,female_50_to_54,female_55_to_59,female_60_to_61,female_62_to_64,female_65_to_66,female_67_to_69,female_70_to_74,female_75_to_79,female_80_to_84,female_85_and_over,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,year,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,male_x
4,51341,25,21,6,60.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,0,0,0,1,25,25,25,,,25,25,18,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,4,0,0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,0,0,2,0,0,0,4,0,0,1,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,12,1,0,1,1,7,6,0,7,152,0,0,11,0,0,11,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,12,0,12,13,12,0,12,0,0,0,0,0,0,0,5,0,0,5,2,0,1,2,0,5,4,1,5,0,0,0,0,0,2018,0,0,0,11,4,4.0
5,38578,158,62,102,79.3,6,0,0,0,2,0,0,0,0,0,0,0,0,9,5,0,0,0,13,13,6,19,29,158,158,152,,,152,148,25,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,38,0,21,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,6,0,0,3,0,0,5,5,0,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,12,0,0,0,5,30,27,0,0,204,0,0,12,0,0,12,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,12,0,12,140,12,0,12,0,0,0,2,0,0,1,2,0,0,0,5,2,2,0,8,0,2,2,0,0,0,0,0,0,2018,0,2,5,3,43,3.0
8,88353,152,84,109,68.3,0,0,0,0,0,0,0,0,0,30,0,0,0,10,17,2,0,4,17,8,8,13,0,30,152,152,,,152,152,65,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,204,6,0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,0,23,0,5,0,5,0,5,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,12,0,2,0,5,37,42,0,0,389,0,0,12,0,0,12,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,14,0,14,138,14,0,14,2,7,0,0,0,0,0,5,0,0,0,0,0,9,0,0,0,5,9,0,0,0,0,0,0,2018,0,0,0,5,32,6.0


#### Checking Female Population Grouping and Totals
Age groups will further be combined into the following:<br>
- Females under 18
- Females from 18 to 24
- Females from 25 to 49
- Females from 50 to 66
- Females above 67

In [41]:
#Combining age groups
ACS['female_under_18'] = ACS['female_under_5'] + ACS['female_5_to_9'] + ACS['female_10_to_14'] + ACS['female_15_to_17']
ACS['female_18_to_24'] = ACS['female_18_to_19'] + ACS['female_20'] + ACS['female_21'] + ACS['female_22_to_24']
ACS['female_25_to_49'] = ACS['female_25_to_29'] + ACS['female_30_to_34'] + ACS['female_35_to_39'] + ACS['female_40_to_44'] +\
                        ACS['female_45_to_49']
ACS['female_50_to_66'] = ACS['female_50_to_54'] + ACS['female_55_to_59'] + ACS['female_65_to_66']
ACS['female_above_67'] = ACS['female_67_to_69'] + ACS['female_70_to_74'] + ACS['female_75_to_79'] + ACS['female_80_to_84'] +\
                        ACS['female_85_and_over']

In [42]:
clean_group_distribution(ACS, 'female_', ACS['female_pop'], 1, 'female_female_households', 1)

Total rows in requested dataframe: 132049
Total rows with unmatched employed population: 0


Total percentage of values that haven't been appropriately reported: 0.0%
Creating difference column in not necessary.


Unnamed: 0,geo_id,total_pop,households,median_age,white_pop,population_1_year_and_over,population_3_years_over,pop_5_years_over,pop_15_and_over,pop_16_over,pop_25_years_over,pop_25_64,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children,children_in_single_female_hh,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,employed_pop,unemployed_pop,pop_in_labor_force,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,employed_agriculture_forestry_fishing_hunting_mining,employed_arts_entertainment_recreation_accommodation_food,employed_construction,employed_education_health_social,employed_finance_insurance_real_estate,employed_information,employed_manufacturing,employed_other_services_not_public_admin,employed_public_administration,employed_retail_trade,employed_science_management_admin_waste,employed_transportation_warehousing_utilities,employed_wholesale_trade,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,management_business_sci_arts_employed,sales_office_employed,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,year,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,male_x,female_under_18,female_18_to_24,female_25_to_49,female_50_to_66,female_above_67
4,51341,25,21,60.6,25,25,25,,,25,25,18,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,4,0,0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,0,0,2,0,0,0,4,0,0,1,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,12,1,0,1,1,7,6,0,7,152,0,0,11,0,0,11,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,12,0,12,13,12,0,12,0,0,0,0,0,0,0,5,0,0,5,2,0,1,2,0,5,4,1,5,0,0,0,0,0,2018,0,0,0,11,4,4.0,0,0,0,0,2
5,38578,158,62,79.3,158,158,152,,,152,148,25,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,38,0,21,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,6,0,0,3,0,0,5,5,0,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,12,0,0,0,5,30,27,0,0,204,0,0,12,0,0,12,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,12,0,12,140,12,0,12,0,0,0,2,0,0,1,2,0,0,0,5,2,2,0,8,0,2,2,0,0,0,0,0,0,2018,0,2,5,3,43,3.0,6,2,0,14,80
8,88353,152,84,68.3,30,152,152,,,152,152,65,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,204,6,0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,0,23,0,5,0,5,0,5,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,12,0,2,0,5,37,42,0,0,389,0,0,12,0,0,12,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,14,0,14,138,14,0,14,2,7,0,0,0,0,0,5,0,0,0,0,0,9,0,0,0,5,9,0,0,0,0,0,0,2018,0,0,0,5,32,6.0,0,0,30,31,46


Seems like women always have a reported age! Very different from the male population.

#### Population Over Certain Age Grouping
There are columns indicating the total number of people over a certain age group. As age grouping has been performed prior, these columns will be dropped along with median age (this can be calculated outside of modelling).

In [43]:
ACS.drop(columns = ['median_age', 'population_1_year_and_over', 'population_3_years_over', 
                          'pop_5_years_over', 'pop_15_and_over', 'pop_16_over', 'pop_25_years_over', 
                          'pop_25_64'], inplace = True)

Additionally, since all children are defined as people less than 18 years old, this has already been encompassed in the male / female population split. As such, the *children* column will be dropped. 

In [44]:
#Additional check that all males and females under 18 are the same as the children column
children_df = ACS[['male_under_18', 'female_under_18']].copy()
children_df['sum'] = children_df['male_under_18'] + children_df['female_under_18']
children_df['children'] = ACS['children'].copy()
children_df.loc[children_df['sum'] != children_df['children']]

Unnamed: 0,male_under_18,female_under_18,sum,children


In [45]:
#Dropping the children column
ACS.drop(columns = 'children', inplace = True)

### Employment Columns
Both *employed_* and *occupation_* type columns cover the same material - which sector a working individual is working. To have a general understanding of occupation, all *employed_* columns will be dropped.

In [46]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'employed')))]

Additionally, all columns relating to total employment, such as *civilian_labor_force* among others, will be dropped. Before doing so, a check will be performed on all occupations to ensure no employed worker is missed.

In [47]:
clean_group_distribution(ACS, 'occupation_', ACS['pop_in_labor_force'])

Total rows in requested dataframe: 132049
Total rows with unmatched employed population: 129589


Total percentage of values that haven't been appropriately reported: 7.85%


Unnamed: 0,geo_id,total_pop,households,white_pop,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children_in_single_female_hh,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,commuters_16_over,walked_to_work,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,commuters_by_public_transportation,commuters_by_bus,commuters_by_car_truck_van,commuters_by_carpool,commuters_by_subway_or_elevated,commuters_drove_alone,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,not_in_labor_force,workers_16_and_over,armed_forces,civilian_labor_force,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_school,in_undergrad_college,year,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,male_x,female_under_18,female_18_to_24,female_25_to_49,female_50_to_66,female_above_67,occupation_x
4,51341,25,21,25,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,4,0,0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,0,2,0,0,0,4,0,0,1,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,12,1,0,1,1,7,6,0,7,152,0,0,11,0,0,11,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,13,12,0,12,1.0,2.0,0.0,5.0,4.0,0,0,0,0,0,2018,0,0,0,11,4,4.0,0,0,0,0,2,0.0
5,38578,158,62,158,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,38,0,21,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,0,0,3,0,0,5,5,0,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,12,0,0,0,5,30,27,0,0,204,0,0,12,0,0,12,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,140,12,0,12,2.0,0.0,8.0,0.0,2.0,0,0,0,0,0,2018,0,2,5,3,43,3.0,6,2,0,14,80,0.0
8,88353,152,84,30,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,204,6,0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,23,0,5,0,5,0,5,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,12,0,2,0,5,37,42,0,0,389,0,0,12,0,0,12,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,138,14,0,14,9.0,0.0,0.0,0.0,5.0,0,0,0,0,0,2018,0,0,0,5,32,6.0,0,0,30,31,46,0.0


In [48]:
#Dropping all other columns
ACS = ACS.drop(columns = ['civilian_labor_force', 'workers_16_and_over'])

### Commuters
For the columns looking at the number of commuters, typically when people look at housing (rent or purchase), it is *time* to workplace / school / etc that plays a pivotal role. As such, all of the columns pertaining to the *type* of commuter will be dropped.

In [49]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'commuters_')))]
ACS.drop(columns = 'walked_to_work', inplace = True)

### People in Schools
For the columns looking at the number of people in school, the total column will be dropped. Before doing so, a check will be performed on all grade types to ensure no student is missed.

In [50]:
clean_group_distribution(ACS, 'in_grades_', ACS['in_school'])

Total rows in requested dataframe: 132049
Total rows with unmatched employed population: 131378


Total percentage of values that haven't been appropriately reported: 39.02%


Unnamed: 0,geo_id,total_pop,households,white_pop,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children_in_single_female_hh,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,rent_20_to_25_percent,rent_15_to_20_percent,rent_10_to_15_percent,rent_under_10_percent,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,not_in_labor_force,armed_forces,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_undergrad_college,year,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,male_x,female_under_18,female_18_to_24,female_25_to_49,female_50_to_66,female_above_67,occupation_x,in_grades_x
4,51341,25,21,25,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,4,0,0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,0,2,0,0,0,4,0,0,1,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,0,1,1,7,6,0,7,152,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,13,0,1.0,2.0,0.0,5.0,4.0,0.0,0.0,0.0,0,2018,0,0,0,11,4,4.0,0,0,0,0,2,0.0,0.0
5,38578,158,62,158,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,38,0,21,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,0,0,3,0,0,5,5,0,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,0,0,5,30,27,0,0,204,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,140,0,2.0,0.0,8.0,0.0,2.0,0.0,0.0,0.0,0,2018,0,2,5,3,43,3.0,6,2,0,14,80,0.0,0.0
8,88353,152,84,30,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,204,6,0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,23,0,5,0,5,0,5,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,2,0,5,37,42,0,0,389,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,138,0,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0,2018,0,0,0,5,32,6.0,0,0,30,31,46,0.0,0.0


### Income Spent on Rent
Rent and income related data will be coming from ZRI and IRS data respectively. As such, columns relating to how much income is spent on rent will be dropped.

In [51]:
ACS.drop(columns = ['rent_burden_not_computed', 'rent_over_50_percent', 'rent_40_to_50_percent',
                    'rent_35_to_40_percent', 'rent_30_to_35_percent', 'rent_25_to_30_percent',
                    'rent_20_to_25_percent', 'rent_15_to_20_percent', 'rent_10_to_15_percent',
                    'rent_under_10_percent'], inplace = True)

### Housing Units

In [52]:
clean_group_distribution(ACS, 'vacant_housing_units_', ACS['vacant_housing_units'])

Total rows in requested dataframe: 132049
Total rows with unmatched employed population: 130162


Total percentage of values that haven't been appropriately reported: 73.16%


Unnamed: 0,geo_id,total_pop,households,white_pop,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop,asian_male_45_54,asian_male_55_64,black_male_45_54,black_male_55_64,hispanic_male_45_54,hispanic_male_55_64,white_male_45_54,white_male_55_64,pop_determined_poverty_status,poverty,housing_units,renter_occupied_housing_units_paying_cash_median_gross_rent,owner_occupied_housing_units_lower_value_quartile,owner_occupied_housing_units_median_value,owner_occupied_housing_units_upper_value_quartile,occupied_housing_units,housing_units_renter_occupied,vacant_housing_units_for_rent,vacant_housing_units_for_sale,dwellings_1_units_detached,dwellings_1_units_attached,dwellings_2_units,dwellings_3_to_4_units,dwellings_5_to_9_units,dwellings_10_to_19_units,dwellings_20_to_49_units,dwellings_50_or_more_units,mobile_homes,housing_built_2005_or_later,housing_built_2000_to_2004,housing_built_1939_or_earlier,median_year_structure_built,married_households,nonfamily_households,family_households,households_public_asst_or_food_stamps,male_male_households,female_female_households,children_in_single_female_hh,owner_occupied_housing_units,million_dollar_housing_units,mortgaged_housing_units,different_house_year_ago_different_city,different_house_year_ago_same_city,families_with_young_children,two_parent_families_with_young_children,two_parents_in_labor_force_families_with_young_children,two_parents_father_in_labor_force_families_with_young_children,two_parents_mother_in_labor_force_families_with_young_children,two_parents_not_in_labor_force_families_with_young_children,one_parent_families_with_young_children,father_one_parent_families_with_young_children,father_in_labor_force_one_parent_families_with_young_children,commute_less_10_mins,commute_10_14_mins,commute_15_19_mins,commute_20_24_mins,commute_25_29_mins,commute_30_34_mins,commute_35_44_mins,commute_60_more_mins,commute_45_59_mins,worked_at_home,no_car,no_cars,one_car,two_cars,three_cars,four_more_cars,aggregate_travel_time_to_work,group_quarters,associates_degree,bachelors_degree,high_school_diploma,less_one_year_college,masters_degree,one_year_more_college,less_than_high_school_graduate,high_school_including_ged,bachelors_degree_2,bachelors_degree_or_higher_25_64,graduate_professional_degree,some_college_and_associates_degree,male_45_64_associates_degree,male_45_64_bachelors_degree,male_45_64_graduate_degree,male_45_64_less_than_9_grade,male_45_64_grade_9_12,male_45_64_high_school,male_45_64_some_college,male_45_to_64,not_in_labor_force,armed_forces,occupation_management_arts,occupation_natural_resources_construction_maintenance,occupation_production_transportation_material,occupation_sales_office,occupation_services,in_grades_1_to_4,in_grades_5_to_8,in_grades_9_to_12,in_undergrad_college,year,male_under_18,male_18_to_24,male_25_to_49,male_50_to_66,male_above_67,male_x,female_under_18,female_18_to_24,female_25_to_49,female_50_to_66,female_above_67,occupation_x,in_grades_x,vacant_housing_units_x
4,51341,25,21,25,0,0,0,0,0,0,0,0,25,0,0,0,0,0,0,7,7,25,4,25,556,26900,32500,45000,21,7,0.0,0.0,24,1,0,0,0,0,0,0,0,0,0,7,1950,2,14,7,1,0,0,0,14,0,0,0,5,0,0,0,0,0,0,0,0,0,6,2,4,0,0,0,0,0,0,0,1,1,7,6,0,7,152,0,0,1,16,0,0,2,6,16,1,1,0,2,0,1,0,2,2,7,2,14,13,0,1.0,2.0,0.0,5.0,4.0,0.0,0.0,0.0,0,2018,0,0,0,11,4,4.0,0,0,0,0,2,0.0,0.0,4.0
5,38578,158,62,158,0,0,0,0,0,0,0,0,158,0,0,0,0,0,0,5,6,108,14,100,768,105200,145800,189100,62,13,0.0,21.0,68,0,26,6,0,0,0,0,0,0,4,0,1983,38,24,38,11,0,0,0,49,0,19,13,3,6,6,0,6,0,0,0,0,0,5,0,0,7,0,0,0,0,0,0,0,5,30,27,0,0,204,50,10,32,19,3,28,13,33,19,32,3,38,26,0,3,0,0,0,3,5,11,140,0,2.0,0.0,8.0,0.0,2.0,0.0,0.0,0.0,0,2018,0,2,5,3,43,3.0,6,2,0,14,80,0.0,0.0,17.0
8,88353,152,84,30,0,0,0,122,0,0,0,122,30,0,0,0,0,0,2,0,4,152,23,288,325,39400,45200,52500,84,38,6.0,0.0,232,0,16,0,0,0,0,0,40,0,5,62,1954,37,47,37,17,0,0,0,46,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,2,0,5,2,0,5,37,42,0,0,389,0,21,6,82,0,0,5,34,86,6,2,0,26,0,0,0,0,0,6,0,6,138,0,9.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0,2018,0,0,0,5,32,6.0,0,0,30,31,46,0.0,0.0,198.0


In [53]:
#Dropping the 'dummified' version of owner occupied housing units
ACS.drop(columns = ['owner_occupied_housing_units_lower_value_quartile',
                   'owner_occupied_housing_units_upper_value_quartile'], inplace = True)

The column *housing_units_renter_occupied* covers more renter occupied homes than the values shown in column *renter_occupied_housing_units_paying_cash_median_gross_rent*. As such, the latter column will be dropped. 

In [54]:
ACS.drop(columns = 'renter_occupied_housing_units_paying_cash_median_gross_rent', inplace = True)

In [55]:
ACS.drop(columns = ['occupied_housing_units'], inplace = True)

Median structure built will indicate a general idea of house age, as such housing built in specific years will be dropped.

In [56]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'housing_built_')))]

Additionally, economic description of housing units will be dropped as this information can be seen through in the IRS data.

In [57]:
ACS.drop(columns = ['group_quarters', 'million_dollar_housing_units', 'mortgaged_housing_units'], inplace = True)

### Men in Ages 45-65
There were a significant number of features focusing on men ranging from 45 to 65 years of age. Since this data is encompassed in the general male population, these features will be dropped.

In [58]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'male_45_64')))]
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'male_45_54')))]
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'male_55_64')))]
ACS.drop(columns = 'male_45_to_64', inplace = True)

### Poverty

In [59]:
ACS[['pop_determined_poverty_status', 'poverty']].head(3)

Unnamed: 0,pop_determined_poverty_status,poverty
4,25,4
5,108,14
8,152,23


Looking at the two poverty columns, as well as the schema for ACS, these can be combined to achieve total number of persons living in poverty.

In [60]:
#Summing the two poverty columns
ACS['poverty_total'] = ACS['pop_determined_poverty_status'] + ACS['poverty']

#Dropping the two and keeping the total column
ACS.drop(columns = ['pop_determined_poverty_status', 'poverty'], inplace = True)
ACS.drop(columns = 'households_public_asst_or_food_stamps', inplace = True)

### Families with Young Children
The total column will be kept for this, whereas the detailed breakdown will be removed.

In [61]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'with_young_children')))]
ACS.drop(columns = 'children_in_single_female_hh', inplace = True)

### Population by Race

In [62]:
ACS[['white_pop', 'not_us_citizen_pop', 'black_pop', 'asian_pop', 'hispanic_pop', 'amerindian_pop',
     'other_race_pop', 'two_or_more_races_pop', 'hispanic_any_race', 'not_hispanic_pop']].head(3)

Unnamed: 0,white_pop,not_us_citizen_pop,black_pop,asian_pop,hispanic_pop,amerindian_pop,other_race_pop,two_or_more_races_pop,hispanic_any_race,not_hispanic_pop
4,25,0,0,0,0,0,0,0,0,25
5,158,0,0,0,0,0,0,0,0,158
8,30,0,0,0,122,0,0,0,122,30


Based on the results above, columns *hispanic_pop* and *hispanic_any_race* are the same, and column *not_hispanic_pop* is the sum of all races that are not hispanic. As such, the two columns will be dropped.

In [63]:
ACS.drop(columns = ['hispanic_any_race', 'not_hispanic_pop'], inplace = True)

### Household Types

In [64]:
ACS = ACS[ACS.columns.drop(list(ACS.filter(regex = 'households')))]

### No Car(s)
Both columns *no_car* and *no_cars* describe the same phenomena, although cars is focused on households without cars, rather than people. This value will be kept for the analysis.

In [65]:
ACS.drop(columns = 'no_cars', inplace = True)

### Missingness and Shape Final Check

In [66]:
missingness(ACS)

{}

In [67]:
ACS.shape

(132049, 83)

## Exporting ACS Data

In [68]:
ACS.to_csv('ACS_all_zipcodes.csv')