In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import pandas as pd
import us
from us import states
from census import Census
import json
# Census API Key
from config import api_key
c = Census(api_key, year=2021)

In [3]:
# Years targeted for collection in census data
years=[2018,2019,2020]

complete_years=pd.DataFrame()
for curr_year in years:
    census_data = c.acs5.state_county(fields = ('NAME', "B01003_001E", "B01002_001E", "B19301_001E", "B17001_002E", "B27019_001E"),
                                      state_fips = "*",
                                      county_fips = "*", year = curr_year)

    census_data_curr_year=pd.DataFrame(census_data)

    # Column Reordering and assign the data-types appropriate names
    census_data_curr_year.rename(columns={"B01003_001E": "Population",
                                     "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                     "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                     "B27019_001E" : "Educational Attainment"
                                     },inplace=True)

    census_data_curr_year["Year"]=curr_year
    frames=[complete_years,census_data_curr_year]
    complete_years=pd.concat(frames)
complete_years.reset_index()
complete_years.head()

Unnamed: 0,NAME,Population,Median Age,Per Capita Income,Poverty Count,Educational Attainment,state,county,Year
0,"Sedgwick County, Kansas",512064.0,35.2,28673.0,70537.0,317622.0,20,173,2018
1,"Republic County, Kansas",4686.0,51.1,28901.0,490.0,3352.0,20,157,2018
2,"Graham County, Kansas",2545.0,51.9,26685.0,235.0,1860.0,20,65,2018
3,"Douglas County, Kansas",119319.0,29.5,30315.0,20749.0,65877.0,20,45,2018
4,"Sheridan County, Kansas",2506.0,44.3,31526.0,126.0,1774.0,20,179,2018


In [4]:
# Convert to Data to Json
complete_years.to_json("complete_years.json",orient="records")

In [6]:
# Open Census Json Data, store in varaible
states_json_file = open("complete_years.json")
state_json_data = json.load(states_json_file)

# Open GeoJson Data, store in varaible
geojson_file = open("static/data/GeoJsons.json")
geojson_data = json.load(geojson_file)

In [7]:
## This block is to condense the State Census Jsons. 
## Outputs a single Json per county, with a key-value pair for each year of data

# Utility List to track which counties have been collected
counties_collected = []

# List to store the outputed dictionaries
counties_data = []


for county in state_json_data:
    
    #If this is the first time encountering a particular county, creates a new dictionary for it, stores data
    if county["NAME"] not in counties_collected:
        counties_collected.append(county["NAME"])
        json = {}
        json["name"] = county["NAME"],
        json["state"] = county["state"]
        json["county"] = county["county"]
        information = {}
        information["Population"] = county["Population"],
        information["Median Age"] = county["Median Age"]
        information["Per Capita Income"] = county["Per Capita Income"]
        information["Poverty Count"] = county["Poverty Count"]
        information['Educational Attainment'] = county['Educational Attainment']
        json[county["Year"]] = information
        counties_data.append(json)
        
    # If it has arleady encounterd the county, appends the standing entry and stores the new year's data
    else:
        location = counties_collected.index(county["NAME"])
        json = counties_data[location]
        information = {}
        information["Population"] = county["Population"],
        information["Median Age"] = county["Median Age"]
        information["Per Capita Income"] = county["Per Capita Income"]
        information["Poverty Count"] = county["Poverty Count"]
        information['Educational Attainment'] = county['Educational Attainment']
        json[county["Year"]] = information
        
#Sample Output
counties_data[0]

{'name': ('Sedgwick County, Kansas',),
 'state': '20',
 'county': '173',
 2018: {'Population': (512064.0,),
  'Median Age': 35.2,
  'Per Capita Income': 28673.0,
  'Poverty Count': 70537.0,
  'Educational Attainment': 317622.0},
 2019: {'Population': (513375.0,),
  'Median Age': 35.4,
  'Per Capita Income': 29530.0,
  'Poverty Count': 69506.0,
  'Educational Attainment': 320007.0},
 2020: {'Population': (515416.0,),
  'Median Age': 35.5,
  'Per Capita Income': 30340.0,
  'Poverty Count': 67860.0,
  'Educational Attainment': 322046.0}}

In [8]:
# Create a list containing unique identifies for each county. 
# These values are a string of state+county (EX: State: '20', County: "173" >>> 20173)

county_ids = []
for county in counties_data:
    county_id = county["state"]+county["county"]
    county_ids.append(county_id)

In [9]:
#Iterate through each GeoJson, pair the corresponding CensusDataJson

#Track Failed which could not pair
fails =[]


for county in geojson_data:
    
    #Generate the countyID as above for each GeoJson
    geojson_county_id = county["properties"]['STATE'] + county["properties"]['COUNTY']
    try:
        #Attempts to locate any county which has the same county_id in the censusJson List
        location = county_ids.index(str(geojson_county_id))
        matching_county_data = counties_data[location]
        county["properties"][2018] = matching_county_data[2018]
        county["properties"][2019] = matching_county_data[2019]
        county["properties"][2020] = matching_county_data[2020]
    except Exception:
        
        #If it cannot locate it stores the id for this county in the fail list
        fails.append(geojson_county_id)

In [14]:
#number of fail counties
print(len(geojson_data))
print(len(fails))


3221
4


In [15]:
#Remove counties from GeoJson list which failed

for county in geojson_data:
    #Generate the countyID as above for each GeoJson
    geojson_county_id = county["properties"]['STATE'] + county["properties"]['COUNTY']
    if geojson_county_id in fails:
        geojson_data.remove(county)

#Length of final geojson_data list
print(len(geojson_data))

3218
