In [34]:
import pandas as pd
import os
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
import contextily as ctx
import matplotlib as mpl
import pathlib
# from pandarallel import pandarallel

# Generate Empty Rows (for each county: if an address file exists, join it to the shapefile on GEOID20; if not, create a file with the GEOID20s and empty address columns)

In [35]:
# Scrape the Census TIGER/Line 2020 tabulation-block directory listing and build
# `valid_zips`, a mapping from the code embedded in each archive name to the
# archive's full download URL.
prefix = 'https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/TABBLOCK/2020/'
r = requests.get(prefix, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page into an empty mapping.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified" warning).
soup = BeautifulSoup(r.content, 'html.parser')
valid_zips = {}

# A name of the form 'tl_2020_XXXXX_tabblock20.zip' (5-character code) is exactly
# 28 characters long. The length test therefore keeps those archives and drops
# any with a shorter code -- presumably the whole-state archives the original
# comment referred to.
county_zip_name_length = 28

for a in soup.find_all('a', href=True):
    href = a['href']
    if '.zip' not in href or len(href) != county_zip_name_length:
        continue
    # Third underscore-separated field of the file name is the code used as key.
    valid_zips[href.split('_')[2]] = prefix + href
print(len(valid_zips))

3221


In [36]:
# For every code in `valid_zips`, make sure the matching address CSV contains at
# least one row for each GEOID20 found in the shapefile. GEOID20s with no known
# address end up as rows whose other columns are empty.
output_columns = ['address', 'GEOID20', 'longitude', 'latitude']

pbar = tqdm(valid_zips)
for county in pbar:
    # Assuming manually downloaded
    shape_path = '../../data/shapefiles/tl_2020_{county}_tabblock20.zip'.format(county=county)
    # One path is used for both reading and writing, so the backfilled file
    # replaces the file that was read (the original wrote to '../data/address/',
    # a different directory from the one it read).
    address_path = '../../data/address/%s.csv.xz' % county

    county_shape_df = gpd.read_file(shape_path)
    n_shape_geoids = len(county_shape_df['GEOID20'].unique())

    if os.path.isfile(address_path):
        cdf = pd.read_csv(address_path, dtype={'GEOID20': object})

        # Skip if the number of distinct GEOID20s already matches the shapefile.
        # NOTE(review): this compares counts only, not the GEOID20 sets themselves.
        if len(cdf['GEOID20'].unique()) == n_shape_geoids:
            pbar.set_description('%s geoid length same, skipping' % county)
            continue
        pbar.set_description('%s geoid not the same, merging' % county)
        # Right join: every shapefile GEOID20 is kept; those without address rows
        # come back with NaN in the remaining columns.
        mdf = pd.merge(cdf, county_shape_df[['GEOID20']], on='GEOID20', how='right')
    else:
        mdf = county_shape_df[['GEOID20']]
        pbar.set_description('%s address not found, adding' % county)
    # Force a fixed column set/order; columns that do not exist yet are created empty.
    mdf = mdf.reindex(columns=output_columns)

    # Fail if the number of distinct GEOID20s does not match the shapefile.
    # The message is a real string (the original passed print(...), i.e. None).
    n_merged_geoids = len(mdf['GEOID20'].unique())
    assert n_merged_geoids == n_shape_geoids, '%s: %s != %s' % (county, n_merged_geoids, n_shape_geoids)
    pbar.set_description('%s length of file same, saving' % county)
    mdf.to_csv(address_path, index=False)

72153 geoid length same, skipping: 100%|█████████████████████████| 3221/3221 [18:45<00:00,  2.86it/s]


# Generate Report

In [37]:
# Build the coverage report: for each GEOID20 in every address file, count how
# many rows carry a non-null address (0 for GEOID20s that only have an empty row),
# then save the combined table and display it.
d = '../../data/address'
# Presumably the total number of tabulation blocks across all files -- TODO confirm source.
expected_row_count = 8174955
dfs = []

pbar = tqdm(sorted(pathlib.Path(d).glob('*.csv.xz')))
for file in pbar:
    pbar.set_description('Reading: %s' % file)
    # glob() already yields paths that include the directory, so read them
    # directly rather than joining the directory onto them a second time.
    df = pd.read_csv(file, dtype={'GEOID20': object})

    # .str.len() gives NaN for a missing GEOID20, so a bad file fails this check
    # with a readable message instead of a TypeError from len(nan).
    assert df['GEOID20'].str.len().eq(15).all(), 'GEOID20 values are not all 15 characters in %s' % file

    # 1 for rows with an address, 0 for rows without one.
    df['address_count'] = df['address'].notna().astype(int)
    report = df.groupby('GEOID20')['address_count'].sum().reset_index()
    dfs.append(report)

final_df = pd.concat(dfs)
# Hard check on the total number of report rows.
assert len(final_df) == expected_row_count, 'expected %s rows, got %s' % (expected_row_count, len(final_df))
final_df = final_df.sort_values(by=['GEOID20', 'address_count'])
final_df.to_csv('../../data/coverage_report.csv.xz', index=False) # save file to report
final_df

Reading: ../../data/address/72153.csv.xz: 100%|██████████████████| 3221/3221 [02:27<00:00, 21.84it/s]


Unnamed: 0,GEOID20,address_count
0,010010201001000,1
1,010010201001001,1
2,010010201001002,1
3,010010201001003,1
4,010010201001004,1
...,...,...
592,721537506022011,0
593,721537506022012,0
594,721537506022013,0
595,721537506022014,0


In [38]:
# Write the coverage report as an xz-compressed CSV (index column omitted),
# then show the frame as the cell output.
coverage_report_path = '../../data/coverage_report.csv.xz'
final_df.to_csv(coverage_report_path, index=False)
final_df

Unnamed: 0,GEOID20,address_count
0,010010201001000,1
1,010010201001001,1
2,010010201001002,1
3,010010201001003,1
4,010010201001004,1
...,...,...
592,721537506022011,0
593,721537506022012,0
594,721537506022013,0
595,721537506022014,0


In [40]:
# Percentage of report rows (GEOID20s) that have at least one address.
covered_rows = int((final_df['address_count'] >= 1).sum())
covered_rows / len(final_df) * 100

41.13802461297952