In [46]:
import io
import os
from pathlib import Path

import geopandas as gpd
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# For each address `.csv.xz` file, check that its set of GEOIDs matches the set of GEOIDs in the corresponding shapefile. If not, fix those files.

In [13]:
# Directory holding the per-county address files (`<county FIPS>.csv.xz`).
address_dir = '../../data/address/'
# Directory holding the per-county shapefiles (`<county FIPS>.geojson`).
shapefile_dir = '../../data/shapefiles'

In [19]:
# Gather every regular file directly under `address_dir` whose last suffix is
# `.xz`, ordered by path, and display the result.
p = Path(address_dir).iterdir()
xz_files = sorted(
    entry
    for entry in p
    if entry.is_file() and entry.suffix == '.xz'
)
xz_files

[PosixPath('../../data/address/01001.csv.xz'),
 PosixPath('../../data/address/01003.csv.xz'),
 PosixPath('../../data/address/01005.csv.xz'),
 PosixPath('../../data/address/01007.csv.xz'),
 PosixPath('../../data/address/01009.csv.xz'),
 PosixPath('../../data/address/01011.csv.xz'),
 PosixPath('../../data/address/01013.csv.xz'),
 PosixPath('../../data/address/01015.csv.xz'),
 PosixPath('../../data/address/01017.csv.xz'),
 PosixPath('../../data/address/01019.csv.xz'),
 PosixPath('../../data/address/01021.csv.xz'),
 PosixPath('../../data/address/01023.csv.xz'),
 PosixPath('../../data/address/01025.csv.xz'),
 PosixPath('../../data/address/01027.csv.xz'),
 PosixPath('../../data/address/01029.csv.xz'),
 PosixPath('../../data/address/01031.csv.xz'),
 PosixPath('../../data/address/01033.csv.xz'),
 PosixPath('../../data/address/01035.csv.xz'),
 PosixPath('../../data/address/01037.csv.xz'),
 PosixPath('../../data/address/01039.csv.xz'),
 PosixPath('../../data/address/01041.csv.xz'),
 PosixPath('.

In [None]:
# Compare every county's address file against its census-block shapefile.
# `error_files` maps county FIPS -> list of problem descriptions; counties
# without any problem are left out of the dict.
pbar = tqdm(xz_files)
error_files = {}
for file in pbar:
    # Reset all per-county state *before* anything can fail, so an exception can
    # never append to (or compare against) leftovers from the previous county.
    county_fip = file.name.split('.')[0]
    errors = []
    pbar.set_description('(%s) Checking county: %s, %s' % (len(error_files), file, county_fip))
    try:
        adf = pd.read_csv(file, dtype={'GEOID20': object})
        sdf = gpd.read_file(Path(shapefile_dir) / ('%s.geojson' % county_fip))

        # The comparisons live inside the `try` so they only ever run on frames
        # that were just loaded for this county; a missing column or an empty
        # shapefile is recorded as an error instead of aborting the whole run.
        if set(sdf['GEOID20']) != set(adf['GEOID20']):
            errors.append('GEOID column mismatch')

        # Only the first shapefile row is inspected for the county prefix.
        if county_fip != sdf['GEOID20'].values[0][:5]:
            errors.append('County prefix mismatch')
    except Exception as e:
        errors.append('Exception occured: %s' % e)

    if errors:
        error_files[county_fip] = errors
print('Mismatch in files: %s' % len(error_files))

(0) Checking county: ../../data/address/72113.csv.xz, 72113:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎| 3200/3221 [58:01<00:02,  9.08it/s]

In [96]:
# Inspect the problems collected by the check loop (county FIPS -> list of messages).
# NOTE(review): the recorded output shows counties with empty lists, which the
# loop's "only store non-empty lists" guard should not produce — presumably this
# output comes from an earlier version of the loop; re-run to confirm.
error_files

{'01001': [],
 '01003': [],
 '01005': [],
 '01007': [],
 '01009': [],
 '01011': [],
 '01013': [],
 '01015': [],
 '01017': [],
 '01019': [],
 '01021': [],
 '01023': [],
 '01025': [],
 '01027': [],
 '01029': [],
 '01031': [],
 '01033': [],
 '01035': []}

# Fix every flagged county: download any missing shapefile, then regenerate its address file.

In [52]:
def get_geojson(fip, save_dir ='../../data/shapefiles/'):
    """Download the 2020 census-block shapefile for one county and save it as GeoJSON.

    The Census directory listing is scraped for the county-level zip archive
    whose name contains ``fip``; that archive is read directly from its URL
    and written to ``<save_dir>/<fip>.geojson``.

    Args:
        fip: County FIPS code as a string (e.g. ``'06037'``).
        save_dir: Directory the GeoJSON file is written to.

    Raises:
        requests.HTTPError: If the directory listing cannot be fetched.
        IndexError: If no county archive in the listing matches ``fip``.
        AssertionError: If the output file does not exist after writing.
    """
    prefix = 'https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/TABBLOCK/2020/'
    # Fail fast on a hung connection or an HTTP error page instead of parsing garbage.
    r = requests.get(prefix, timeout=60)
    r.raise_for_status()
    # Name the parser explicitly so the result does not depend on which optional
    # parsers happen to be installed (and to avoid bs4's "no parser" warning).
    soup = BeautifulSoup(r.content, 'html.parser')
    # The listing mixes county-level and state-wide archives; the 28-character
    # length test keeps only the county-level ones.
    # NOTE(review): presumably these are named tl_2020_<FIPS>_tabblock20.zip — confirm.
    shape_zips = [
        a['href']
        for a in soup.find_all('a', href=True)
        if '.zip' in a['href'] and len(a['href']) == 28
    ]
    matches = [v for v in shape_zips if fip in v]
    if not matches:
        # Same exception type the bare `[0]` lookup used to raise, but with context.
        raise IndexError('No county shapefile archive found for FIPS %s at %s' % (fip, prefix))
    gdf = gpd.read_file(prefix + matches[0])
    save_filepath = Path(save_dir) / ('%s.geojson' % fip)
    # Pin the driver so the output is GeoJSON regardless of how the installed
    # geopandas version infers the format from the file extension.
    gdf.to_file(save_filepath, driver='GeoJSON')
    assert save_filepath.is_file(), 'Failed to write %s' % save_filepath

In [76]:
# Try the downloader on a single county (FIPS 06037).
get_geojson('06037')

In [72]:
# NOTE(review): the recorded output here is a plain list of FIPS codes, whereas
# the check loop builds a dict — presumably `error_files` was reassigned in a
# cell that is no longer in the notebook; confirm before relying on Run All.
error_files

['09001', '39097']

In [67]:
def download_missing_geojsons(fips, shapefile_dir='../../data/shapefiles/'):
    """Download the shapefile GeoJSON for every county that does not have one yet.

    Args:
        fips: Iterable of county FIPS codes (strings).
        shapefile_dir: Directory that is checked for existing
            ``<fip>.geojson`` files and that new downloads are saved into.
    """
    for fip in fips:
        if (Path(shapefile_dir) / ('%s.geojson' % fip)).is_file():
            continue # skip if file already exists
        print('Downloading: %s' % fip)
        # Save into the same directory that was just checked; otherwise a
        # non-default `shapefile_dir` would be inspected but never populated.
        get_geojson(fip, save_dir=shapefile_dir)

In [75]:
# Fetch a shapefile for every flagged county that does not already have one on disk.
download_missing_geojsons(error_files)

Downloading: 39097


In [88]:
def generate_empty_xz_file(fip, address_dir = '../../data/address/', shapefile_dir='../../data/shapefiles/'):
    """Write a placeholder address file for a county, seeded from its shapefile.

    The county's GeoJSON is loaded and reindexed to the columns ``address``,
    ``GEOID20``, ``longitude`` and ``latitude``; columns the shapefile does not
    have come out empty. The result is written to
    ``<address_dir>/<fip>.csv.xz``, replacing any existing file of that name.

    Args:
        fip: County FIPS code as a string.
        address_dir: Directory the ``.csv.xz`` file is written to.
        shapefile_dir: Directory containing ``<fip>.geojson``.

    Raises:
        AssertionError: If the GeoJSON is missing, or the output file does not
            exist after writing.
    """
    print('Generating .xz for: %s' % fip)
    geojson_path = Path(shapefile_dir) / ('%s.geojson' % fip)
    # Use a real message: `assert cond, print(...)` raises with a message of
    # None because `print` returns None.
    assert geojson_path.is_file(), 'No geojson for: %s' % fip
    sdf = gpd.read_file(geojson_path)
    cols = ['address','GEOID20', 'longitude', 'latitude']
    adf = sdf.reindex(columns=cols)
    xz_path = Path(address_dir) / ('%s.csv.xz' % fip)
    adf.to_csv(xz_path, index=False)
    assert xz_path.is_file(), 'Failed to write %s' % xz_path

In [89]:
# Write a placeholder address file for each flagged county.
for e in error_files:
    generate_empty_xz_file(e)

Generating .xz for: 09001
Generating .xz for: 39097


In [94]:
# def fix_xz_file(fip, address_dir = '../../data/address/', shapefile_dir='../../data/shapefiles/'):
#     assert os.path.isfile(os.path.join(address_dir, '%s.csv.xz' % fip)), print('No csv.xz for: %s' % fip)
#     assert os.path.isfile(os.path.join(shapefile_dir, '%s.geojson' % fip)), print('No geojson for: %s' % fip)
#     adf = pd.read_csv(os.path.join(address_dir, '%s.csv.xz'%fip), dtype={'GEOID20':object})
#     sdf = gpd.read_file(os.path.join(shapefile_dir,'%s.geojson' % fip))
#     # print(fip)
#     print(os.path.join(shapefile_dir, '%s.geojson' % fip))
#     # print(set(sdf['GEOID20']).symmetric_difference(set(adf['GEOID20'])))
#     print('[%s]: %s: %s' % (fip, len(sdf['GEOID20'].unique()), len(adf['GEOID20'].unique())))

# for e in error_files:
#     fix_xz_file(e)

../../data/shapefiles/09001.geojson
[09001]: 10997: 10997
../../data/shapefiles/39097.geojson
[39097]: 1373: 1373
