In [1]:
import pandas as pd
import requests
import json
from tqdm import tqdm
import time
import os
import traceback
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Lowercase two-letter state abbreviation; used below as the file-name prefix and as the 'abbr' lookup key.
state = 'va'

In [3]:
# Load the state/county listing; its entries are iterated below to pick out one state's files.
with open('../data/state_county.json', 'r') as f:
    scj = json.load(f)
# Number of entries in the listing.
len(scj)

1585

In [4]:
# Keep only the entries whose text before the first underscore is the selected state.
state_files = [name for name in scj if name.split('_', 1)[0] == state]
len(state_files)

133

In [5]:
# Show the selected '<state>_<county>' file stems for a visual check.
state_files

['va_accomack',
 'va_albemarle',
 'va_alexandria_city',
 'va_alleghany',
 'va_amelia',
 'va_amherst',
 'va_appomattox',
 'va_arlington',
 'va_augusta',
 'va_bath',
 'va_bedford',
 'va_bland',
 'va_botetourt',
 'va_bristol_city',
 'va_brunswick',
 'va_buchanan',
 'va_buckingham',
 'va_buena_vista_city',
 'va_campbell',
 'va_caroline',
 'va_carroll',
 'va_charles_city',
 'va_charlotte',
 'va_charlottesville_city',
 'va_chesapeake_city',
 'va_chesterfield',
 'va_clarke',
 'va_colonial_heights_city',
 'va_covington_city',
 'va_craig',
 'va_culpeper',
 'va_cumberland',
 'va_danville_city',
 'va_dickenson',
 'va_dinwiddie',
 'va_emporia_city',
 'va_essex',
 'va_fairfax',
 'va_fairfax_city',
 'va_falls_church_city',
 'va_fauquier',
 'va_floyd',
 'va_fluvanna',
 'va_franklin',
 'va_franklin_city',
 'va_frederick',
 'va_fredericksburg_city',
 'va_galax_city',
 'va_giles',
 'va_gloucester',
 'va_goochland',
 'va_grayson',
 'va_greene',
 'va_greensville',
 'va_halifax',
 'va_hampton_city',
 'va_h

In [6]:
# County-level FIPS lookup table (columns: fips, county).
county_fips = pd.read_csv('../data/fips_county.csv.xz', dtype={'fips':object}) # read fips as a string so leading zeros (e.g. '01001') are kept
county_fips

Unnamed: 0,fips,county
0,01000,alabama
1,01001,autauga_county
2,01003,baldwin_county
3,01005,barbour_county
4,01007,bibb_county
...,...,...
3190,56037,sweetwater_county
3191,56039,teton_county
3192,56041,uinta_county
3193,56043,washakie_county


In [7]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the column
# edits applied to test_fips in the next cell also change county_fips.
test_fips = county_fips

In [8]:
# Strip a single trailing '_county' or '_city' suffix so names can be compared without it.
# The pattern is anchored at the end of the string: a plain str.replace removes every
# occurrence, which would turn a name such as 'charles_city_county' into 'charles'
# instead of 'charles_city'. The vectorised .str accessor also leaves missing values
# untouched instead of raising.
# NOTE: test_fips is the same object as county_fips (it was assigned without a copy),
# so this rewrites county_fips['county'] as well.
test_fips['county'] = test_fips['county'].str.replace(r'_(?:county|city)$', '', regex=True)
# Rows whose stripped name already appeared earlier in the table.
test_fips[test_fips['county'].duplicated()]

Unnamed: 0,fips,county
114,05001,arkansas
120,05013,calhoun
124,05021,clay
125,05023,cleburne
133,05039,dallas
...,...,...
3183,56023,lincoln
3186,56029,park
3187,56031,platte
3188,56033,sheridan


In [9]:
def get_fips(county_name, county_fips):
    """Look up the FIPS code(s) of the counties whose name contains ``county_name``.

    Parameters
    ----------
    county_name : str
        Text searched for, as a literal substring, in the ``county`` column.
        The name is used as given; no suffix such as ``_county`` is stripped here.
    county_fips : pandas.DataFrame
        Lookup table with at least the columns ``county`` and ``fips``.

    Returns
    -------
    None
        When no row matches.
    scalar
        The ``fips`` value of the row when exactly one row matches.
    pandas.DataFrame
        The matching rows when more than one row matches, so the caller can
        disambiguate.
    """
    # regex=False: match the name literally, so characters such as '.' or '(' in a
    # name are not interpreted as a pattern.
    # na=False: rows with a missing county name count as "no match" instead of
    # producing a NaN mask that cannot be used for boolean indexing.
    name_matches = county_fips['county'].str.contains(county_name, regex=False, na=False)
    matched_fips = county_fips[name_matches]
    if len(matched_fips) > 1:
        return matched_fips
    if len(matched_fips) <= 0:
        return None
    return matched_fips['fips'].values[0]

In [10]:
# No output is shown: the '_county' suffix was already stripped from county_fips['county']
# (through the test_fips alias), so no name contains 'williamsburg_county' and None is returned.
get_fips('williamsburg_county', county_fips)

In [11]:
# Same for the '_city' form: the suffix has been stripped from the table, so this returns None.
get_fips('williamsburg_city', county_fips)

In [12]:
# The bare name matches more than one row, so the matching rows are returned instead of a single code.
get_fips('williamsburg', county_fips)

Unnamed: 0,fips,county
2400,45089,williamsburg
3000,51830,williamsburg


In [13]:
# print() is used because a bare None result produces no cell output.
print(get_fips('blah', county_fips))

None


# Remove non-Virginia FIPS codes from the selection

In [14]:
# State-level FIPS lookup (columns: fips, state, abbr); fips is read as a string.
# NOTE(review): the displayed codes are not zero-padded ('1', not '01') — keep that in mind
# when comparing them with the two-character prefix of the county codes.
state_fips = pd.read_csv('../data/fips_state.csv', dtype={'fips':object})
state_fips

Unnamed: 0,fips,state,abbr
0,1,alabama,al
1,2,alaska,ak
2,4,arizona,az
3,5,arkansas,ar
4,6,california,ca
5,8,colorado,co
6,9,connecticut,ct
7,10,delaware,de
8,11,district_of_columbia,dc
9,12,florida,fl


In [15]:
# FIPS code of the selected state, as a two-character string.
# The state table lists codes without zero padding (e.g. '1' for Alabama), while the
# county codes it is compared against start with a two-digit state prefix ('01...').
# Padding here keeps the comparison valid for states whose code is below 10.
state_fip = str(state_fips.loc[state_fips['abbr'] == state, 'fips'].values[0]).zfill(2)
state_fip

'51'

In [16]:
# The first two characters of a county FIPS code identify the state.
# The vectorised .str slice replaces a per-row lambda and leaves missing codes as NaN
# instead of raising.
county_fips['state_fip'] = county_fips['fips'].str[:2]
county_fips

Unnamed: 0,fips,county,state_fip
0,01000,alabama,01
1,01001,autauga,01
2,01003,baldwin,01
3,01005,barbour,01
4,01007,bibb,01
...,...,...,...
3190,56037,sweetwater,56
3191,56039,teton,56
3192,56041,uinta,56
3193,56043,washakie,56


In [17]:
# Restrict the county table to the rows belonging to the selected state.
county_fips = county_fips.loc[county_fips['state_fip'] == state_fip]
county_fips

Unnamed: 0,fips,county,state_fip
2865,51000,virginia,51
2866,51001,accomack,51
2867,51003,albemarle,51
2868,51005,alleghany,51
2869,51007,amelia,51
...,...,...,...
2997,51800,suffolk,51
2998,51810,virginia_beach,51
2999,51820,waynesboro,51
3000,51830,williamsburg,51


# Import the state's block shapefile

In [18]:
# Block-level shapefile for the selected state. The directory and file names embed the
# two-digit state FIPS code, so the path is derived from state_fip rather than
# hard-coding '51'; it resolves to the same path for Virginia.
fp = "../data/shapefiles/tl_2020_{0}_tabblock20/tl_2020_{0}_tabblock20.shp".format(str(state_fip).zfill(2))
block_df = gpd.read_file(fp)
block_df

Unnamed: 0,STATEFP20,COUNTYFP20,TRACTCE20,BLOCKCE20,GEOID20,NAME20,MTFCC20,UR20,UACE20,UATYPE20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,HOUSING20,POP20,geometry
0,51,065,020101,1027,510650201011027,Block 1027,G5040,R,,,S,1547604,0,+37.8704905,-078.2665116,18,40,"POLYGON ((-78.27375 37.87051, -78.27324 37.870..."
1,51,197,050101,1061,511970501011061,Block 1061,G5040,U,97453,U,S,15055,0,+36.9409610,-081.0882937,6,8,"POLYGON ((-81.08860 36.94152, -81.08830 36.941..."
2,51,197,050101,3063,511970501013063,Block 3063,G5040,R,,,S,11611,0,+36.9412795,-080.9753199,0,0,"POLYGON ((-80.97608 36.94150, -80.97607 36.941..."
3,51,197,050301,1036,511970503011036,Block 1036,G5040,R,,,S,166245,0,+36.8857859,-081.2774449,3,4,"POLYGON ((-81.28102 36.88490, -81.28071 36.885..."
4,51,199,050500,2052,511990505002052,Block 2052,G5040,R,,,S,292731,0,+37.2259510,-076.5020127,0,0,"POLYGON ((-76.50644 37.22507, -76.50640 37.225..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163486,51,159,040100,3037,511590401003037,Block 3037,G5040,R,,,S,4561610,48673,+37.8986088,-076.7515189,18,38,"POLYGON ((-76.76612 37.88520, -76.76576 37.885..."
163487,51,109,950500,3014,511099505003014,Block 3014,G5040,R,,,S,43464,0,+37.8490036,-077.7906355,3,19,"POLYGON ((-77.79179 37.84769, -77.79164 37.848..."
163488,51,109,950102,1008,511099501021008,Block 1008,G5040,R,,,S,0,4267,+38.0630940,-077.8308694,0,0,"POLYGON ((-77.83180 38.06289, -77.83098 38.063..."
163489,51,650,011800,4009,516500118004009,Block 4009,G5040,U,90892,U,S,81063,0,+37.0129132,-076.3642622,30,188,"POLYGON ((-76.36596 37.01361, -76.36514 37.014..."


In [19]:
# Five-character county code: state code followed by county code (string concatenation).
block_df['county_fip'] = block_df['STATEFP20'] + block_df['COUNTYFP20']
block_df

Unnamed: 0,STATEFP20,COUNTYFP20,TRACTCE20,BLOCKCE20,GEOID20,NAME20,MTFCC20,UR20,UACE20,UATYPE20,FUNCSTAT20,ALAND20,AWATER20,INTPTLAT20,INTPTLON20,HOUSING20,POP20,geometry,county_fip
0,51,065,020101,1027,510650201011027,Block 1027,G5040,R,,,S,1547604,0,+37.8704905,-078.2665116,18,40,"POLYGON ((-78.27375 37.87051, -78.27324 37.870...",51065
1,51,197,050101,1061,511970501011061,Block 1061,G5040,U,97453,U,S,15055,0,+36.9409610,-081.0882937,6,8,"POLYGON ((-81.08860 36.94152, -81.08830 36.941...",51197
2,51,197,050101,3063,511970501013063,Block 3063,G5040,R,,,S,11611,0,+36.9412795,-080.9753199,0,0,"POLYGON ((-80.97608 36.94150, -80.97607 36.941...",51197
3,51,197,050301,1036,511970503011036,Block 1036,G5040,R,,,S,166245,0,+36.8857859,-081.2774449,3,4,"POLYGON ((-81.28102 36.88490, -81.28071 36.885...",51197
4,51,199,050500,2052,511990505002052,Block 2052,G5040,R,,,S,292731,0,+37.2259510,-076.5020127,0,0,"POLYGON ((-76.50644 37.22507, -76.50640 37.225...",51199
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163486,51,159,040100,3037,511590401003037,Block 3037,G5040,R,,,S,4561610,48673,+37.8986088,-076.7515189,18,38,"POLYGON ((-76.76612 37.88520, -76.76576 37.885...",51159
163487,51,109,950500,3014,511099505003014,Block 3014,G5040,R,,,S,43464,0,+37.8490036,-077.7906355,3,19,"POLYGON ((-77.79179 37.84769, -77.79164 37.848...",51109
163488,51,109,950102,1008,511099501021008,Block 1008,G5040,R,,,S,0,4267,+38.0630940,-077.8308694,0,0,"POLYGON ((-77.83180 38.06289, -77.83098 38.063...",51109
163489,51,650,011800,4009,516500118004009,Block 4009,G5040,U,90892,U,S,81063,0,+37.0129132,-076.3642622,30,188,"POLYGON ((-76.36596 37.01361, -76.36514 37.014...",51650


# Print example address file

In [20]:
# Load one county's address file as an example of the per-county input layout.
eaf = pd.read_csv('../data/va_craig.csv.xz')
eaf

Unnamed: 0,state,county,zip,longitude,latitude,address,id
0,va,craig,24127,-80.346633,37.376760,"758 valley roller mill rd,new castle,va,24127",va_craig
1,va,craig,24128,-80.346100,37.385808,"447 allen hill dr,newport,va,24128",va_craig
2,va,craig,24128,-80.357049,37.386491,"106 allen hill dr,newport,va,24128",va_craig
3,va,craig,24128,-80.367980,37.378768,"477 olde glade trl,newport,va,24128",va_craig
4,va,craig,24128,-80.367238,37.360939,"3923 northside rd,newport,va,24128",va_craig
...,...,...,...,...,...,...,...
3290,va,craig,24131,-80.256852,37.569588,"16068 paint bank rd,paint bank,va,24131",va_craig
3291,va,craig,24128,-80.366784,37.371116,"1039 olde glade trl,newport,va,24128",va_craig
3292,va,craig,24127,-80.136663,37.445012,"1025 waltons mountain rd,new castle,va,24127",va_craig
3293,va,craig,24131,-80.244443,37.570810,"935 hidden meadow ln,paint bank,va,24131",va_craig


In [21]:
# Column names of the example address file.
eaf.columns

Index(['state', 'county', 'zip', 'longitude', 'latitude', 'address', 'id'], dtype='object')

In [22]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any warning raised by the spatial join below — consider
# narrowing the filter to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

# Map each address to its census block (GEOID20) using the geometry

In [30]:
# Columns written back to each address file (the join result is trimmed to these plus 'geoid20').
ADDRESS_COLUMNS = ['state', 'county', 'zip', 'longitude', 'latitude', 'address', 'id']
# Largest share of addresses that may fail to land in any block before the run is aborted.
MAX_LOSS_FRACTION = 0.05

# Only the block id and the polygons are needed for the join. Slicing once, outside the
# loop, gives every iteration the same right-hand frame so its spatial index can be reused.
blocks = block_df[['GEOID20', 'geometry']]

for sc in tqdm(state_files):
    save_file = '../data/state_county/%s.csv.xz' % sc
    scdf = pd.read_csv(save_file)

    # Skip files that already carry the block id.
    if 'geoid20' in scdf.columns:
        continue

    # Already joined, but with the upper-case column name: normalise the name and move on.
    if 'GEOID20' in scdf.columns:
        scdf = scdf.rename(columns={'GEOID20': 'geoid20'})
        scdf.to_csv(save_file, index=False)
        continue

    # Turn the address table into point geometries. The points are tagged with the block
    # layer's CRS so both sides of the join carry the same CRS metadata.
    # NOTE(review): this assumes longitude/latitude are expressed in the block layer's
    # coordinate system — if they are stored in another datum, build the frame with that
    # CRS and reproject with .to_crs(block_df.crs) instead. TODO confirm.
    gdf = gpd.GeoDataFrame(
        scdf,
        geometry=gpd.points_from_xy(scdf.longitude, scdf.latitude),
        crs=block_df.crs,
    )

    # Spatial join with the addresses on the left: the result keeps the address rows in
    # their original file order and is indexed by address.
    county_with_fips = gdf.sjoin(blocks, how='inner', predicate='intersects')

    # A point lying on a shared block boundary intersects more than one polygon and would
    # appear once per polygon. Keep a single block per address so rows are not duplicated
    # and the loss check below counts addresses, not matches.
    county_with_fips = county_with_fips[~county_with_fips.index.duplicated(keep='first')]

    county_with_fips = county_with_fips[ADDRESS_COLUMNS + ['GEOID20']]
    county_with_fips = county_with_fips.rename(columns={'GEOID20': 'geoid20'})

    # Addresses that did not fall inside any block are dropped by the inner join;
    # a small loss is tolerated, a larger one aborts before the file is overwritten.
    lost = len(scdf) - len(county_with_fips)
    if lost > len(scdf) * MAX_LOSS_FRACTION:
        raise Exception('Margins larger %d%% (%s out of %s) for %s'
                        % (round(MAX_LOSS_FRACTION * 100), lost, len(scdf), sc))
    county_with_fips.to_csv(save_file, index=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [00:09<00:00, 14.01it/s]
