## Iterate across each county: reverse-geocode 10 random locations per census block to get an address for each
## Then remove duplicates and save back to the original data frame. Worry about verifying the freshness of the addresses later

In [2]:
import pandas as pd
import geopandas as gpd
import random
import numpy as np
from geopy.point import Point
from shapely.geometry import Point as sPoint
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from tqdm import tqdm
import pathlib

In [102]:
def get_lat_longs(gdf, census_block, point_per_block = 10):
    """Sample random points that lie inside one census block.

    Candidate points are drawn uniformly from the block's bounding box and
    kept only when the block geometry contains them (rejection sampling).

    Args:
        gdf: GeoDataFrame with a 'GEOID20' column and a geometry column.
        census_block: 'GEOID20' value selecting the block to sample from.
        point_per_block: Number of points to return.

    Returns:
        A list of ``point_per_block`` shapely points, expressed in the same
        coordinates as the geometries in ``gdf``.

    Raises:
        ValueError: If no row matches ``census_block`` or the matching
            geometry is empty or has zero area, since no random point
            could ever be accepted in that case.
    """
    aoi = gdf[gdf['GEOID20'] == census_block]
    if len(aoi) == 0:
        raise ValueError(f"No geometry found for GEOID20 {census_block!r}")

    # Merge every matching row into a single geometry
    aoi_geom = aoi.unary_union
    if aoi_geom is None or aoi_geom.is_empty or aoi_geom.area == 0:
        # Without this guard the sampling loop below would never terminate
        raise ValueError(
            f"Geometry for GEOID20 {census_block!r} is empty or has no area"
        )

    # find area bounds
    xmin, ymin, xmax, ymax = aoi_geom.bounds
    xext = xmax - xmin
    yext = ymax - ymin

    points = []
    while len(points) < point_per_block:
        # generate a random x and y inside the bounding box
        x = xmin + random.random() * xext
        y = ymin + random.random() * yext
        p = sPoint(x, y)
        if aoi_geom.contains(p):  # keep only points inside the geometry
            points.append(p)

    return points

In [109]:
# NOTE(review): the public Nominatim service asks for an application-specific
# user agent and throttled requests; "test" looks like a placeholder - confirm
# before running this over many blocks.
geolocator = Nominatim(user_agent="test")

def reverse_geocoding(lat, lon):
    """Return the address Nominatim reports for a coordinate, or None.

    Args:
        lat: Latitude of the location to look up.
        lon: Longitude of the location to look up.

    Returns:
        The 'display_name' of the reverse-geocoding result, or None when the
        lookup fails or yields no result.
    """
    try:
        location = geolocator.reverse(Point(lat, lon))
    except Exception:
        # Best-effort lookup: a failed request must not abort the whole run.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return None

    if location is None:
        # The geocoder found nothing for this coordinate
        return None
    return location.raw.get('display_name')

In [10]:
# URL template for the Census TIGER 2020 `tabblock20` shapefile archive;
# the `{county}` placeholder is filled in with str.format (e.g. '12049').
shape_file_url ='https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/TABBLOCK/2020/tl_2020_{county}_tabblock20.zip'

In [None]:
def fire_in_the_hole(file, county, geo_column_id='GEOID20', num_samples = 10, save=False):
    """Fill missing addresses in a CSV by reverse-geocoding random block points.

    Every row whose 'address' is null is replaced by ``num_samples`` copies,
    each given a random point inside the row's census block ('geometry',
    'longitude', 'latitude' columns) and the address returned by
    ``reverse_geocoding`` for that point. Rows that already have an address
    are kept unchanged.

    Args:
        file: Path of the CSV file to read (and to overwrite when ``save``).
        county: Value substituted into ``shape_file_url`` to locate the
            block shapefile to download.
        geo_column_id: Name of the block-id column shared by the CSV and
            the shapefile.
        num_samples: Number of random points generated per row that lacks
            an address.
        save: When True, write the combined frame back to ``file``.

    Returns:
        The combined DataFrame, or None when no address is missing.

    Raises:
        ValueError: If a block id from the CSV is absent from the shapefile.
    """
    # Read block ids as strings so they compare equal to the shapefile's ids
    df = pd.read_csv(file, dtype={geo_column_id: object})

    # We are only interested in reverse-geocoding addresses that are empty
    missing = df['address'].isnull()
    empty_df = df[missing]

    # Skip if there are no empty addresses
    if empty_df.empty:
        return None

    # Download the shapefiles
    gdf = gpd.read_file(shape_file_url.format(county=county))
    known_ids = set(gdf[geo_column_id])  # built once for O(1) membership tests

    dfs = []
    # Visit each block once, even when several rows share the same id
    for geoid in tqdm(empty_df[geo_column_id].unique()):
        if geoid not in known_ids:
            raise ValueError(
                f"{geo_column_id} {geoid!r} not found in the shapefile for county {county!r}"
            )

        pdf = empty_df[empty_df[geo_column_id] == geoid]
        pdf = pdf.loc[pdf.index.repeat(num_samples)].reset_index(drop=True)

        # One point per output row, so the assignment below always lines up
        pts = get_lat_longs(gdf, geoid, point_per_block=len(pdf))
        pdf['geometry'] = pts
        pdf['longitude'] = [pt.x for pt in pts]
        pdf['latitude'] = [pt.y for pt in pts]
        # Plain loop rather than np.vectorize: vectorize infers the output
        # dtype from the first result, which can turn a later None into the
        # string 'None', and it may call the function an extra time.
        pdf['address'] = [
            reverse_geocoding(lat, lon)
            for lat, lon in zip(pdf['latitude'], pdf['longitude'])
        ]
        dfs.append(pdf)

    rgeo_df = pd.concat(dfs)
    final_df = pd.concat([df[~missing], rgeo_df])
    if save:
        final_df.to_csv(file, index=False)
    return final_df

# Run the reverse geocoding for a single county file and show the result
file = '../data/address/12049.csv.xz'
num_samples = 10
county = '12049'
geo_column_id='GEOID20'
# `save` has to be passed by keyword: a positional argument placed after
# `num_samples=2` is a SyntaxError.
tdf = fire_in_the_hole(file, county, num_samples=2, save=True)
tdf

  return None
  return None
  4%|██▌                                                           | 44/1059 [01:05<25:07,  1.48s/it]

In [13]:
# Collect every *.xz file under ../data/address/ and walk through them in
# sorted order behind a progress bar.
p = pathlib.Path('../data/address/').glob('*.xz')
pbar = tqdm(sorted(p))
for file in pbar:
    # Show the name of the file currently being visited on the progress bar
    pbar.set_description("Processing: %s" % file.name)
    # print(file)

Processing: 17201.csv.xz:  13%|████▎                            | 427/3221 [00:00<00:00, 4269.74it/s]

../data/address/01001.csv.xz
../data/address/01003.csv.xz
../data/address/01005.csv.xz
../data/address/01007.csv.xz
../data/address/01009.csv.xz
../data/address/01011.csv.xz
../data/address/01013.csv.xz
../data/address/01015.csv.xz
../data/address/01017.csv.xz
../data/address/01019.csv.xz
../data/address/01021.csv.xz
../data/address/01023.csv.xz
../data/address/01025.csv.xz
../data/address/01027.csv.xz
../data/address/01029.csv.xz
../data/address/01031.csv.xz
../data/address/01033.csv.xz
../data/address/01035.csv.xz
../data/address/01037.csv.xz
../data/address/01039.csv.xz
../data/address/01041.csv.xz
../data/address/01043.csv.xz
../data/address/01045.csv.xz
../data/address/01047.csv.xz
../data/address/01049.csv.xz
../data/address/01051.csv.xz
../data/address/01053.csv.xz
../data/address/01055.csv.xz
../data/address/01057.csv.xz
../data/address/01059.csv.xz
../data/address/01061.csv.xz
../data/address/01063.csv.xz
../data/address/01065.csv.xz
../data/address/01067.csv.xz
../data/addres

Processing: 26021.csv.xz:  37%|███████████▋                    | 1176/3221 [00:00<00:00, 2782.13it/s]

../data/address/17201.csv.xz
../data/address/17203.csv.xz
../data/address/18001.csv.xz
../data/address/18003.csv.xz
../data/address/18005.csv.xz
../data/address/18007.csv.xz
../data/address/18009.csv.xz
../data/address/18011.csv.xz
../data/address/18013.csv.xz
../data/address/18015.csv.xz
../data/address/18017.csv.xz
../data/address/18019.csv.xz
../data/address/18021.csv.xz
../data/address/18023.csv.xz
../data/address/18025.csv.xz
../data/address/18027.csv.xz
../data/address/18029.csv.xz
../data/address/18031.csv.xz
../data/address/18033.csv.xz
../data/address/18035.csv.xz
../data/address/18037.csv.xz
../data/address/18039.csv.xz
../data/address/18041.csv.xz
../data/address/18043.csv.xz
../data/address/18045.csv.xz
../data/address/18047.csv.xz
../data/address/18049.csv.xz
../data/address/18051.csv.xz
../data/address/18053.csv.xz
../data/address/18055.csv.xz
../data/address/18057.csv.xz
../data/address/18059.csv.xz
../data/address/18061.csv.xz
../data/address/18063.csv.xz
../data/addres

Processing: 31103.csv.xz:  45%|██████████████▌                 | 1465/3221 [00:00<00:00, 2313.03it/s]

../data/address/26021.csv.xz
../data/address/26023.csv.xz
../data/address/26025.csv.xz
../data/address/26027.csv.xz
../data/address/26029.csv.xz
../data/address/26031.csv.xz
../data/address/26033.csv.xz
../data/address/26035.csv.xz
../data/address/26037.csv.xz
../data/address/26039.csv.xz
../data/address/26041.csv.xz
../data/address/26043.csv.xz
../data/address/26045.csv.xz
../data/address/26047.csv.xz
../data/address/26049.csv.xz
../data/address/26051.csv.xz
../data/address/26053.csv.xz
../data/address/26055.csv.xz
../data/address/26057.csv.xz
../data/address/26059.csv.xz
../data/address/26061.csv.xz
../data/address/26063.csv.xz
../data/address/26065.csv.xz
../data/address/26067.csv.xz
../data/address/26069.csv.xz
../data/address/26071.csv.xz
../data/address/26073.csv.xz
../data/address/26075.csv.xz
../data/address/26077.csv.xz
../data/address/26079.csv.xz
../data/address/26081.csv.xz
../data/address/26083.csv.xz
../data/address/26085.csv.xz
../data/address/26087.csv.xz
../data/addres

Processing: 41041.csv.xz:  60%|███████████████████▏            | 1933/3221 [00:00<00:00, 2964.11it/s]

../data/address/31103.csv.xz
../data/address/31105.csv.xz
../data/address/31107.csv.xz
../data/address/31109.csv.xz
../data/address/31111.csv.xz
../data/address/31113.csv.xz
../data/address/31115.csv.xz
../data/address/31117.csv.xz
../data/address/31119.csv.xz
../data/address/31121.csv.xz
../data/address/31123.csv.xz
../data/address/31125.csv.xz
../data/address/31127.csv.xz
../data/address/31129.csv.xz
../data/address/31131.csv.xz
../data/address/31133.csv.xz
../data/address/31135.csv.xz
../data/address/31137.csv.xz
../data/address/31139.csv.xz
../data/address/31141.csv.xz
../data/address/31143.csv.xz
../data/address/31145.csv.xz
../data/address/31147.csv.xz
../data/address/31149.csv.xz
../data/address/31151.csv.xz
../data/address/31153.csv.xz
../data/address/31155.csv.xz
../data/address/31157.csv.xz
../data/address/31159.csv.xz
../data/address/31161.csv.xz
../data/address/31163.csv.xz
../data/address/31165.csv.xz
../data/address/31167.csv.xz
../data/address/31169.csv.xz
../data/addres

Processing: 51580.csv.xz:  80%|█████████████████████████▌      | 2577/3221 [00:01<00:00, 2647.53it/s]

../data/address/41041.csv.xz
../data/address/41043.csv.xz
../data/address/41045.csv.xz
../data/address/41047.csv.xz
../data/address/41049.csv.xz
../data/address/41051.csv.xz
../data/address/41053.csv.xz
../data/address/41055.csv.xz
../data/address/41057.csv.xz
../data/address/41059.csv.xz
../data/address/41061.csv.xz
../data/address/41063.csv.xz
../data/address/41065.csv.xz
../data/address/41067.csv.xz
../data/address/41069.csv.xz
../data/address/41071.csv.xz
../data/address/42001.csv.xz
../data/address/42003.csv.xz
../data/address/42005.csv.xz
../data/address/42007.csv.xz
../data/address/42009.csv.xz
../data/address/42011.csv.xz
../data/address/42013.csv.xz
../data/address/42015.csv.xz
../data/address/42017.csv.xz
../data/address/42019.csv.xz
../data/address/42021.csv.xz
../data/address/42023.csv.xz
../data/address/42025.csv.xz
../data/address/42027.csv.xz
../data/address/42029.csv.xz
../data/address/42031.csv.xz
../data/address/42033.csv.xz
../data/address/42035.csv.xz
../data/addres

Processing: 53065.csv.xz:  80%|█████████████████████████▌      | 2577/3221 [00:01<00:00, 2647.53it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

