In [14]:
import pandas as pd
import os
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import geopandas as gpd
import contextily as ctx
import matplotlib as mpl
from fiona.io import ZipMemoryFile
import io
import matplotlib.pyplot as plt
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
valid_files = [
    os.path.join("../data/address", file)
    for file in os.listdir("../data/address")
    if file.split(".")[-1] == "xz"
]

total_dots_df = []

for county_file in tqdm(sorted(valid_files)):
    county_shape_df = pd.read_csv(county_file)
    county_shape_df.crs = "epsg:4326"
    total_dots_df.append(county_shape_df)

total_dots_df = pd.concat(total_dots_df)
print(total_dots_df)

100%|██████████| 3221/3221 [02:15<00:00, 23.79it/s] 


                                          address          GEOID20  longitude  \
0      521 mossy oak ridge, prattville, al, 36066   10010205033012 -86.429760   
1    209 high pointe ridge, prattville, al, 36066   10010205033012 -86.430169   
2    208 high pointe ridge, prattville, al, 36066   10010205033012 -86.430166   
3       101 lake haven way, prattville, al, 36066   10010205033012 -86.433429   
4       103 lake haven way, prattville, al, 36066   10010205033012 -86.433428   
..                                            ...              ...        ...   
592                                           NaN  721537504001005        NaN   
593                                           NaN  721537504004036        NaN   
594                                           NaN  721537501022020        NaN   
595                                           NaN  721537505012011        NaN   
596                                           NaN  721537504003006        NaN   

      latitude  
0    32.46

# Merge the geometries, so that each county has at least one value?

In [9]:
total_dots_df
gdf = gpd.GeoDataFrame(
    total_dots_df, geometry=gpd.points_from_xy(total_dots_df.longitude, total_dots_df.latitude), crs="EPSG:4326"
)
gdf

Unnamed: 0,address,GEOID20,longitude,latitude,geometry
0,"521 mossy oak ridge, prattville, al, 36066",10010205033012,-86.429760,32.468279,POINT (-86.42976 32.46828)
1,"209 high pointe ridge, prattville, al, 36066",10010205033012,-86.430169,32.470367,POINT (-86.43017 32.47037)
2,"208 high pointe ridge, prattville, al, 36066",10010205033012,-86.430166,32.470378,POINT (-86.43017 32.47038)
3,"101 lake haven way, prattville, al, 36066",10010205033012,-86.433429,32.469669,POINT (-86.43343 32.46967)
4,"103 lake haven way, prattville, al, 36066",10010205033012,-86.433428,32.469648,POINT (-86.43343 32.46965)
...,...,...,...,...,...
592,,721537504001005,,,POINT EMPTY
593,,721537504004036,,,POINT EMPTY
594,,721537501022020,,,POINT EMPTY
595,,721537505012011,,,POINT EMPTY


In [16]:
gdf['covered'] = gdf['address'].apply(lambda x: x != np.nan)
gdf

Unnamed: 0,address,GEOID20,longitude,latitude,geometry,covered
0,"521 mossy oak ridge, prattville, al, 36066",10010205033012,-86.429760,32.468279,POINT (-86.42976 32.46828),True
1,"209 high pointe ridge, prattville, al, 36066",10010205033012,-86.430169,32.470367,POINT (-86.43017 32.47037),True
2,"208 high pointe ridge, prattville, al, 36066",10010205033012,-86.430166,32.470378,POINT (-86.43017 32.47038),True
3,"101 lake haven way, prattville, al, 36066",10010205033012,-86.433429,32.469669,POINT (-86.43343 32.46967),True
4,"103 lake haven way, prattville, al, 36066",10010205033012,-86.433428,32.469648,POINT (-86.43343 32.46965),True
...,...,...,...,...,...,...
592,,721537504001005,,,POINT EMPTY,True
593,,721537504004036,,,POINT EMPTY,True
594,,721537501022020,,,POINT EMPTY,True
595,,721537505012011,,,POINT EMPTY,True


In [None]:
# import matplotlib.colors as clrs
# cmap = clrs.ListedColormap(['red', 'green'])

In [10]:
from matplotlib.pyplot import figure

<Figure size 1000x1000 with 0 Axes>

<Figure size 1000x1000 with 0 Axes>

In [20]:
fdf = gdf.drop_duplicates(subset='GEOID20', keep="last")
fdf

Unnamed: 0,address,GEOID20,longitude,latitude,geometry,covered
45,"205 high pointe ridge, prattville, al, 36066",10010205033012,-86.430117,32.470356,POINT (-86.43012 32.47036),True
51,"428 co rd 29, prattville, al, 36067",10010206001009,-86.496015,32.439109,POINT (-86.49602 32.43911),True
65,"209 evergreen st, prattville, al, 36067",10010206003005,-86.481398,32.455588,POINT (-86.48140 32.45559),True
99,"339 pratt st, prattville, al, 36067",10010207002006,-86.464739,32.458606,POINT (-86.46474 32.45861),True
102,"328 moncrief st, prattville, al, 36067",10010207002008,-86.466085,32.458742,POINT (-86.46608 32.45874),True
...,...,...,...,...,...,...
592,,721537504001005,,,POINT EMPTY,True
593,,721537504004036,,,POINT EMPTY,True
594,,721537501022020,,,POINT EMPTY,True
595,,721537505012011,,,POINT EMPTY,True


In [None]:
fdf.to_crs("EPSG:4326")
ax = fdf.plot(markersize=1, color='green', alpha=0.5)
ctx.add_basemap(ax, crs=fdf.crs.to_string(), source=ctx.providers.Stamen.Toner)
# ax.set_xlim([-130, -60])
# ax.set_ylim([20, 55])
# plt.title('Address')
fig =plt.gcf()
fig.set_size_inches(17,11)
plt.savefig('../docs/us_addresses.png', dpi=100)