In [None]:
# pip install geopandas, geopy

In [6]:
import os
# Creating paths
out_folderpath = os.path.join("..", "out")
os.makedirs(out_folderpath, exist_ok=True)

filepath = os.path.join(out_folderpath, "datacenters_usa.geojson")

In [7]:
import geopandas as gpd

# Loading the data into A geopandas dataframe
gdf = gpd.read_file(os.path.join("..", "data", "datacenters.geojson"))

# Filtering the dataframe to only include locations within the US
gdf_usa = gdf[gdf['country'] == 'USA']

In [9]:
from geopy.geocoders import Nominatim
from shapely.geometry import Point
import json
from tqdm import tqdm


# Iterate through each row to check and update geometry
for i, row in tqdm(gdf_usa.iterrows()):

    if row['geometry'] is None or row['geometry'].is_empty:
        
        # Dynamically create a geolocator object with a unique user_agent for each row
        geolocator = Nominatim(user_agent=f"geocoding_address_{i}")
        
        # Creates an address for the geolocator
        address = f"{row['address']}, {row['postal']}, {row['city']}, {row['state']}, {row['country']}"
        location = geolocator.geocode(address)
        
        if location:
            # Primary location found, updating the geometry
            gdf_usa.at[i, 'geometry'] = Point(location.longitude, location.latitude)
            gdf_usa.at[i, 'latitude'] = location.latitude
            gdf_usa.at[i, 'longitude'] = location.longitude
        else:
            # Primary location not found, trying alternate address
            alt_address = f"{row['postal']}, {row['city']}, {row['state']}, {row['country']}"
            alt_location = geolocator.geocode(alt_address)

            if alt_location:
                # Alternate location found, updating the geometry
                gdf_usa.at[i, 'geometry'] = Point(alt_location.longitude, alt_location.latitude)
                gdf_usa.at[i, 'latitude'] = alt_location.latitude
                gdf_usa.at[i, 'longitude'] = alt_location.longitude
            else:
                # Neither primary nor alternate location found
                print("Geometry not found")


gdf_usa_filtered = gdf_usa[gdf_usa['geometry'].notna()]

# Convert GeoDataFrame to JSON and save
json_str = gdf_usa_filtered.to_json()

json_dict = json.loads(json_str)

with open(filepath, 'w') as f:
    json.dump(json_dict, f, indent=2)

print("GeoJSON file has been formatted and saved.")

0it [00:00, ?it/s]

182it [00:01, 181.02it/s]

Geometry not found


1048it [00:01, 637.15it/s]

Geometry not found


1316it [00:02, 436.86it/s]

Geometry not found


1782it [00:25,  7.05it/s] 

Geometry not found


2012it [00:33, 29.74it/s]

Geometry not found


2338it [00:55, 19.99it/s]

Geometry not found


2343it [00:57, 16.21it/s]

Geometry not found


2526it [01:01, 41.05it/s]


GeoJSON file has been formatted and saved.


## References 

https://medium.com/@gopesh3652/geocoding-with-python-using-nominatim-a-beginners-guide-220b250ca48d 

https://geopy.readthedocs.io/en/stable/ 