In [2]:
import pandas as pd
from geocodio import GeocodioClient
from dotenv import load_dotenv
import os
import time

# Load API key from .env file
load_dotenv()
api_key = os.getenv("GEOCODIO_API_KEY")
if not api_key:
    # Fail fast: os.getenv returns None when the variable is absent, and the
    # problem would otherwise only surface later as per-chunk geocoding errors.
    raise RuntimeError(
        "GEOCODIO_API_KEY is not set; add it to your .env file or environment."
    )

# Initialize the Geocodio client
client = GeocodioClient(api_key)

# Function to chunk the address list
def chunked(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items.

    Args:
        iterable: A sequence that supports ``len()`` and slicing (e.g. a list).
        size: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Slices of ``iterable`` in their original order; the last one may be
        shorter than ``size``.

    Raises:
        ValueError: If ``size`` is not positive. Because this is a generator,
            the error is raised when iteration starts, not at call time.
    """
    if size <= 0:
        # A negative step would make range() empty and silently drop every
        # item; zero would raise an unrelated-looking range() error.
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(iterable), size):
        yield iterable[start:start + size]

# Function to perform batch geocoding and handle exceptions
def batch_geocode_addresses(addresses, chunk_size=10000, delay_seconds=610):
    """Geocode ``addresses`` in chunks and return one result per input address.

    Each chunk is sent through the module-level ``client`` with
    ``batch_geocode``. When a chunk raises, the error is printed and the chunk
    is represented by ``{'results': []}`` placeholders, so the returned list
    keeps the same length and order as ``addresses``. Without the placeholders
    every result after a failed chunk would shift and end up attached to the
    wrong row when merged back by position.

    Args:
        addresses: Sequence of address strings (must support ``len()`` and
            slicing, as required by ``chunked``).
        chunk_size: Maximum number of addresses sent per request.
        delay_seconds: Pause after each successful chunk to stay under the
            API rate limit. Defaults to the original 610 seconds.

    Returns:
        list: Geocoding results in input order, with an empty-result
        placeholder for every address in a chunk that failed.
    """
    all_results = []
    for chunk_number, chunk in enumerate(chunked(addresses, chunk_size), start=1):
        try:
            # Materialize inside the try so a bad response cannot leave
            # all_results partially extended.
            geocoded_chunk = list(client.batch_geocode(chunk))
        except Exception as e:
            print(f"Error during batch geocoding of chunk {chunk_number}: {e}")
            # Keep positions aligned with the input addresses.
            all_results.extend({'results': []} for _ in chunk)
            continue
        all_results.extend(geocoded_chunk)  # Concatenate results from each chunk
        print(f"Successfully geocoded chunk {chunk_number}")
        time.sleep(delay_seconds)  # Delay to avoid hitting API rate limits
    return all_results

# Load addresses from CSV files
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
walmart_df = pd.read_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/walmart.csv')
# Read ZIP codes as text: parsed as integers, a code such as 02134 would lose
# its leading zero and be geocoded as "2134".
zips_df = pd.read_csv(
    '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/zips.csv',
    dtype={'zip': str},
)

# Extract addresses and ZIP codes as lists
walmart_addresses = walmart_df['Address'].tolist()  # Assuming 'Address' is the column name
zip_codes = zips_df['zip'].astype(str).tolist()  # Assuming 'zip' is the column name

# Batch geocode
walmart_geocoded = batch_geocode_addresses(walmart_addresses)
zip_geocoded = batch_geocode_addresses(zip_codes)

Successfully geocoded chunk 1
Successfully geocoded chunk 1
Successfully geocoded chunk 2
Successfully geocoded chunk 3
Successfully geocoded chunk 4


In [3]:
# function to extract lat/lng from the geocoded data
def extract_lat_lng(geocoded_data):
    """Return the first match's ``(lat, lng)`` for every geocoded entry.

    Args:
        geocoded_data: Iterable of mapping-like entries. An entry with a
            non-empty ``'results'`` list contributes the ``'location'`` of its
            first result.

    Returns:
        list[tuple]: One ``(lat, lng)`` tuple per entry, in input order.
        Entries that are ``None``, have no ``'results'`` key, or have an empty
        ``'results'`` list contribute ``(None, None)``, so the output length
        always equals the input length.
    """
    lat_lng_list = []
    for item in geocoded_data:
        # .get() tolerates entries without a 'results' key (presumably what a
        # failed lookup looks like; verify against the API response), and
        # ``item or {}`` tolerates None entries.
        results = (item or {}).get('results')
        if results:
            location = results[0]['location']
            lat_lng_list.append((location['lat'], location['lng']))
        else:
            lat_lng_list.append((None, None))  # No match: keep the slot
    return lat_lng_list

# Reduce each geocoded result set to a list of (lat, lng) pairs
walmart_lat_lng = extract_lat_lng(geocoded_data=walmart_geocoded)
zip_lat_lng = extract_lat_lng(geocoded_data=zip_geocoded)

In [4]:
# Add lat/lng to the Walmart dataframe
walmart_df = pd.read_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/walmart.csv')
# Keep ZIP codes as text so leading zeros survive the round trip back to CSV
zip_df = pd.read_csv(
    '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/zips.csv',
    dtype={'zip': str},
)

# The coordinate lists must match the dataframes row for row. pandas aligns
# the assigned frame on the row index, so a shorter or longer list would be
# silently NaN-padded or truncated instead of raising.
if len(walmart_lat_lng) != len(walmart_df):
    raise ValueError(
        f"walmart: {len(walmart_lat_lng)} coordinate pairs for {len(walmart_df)} rows"
    )
if len(zip_lat_lng) != len(zip_df):
    raise ValueError(
        f"zips: {len(zip_lat_lng)} coordinate pairs for {len(zip_df)} rows"
    )

walmart_df[['latitude', 'longitude']] = pd.DataFrame(walmart_lat_lng, columns=['latitude', 'longitude'])
zip_df[['latitude', 'longitude']] = pd.DataFrame(zip_lat_lng, columns=['latitude', 'longitude'])

# Save the updated dataframes
walmart_df.to_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/walmart_geocoded.csv', index=False)
zip_df.to_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/zips_geocoded.csv', index=False)