In [13]:
from geopy.distance import great_circle
from scipy.spatial import cKDTree
import pandas as pd
import numpy as np

# Load the geocoded inputs produced earlier in the pipeline.
# NOTE(review): these are absolute paths on one developer's machine; consider a
# configurable data directory so the notebook can run elsewhere.

# Walmart store locations. Later cells read the 'latitude', 'longitude' and
# 'Store #' columns from this frame.
walmart_geocoded = pd.read_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/walmart_geocoded.csv')

# ZIP code locations. ZIP codes are identifiers, not numbers: read them as
# strings so pandas does not infer an integer column and strip leading zeros
# (e.g. '02134' -> 2134) before the value is written back out.
zips_geocoded = pd.read_csv(
    '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/zips_geocoded.csv',
    dtype={'zip': str},
)

In [14]:
# Diagnostic: per-column count of missing (NaN) and infinite coordinates in the
# store table, printed as two separate Series.
store_lat_lon = walmart_geocoded[['latitude', 'longitude']]
print(store_lat_lon.isna().sum())
print(np.isinf(store_lat_lon).sum())

latitude     75
longitude    75
dtype: int64
latitude     0
longitude    0
dtype: int64


In [15]:
# Drop (not impute) every store row whose latitude or longitude is NaN or
# infinite. np.isfinite is False for NaN as well as +/-inf, so one mask covers
# both checks; the surviving rows keep their original order and index labels.
has_usable_coords = np.isfinite(walmart_geocoded[['latitude', 'longitude']]).all(axis=1)
walmart_geocoded = walmart_geocoded[has_usable_coords]

In [17]:
# Find, for every ZIP code, the Walmart stores within a fixed great-circle
# radius, and write one row per ZIP code to distance.csv.


def _to_unit_sphere(lat_lon_radians):
    """Map an (n, 2) array of [latitude, longitude] in radians to (n, 3) unit vectors.

    A KD-tree measures straight-line (Euclidean) distance. In the raw
    (lat, lon) plane that is not a geographic distance: a step in longitude
    covers less ground the further a point is from the equator, and longitude
    wraps around at +/-180 degrees. On the unit sphere the straight-line
    (chord) distance between two points grows monotonically with the
    great-circle angle between them, so a ball query there is exact.
    """
    lat = lat_lon_radians[:, 0]
    lon = lat_lon_radians[:, 1]
    cos_lat = np.cos(lat)
    return np.column_stack((cos_lat * np.cos(lon), cos_lat * np.sin(lon), np.sin(lat)))


# Latitude/longitude in radians, kept under their original names.
walmart_coords = np.radians(walmart_geocoded[['latitude', 'longitude']].values)
zip_coords = np.radians(zips_geocoded[['latitude', 'longitude']].values)

# Build the KD-tree over the store locations only; ZIP codes are the query points.
# The tree now holds 3-D unit vectors rather than 2-D (lat, lon) pairs.
walmart_tree = cKDTree(_to_unit_sphere(walmart_coords))

# Search radius: miles -> central angle in radians -> chord length on the unit
# sphere. Two points are within the angle theta exactly when their chord is
# at most 2 * sin(theta / 2).
radius_in_miles = 80
earth_radius_in_miles = 3959
radius_in_radians = radius_in_miles / earth_radius_in_miles
chord_radius = 2.0 * np.sin(radius_in_radians / 2.0)

# ZIP rows without finite coordinates cannot be located, so they are not sent
# to the tree and simply end up with no nearby stores.
zip_has_coords = np.isfinite(zip_coords).all(axis=1)

# results[i] lists the positional indices (into walmart_geocoded) of the stores
# near the i-th ZIP row.
results = [[] for _ in range(len(zip_coords))]
if zip_has_coords.any():
    matches = walmart_tree.query_ball_point(
        _to_unit_sphere(zip_coords[zip_has_coords]), r=chord_radius
    )
    for zip_pos, store_indices in zip(np.flatnonzero(zip_has_coords), matches):
        results[zip_pos] = store_indices

# Translate positional store indices into store numbers. The column is built
# positionally so it stays aligned with zips_geocoded whatever its index labels
# are; ZIP codes with no store in range get NaN (an empty CSV field).
store_numbers = walmart_geocoded['Store #'].to_numpy()
nearby_walmarts = [
    ', '.join(map(str, store_numbers[store_indices].tolist())) if len(store_indices) else np.nan
    for store_indices in results
]

# One row per ZIP code: the ZIP itself plus a comma-separated list of store
# numbers. The 'nearby_walmarts' column is always present, even if no ZIP code
# has a store in range.
results_df = pd.DataFrame({'zip': zips_geocoded['zip']})
results_df['nearby_walmarts'] = nearby_walmarts

# Save the results to a CSV file
results_df.to_csv('/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/distance.csv', index=False)