# In[9]:
import pandas as pd
from geopy.distance import great_circle

# Read the sampled distance rows and the geocoded ZIP sample from the interim data folder
distance_sample_df = pd.read_csv(
    '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/interim/distance_sample.csv'
)
zips_sample_geocoded_df = pd.read_csv(
    '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/interim/zips_sample_geocoded.csv'
)

# 'origin' holds a comma-separated pair, optionally wrapped in double quotes.
# Strip the quotes, split on the comma, and store the two parts as floats:
# the first part becomes 'lat_origin', the second 'lon_origin'.
origin_parts = distance_sample_df['origin'].str.strip('"').str.split(',', expand=True)
distance_sample_df[['lat_origin', 'lon_origin']] = origin_parts.astype(float)

# Thin wrapper around geopy's great_circle that reports the result in miles
def calculate_great_circle_distance(origin, destination):
    """Return the great-circle distance between *origin* and *destination* in miles.

    Both arguments are handed unchanged to ``great_circle``; the code in this
    file calls it with ``(latitude, longitude)`` tuples of floats.
    """
    separation = great_circle(origin, destination)
    return separation.miles

# Calculate great circle distance for each row in the distance_sample_df.
def _parse_coordinate_pair(text):
    """Turn a comma-separated coordinate string into a tuple of two floats.

    Surrounding double quotes are stripped first, and only the first two
    comma-separated parts are used.
    """
    parts = text.strip('"').split(',')
    return (float(parts[0]), float(parts[1]))


# The distances are computed in row order and assigned as one whole column,
# instead of writing cell-by-cell with .at while iterating with iterrows().
# That avoids mutating the frame during iteration and does not depend on the
# index labels being unique.
# NOTE(review): 'destination' is presumably in the same latitude,longitude
# order as 'origin' -- confirm against the data.
distance_sample_df['great_circle_distance'] = [
    calculate_great_circle_distance(
        (lat_origin, lon_origin),
        _parse_coordinate_pair(destination_text),
    )
    for lat_origin, lon_origin, destination_text in zip(
        distance_sample_df['lat_origin'],
        distance_sample_df['lon_origin'],
        distance_sample_df['destination'],
    )
]

# Look up the RUCA value for a latitude/longitude pair in the ZIP table
def find_ruca(lat, lon, zips_df):
    """Return the 'ruca' value of the first row of *zips_df* matching (*lat*, *lon*).

    A row matches when its 'latitude' and 'longitude' both equal the given
    values once everything has been rounded to 6 decimal places. This is an
    exact comparison at that precision, not a nearest-neighbour search.
    Returns ``None`` when no row matches.
    """
    target_lat = round(lat, 6)
    target_lon = round(lon, 6)

    same_lat = zips_df['latitude'].round(6) == target_lat
    same_lon = zips_df['longitude'].round(6) == target_lon
    hits = zips_df[same_lat & same_lon]

    if hits.empty:
        return None
    # Select the first matching row, then read its 'ruca' entry
    return hits.iloc[0]['ruca']

# Add a 'ruca' column by looking up each row's origin coordinates in the ZIP table
def _ruca_for_row(row):
    """Return the find_ruca result for this row's 'lat_origin' / 'lon_origin'."""
    return find_ruca(row['lat_origin'], row['lon_origin'], zips_sample_geocoded_df)


distance_sample_df['ruca'] = distance_sample_df.apply(_ruca_for_row, axis=1)

# Write the frame, including the new 'great_circle_distance' and 'ruca' columns,
# back to the interim CSV (without the index)
distance_sample_csv = '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/interim/distance_sample.csv'
distance_sample_df.to_csv(distance_sample_csv, index=False)

# In[10]:
# Remove the 'lat_origin' and 'lon_origin' columns from the frame in place
origin_coordinate_columns = ['lat_origin', 'lon_origin']
distance_sample_df.drop(columns=origin_coordinate_columns, inplace=True)

# Write the trimmed frame to the interim CSV (without the index)
updated_csv_path = '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/interim/distance_sample.csv'
distance_sample_df.to_csv(updated_csv_path, index=False)

# Leave the path as the cell's final expression so the notebook displays it
updated_csv_path

# Out[10]: '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/interim/distance_sample.csv'