In [3]:
import pandas as pd
from geopy.distance import great_circle

# Load the three preprocessed CSV inputs, in order: geocoded Walmart stores,
# geocoded ZIP codes, and the per-ZIP list of nearby store numbers.
walmart_geocoded, zips_geocoded, distance_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/walmart_geocoded.csv',
        '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/zips_geocoded.csv',
        '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/distance.csv',
    )
)

# Define the model estimation function
def estimate_driving_time(great_circle_distance, ruca):
    """Estimate driving time from straight-line distance with a linear model.

    The estimate is ``intercept + slope * great_circle_distance`` plus a fixed
    adjustment chosen by the RUCA class of the origin.

    NOTE(review): the caller in this file passes the distance in miles and
    compares the result against 15/30/45/60 "min" thresholds, so the result is
    presumably in minutes -- confirm against the fitted model.

    Args:
        great_circle_distance: Straight-line distance between the two points.
        ruca: RUCA class label. Only the exact strings 'Rural', 'Suburban'
            and 'Urban' receive an adjustment; any other value gets none.

    Returns:
        The estimated driving time as a number.
    """
    intercept = 17.3068349820822
    distance_slope = 1.004209
    # Additive adjustment per RUCA class, tested in this order.
    ruca_adjustments = (
        ('Rural', 0.174183),
        ('Suburban', 1.177049),
        ('Urban', -1.351231),
    )

    baseline = intercept + distance_slope * great_circle_distance
    for label, adjustment in ruca_adjustments:
        if ruca == label:
            return baseline + adjustment
    # Unrecognised class: distance-only estimate.
    return baseline

# Create (or truncate) the results CSV and write its header row. One column
# per cumulative time threshold, keyed by ZIP code.
output_file = '/Users/pintoza/Desktop/dev/data-science/walmart-proximity/data/processed/time.csv'
csv_header = 'zip,15_min,30_min,45_min,60_min\n'
with open(output_file, mode='w') as results_file:
    results_file.write(csv_header)

# Loop through each ZIP code in distance.csv and count, per ZIP, how many of
# its nearby Walmarts fall within each cumulative driving-time threshold.
#
# Both coordinate tables are indexed once up front so every lookup inside the
# loop is a dict access instead of a full DataFrame scan per store.

# Store number -> (latitude, longitude). The first row wins when a store
# number appears more than once.
store_coords_by_number = {}
for number, lat, lon in zip(
    walmart_geocoded['Store #'],
    walmart_geocoded['latitude'],
    walmart_geocoded['longitude'],
):
    store_coords_by_number.setdefault(number, (lat, lon))

# ZIP code -> (latitude, longitude, ruca). The first row wins on duplicates.
zip_records = {}
for code, lat, lon, ruca_class in zip(
    zips_geocoded['zip'],
    zips_geocoded['latitude'],
    zips_geocoded['longitude'],
    zips_geocoded['ruca'],
):
    zip_records.setdefault(code, (lat, lon, ruca_class))

# (upper limit, output column). Buckets are cumulative: a store within 15 also
# counts toward 30, 45 and 60.
time_buckets = ((15, '15_min'), (30, '30_min'), (45, '45_min'), (60, '60_min'))

# Open the results file once in append mode; rows are still written as each
# ZIP finishes, and the context manager flushes them if the run is interrupted.
with open(output_file, 'a') as f:
    # Iterate the two needed columns directly: iterrows() builds a Series per
    # row and can upcast the ZIP code to float.
    for zip_code, nearby_walmarts in zip(distance_data['zip'], distance_data['nearby_walmarts']):
        if pd.isna(nearby_walmarts):
            continue  # Skip ZIP codes with no nearby Walmarts

        zip_record = zip_records.get(zip_code)
        time_counts = {label: 0 for _, label in time_buckets}

        for token in str(nearby_walmarts).split(','):
            token = token.strip()
            if not token:
                continue  # Tolerate stray or trailing commas in the list

            store_coords = store_coords_by_number.get(int(token))
            if store_coords is None:
                continue  # Store number not present in walmart_geocoded

            if zip_record is None:
                # Only an error once coordinates are actually needed.
                raise IndexError(
                    f"ZIP code {zip_code!r} has nearby stores but no row in zips_geocoded"
                )

            zip_lat, zip_lon, ruca = zip_record
            distance = great_circle((zip_lat, zip_lon), store_coords).miles
            estimated_time = estimate_driving_time(distance, ruca)

            # Increment every bucket whose limit covers this estimate.
            for limit, label in time_buckets:
                if estimated_time <= limit:
                    time_counts[label] += 1

        # Append this ZIP code's counts to the CSV file
        line = f"{zip_code},{time_counts['15_min']},{time_counts['30_min']},{time_counts['45_min']},{time_counts['60_min']}\n"
        f.write(line)

KeyboardInterrupt: 