In [None]:
import pandas as pd
from geopy.distance import geodesic

# Load the cleaned geolocation, customer, seller and order tables.
geo = pd.read_csv("../data_clean/olist_geolocation_dataset_clean.csv")
customers = pd.read_csv("../data_clean/olist_customers_dataset_clean.csv")
sellers = pd.read_csv("../data_clean/olist_sellers_dataset_clean.csv")
orders = pd.read_csv("../data_clean/olist_orders_dataset_clean.csv")

# Reduce the geolocation table to one representative point per ZIP prefix:
# the mean latitude/longitude of every row sharing that prefix.
geo = geo.groupby('geolocation_zip_code_prefix', as_index=False)[
    ['geolocation_lat', 'geolocation_lng']
].mean()

# Attach the representative coordinates to customers and sellers and give the
# coordinate columns role-specific names. Left joins keep every customer and
# seller; a ZIP prefix with no geolocation match yields NaN coordinates.
customers = customers.merge(
    geo,
    how='left',
    left_on='customer_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
).rename(columns={'geolocation_lat': 'customer_lat', 'geolocation_lng': 'customer_lng'})

sellers = sellers.merge(
    geo,
    how='left',
    left_on='seller_zip_code_prefix',
    right_on='geolocation_zip_code_prefix',
).rename(columns={'geolocation_lat': 'seller_lat', 'geolocation_lng': 'seller_lng'})

# Attach the customer and seller coordinates to each row of `df` via the
# order's customer_id / seller_id.
# NOTE(review): `df` is not defined in this cell; presumably it is created in
# an earlier cell and carries an `order_id` column -- confirm.
# Every merge is an inner join (the pandas default), so rows without a
# matching order, customer or seller are dropped.
df = (
    df.merge(orders[['order_id', 'customer_id', 'seller_id']], how='inner', on='order_id')
    .merge(customers[['customer_id', 'customer_lat', 'customer_lng']], how='inner', on='customer_id')
    .merge(sellers[['seller_id', 'seller_lat', 'seller_lng']], how='inner', on='seller_id')
)

# Compute the geodesic distance in kilometres between the customer and the
# seller of every row and store it in `distancia_km`.
#
# The coordinates were attached with left joins on ZIP prefix, so any of the
# four values can be NaN when a prefix had no geolocation match. geopy rejects
# non-finite coordinates with ValueError, which would abort the whole
# computation; such rows get a NaN distance instead.
coord_cols = ['customer_lat', 'customer_lng', 'seller_lat', 'seller_lng']
has_coords = df[coord_cols].notna().all(axis=1)

# Iterate the columns in lockstep rather than using DataFrame.apply(axis=1):
# same result per row, without building a Series for every row. Assigning a
# list fills the column positionally, so it does not depend on df's index.
df['distancia_km'] = [
    geodesic((c_lat, c_lng), (s_lat, s_lng)).km if complete else float('nan')
    for c_lat, c_lng, s_lat, s_lng, complete in zip(
        df['customer_lat'],
        df['customer_lng'],
        df['seller_lat'],
        df['seller_lng'],
        has_coords,
    )
]
