In [1]:
import re
from typing import List

import joblib
import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from sklearn.neighbors import NearestNeighbors

In [2]:
# Path to the Parquet file.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs on a machine where exactly this file exists.
ruta_archivo_parquet_sitios = r'C:\Users\guard\OneDrive\Desktop\Henry Data Science\Proyecto-FInal\Datos\gm_sitios_NJNY_20250305.parquet'

# Read the Parquet file into a DataFrame
df = pd.read_parquet(path=ruta_archivo_parquet_sitios)

# Report the available columns (header and column index in a single print call)
print("Columnas disponibles en el DataFrame:", df.columns, sep="\n")

# Preview the first rows to get a feel for the content; the bare expression on
# the last line is what the notebook renders as the cell output.
print("\nPrimeras filas del DataFrame:")
df.head()

Columnas disponibles en el DataFrame:
Index(['gmap_id', 'name', 'street_address', 'city', 'state', 'zip_code',
       'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price',
       'Monday_open', 'Monday_close', 'Tuesday_open', 'Tuesday_close',
       'Wednesday_open', 'Wednesday_close', 'Thursday_open', 'Thursday_close',
       'Friday_open', 'Friday_close', 'Saturday_open', 'Saturday_close',
       'Sunday_open', 'Sunday_close', 'Delivery', 'Dine-in', 'Takeout',
       'Good for kids', 'Casual', 'Dinner', 'Lunch'],
      dtype='object')

Primeras filas del DataFrame:


Unnamed: 0_level_0,gmap_id,name,street_address,city,state,zip_code,latitude,longitude,avg_rating,num_of_reviews,...,Saturday_close,Sunday_open,Sunday_close,Delivery,Dine-in,Takeout,Good for kids,Casual,Dinner,Lunch
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0x4ccab4b3af9c7c85:0x9defa19fe4e295c,Alpine Pizza,1104 U.S. 9,Schroon Lake,NY,12870,43.837433,-73.761254,4.3,45,...,21.0,16.0,21.0,1,1,1,1,1,1,0
26,0x89de5f9627d4e973:0x5b0cbc68d57ffefb,Stewart's Shops,4192 NY-30,Amsterdam,NY,12010,43.017033,-74.19412,2.2,15,...,0.0,4.0,0.0,1,0,0,0,0,0,0
34,0x89def3a6bb7c4091:0xde094955651ec4bf,Al's Pizzeria,201 N Main St,Gloversville,NY,12078,43.056137,-74.342115,4.2,28,...,21.0,12.0,21.0,0,0,1,1,1,0,0
53,0x89c3acab12c10d5d:0x73285fc4bd781796,Gulistan Pizza,783 S Orange Ave,Newark,NJ,7106,40.745207,-74.220116,3.3,8,...,,,,1,0,0,1,0,0,0
59,0x89c259f744b20deb:0xb536eb5ee402f92f,FREEHOLD In The Park,20 Union Square W,New York,NY,10003,40.73645,-73.989926,4.2,78,...,0.0,12.0,0.0,1,1,1,1,1,0,0


In [4]:
# Persist the DataFrame as CSV, leaving the index out of the file.
# NOTE(review): no transformation of `df` is visible between loading it and
# this save, so the "preprocessed" file appears to hold the data as loaded;
# confirm whether a preprocessing cell was removed.
df.to_csv(path_or_buf='data_preprocesada.csv', index=False)

In [7]:
# Load the preprocessed dataset from disk
df_binarias = pd.read_csv('data_preprocesada.csv')

# Model features: every column except identifiers, address/location fields,
# rating/review counts and price.
X = df_binarias.drop(columns=[
    'gmap_id', 'name', 'avg_rating', 'num_of_reviews', 'latitude', 'longitude',
    'zip_code', 'city', 'state', 'street_address', 'price',
])

# Express every *_open / *_close column as decimal hours (e.g. 21.5 for 21:30).
# Values that are already numeric are kept as they are: handing numbers such as
# 21.0 to pd.to_datetime(..., format='%H:%M', errors='coerce') does not match
# the format, so every value would be coerced to NaN and the dropna() below
# would discard all rows. Only values that are present but not numeric are
# parsed as 'HH:MM' text, and each column is parsed once.
for col in X.columns:
    if '_open' in col or '_close' in col:
        horas = pd.to_numeric(X[col], errors='coerce')
        sin_convertir = horas.isna() & X[col].notna()
        if sin_convertir.any():
            como_hora = pd.to_datetime(
                X.loc[sin_convertir, col].astype(str), format='%H:%M', errors='coerce'
            )
            horas.loc[sin_convertir] = como_hora.dt.hour + como_hora.dt.minute / 60
        X[col] = horas

# Drop rows that still contain NaN values.
# NOTE(review): after this step the row positions of X no longer match the row
# positions of data_preprocesada.csv. Indices returned by
# modelo_knn.kneighbors() are positions in X, so look results up through
# X.index rather than by position in the full CSV.
X = X.dropna()

# Fit the Nearest Neighbors model
modelo_knn = NearestNeighbors(n_neighbors=3, algorithm='auto').fit(X)

# Save the fitted model; joblib.dump returns the list of written file names,
# which the notebook shows as the cell output.
joblib.dump(modelo_knn, 'modelo_knn.pkl')

['modelo_knn.pkl']

In [6]:
# Create the FastAPI application
app = FastAPI()

# Restore the fitted model and the dataset from disk.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
modelo_knn = joblib.load(filename='modelo_knn.pkl')
df_binarias = pd.read_csv(filepath_or_buffer='data_preprocesada.csv')

# Pydantic model declaring the same four fields that the /recomendar_locales
# endpoint takes as query parameters. It is not referenced anywhere else in the
# visible source.
# NOTE(review): presumably meant as a request-body schema for that endpoint;
# confirm whether it is still needed.
class Consulta(BaseModel):
    horario: str  # time of the query, as text
    condiciones_binarias: List[int]  # list of integer flags
    lat: float  # presumably latitude of the user; TODO confirm
    lon: float  # presumably longitude of the user; TODO confirm

def _horario_a_horas(horario: str) -> float:
    """Convert a time given as 'HH:MM' or as a plain number into decimal hours.

    Args:
        horario: Text such as '21:30' (returns 21.5) or '21' (returns 21.0).

    Returns:
        The time as a float number of hours.

    Raises:
        ValueError: If the text is neither a valid 'HH:MM' time nor a finite number.
    """
    texto = horario.strip()
    coincidencia = re.fullmatch(r'(\d{1,2}):(\d{2})', texto)
    if coincidencia:
        horas, minutos = (int(grupo) for grupo in coincidencia.groups())
        if horas > 23 or minutos > 59:
            raise ValueError(f"hora fuera de rango: {horario!r}")
        return horas + minutos / 60
    valor = float(texto)  # raises ValueError for anything that is not a number
    if not np.isfinite(valor):
        raise ValueError(f"hora no finita: {horario!r}")
    return valor


@app.get("/recomendar_locales")
def recomendar_locales(horario: str = Query(...), condiciones_binarias: List[int] = Query(...), lat: float = Query(...), lon: float = Query(...)):
    """Return the nearest-neighbour matches for a query, best reviewed first.

    Args:
        horario: Time of the query, either 'HH:MM' or a number of hours.
        condiciones_binarias: Integer flags appended after the time in the query vector.
        lat: Accepted for interface compatibility; not used by the current logic.
        lon: Accepted for interface compatibility; not used by the current logic.

    Returns:
        A list of records with name, avg_rating, num_of_reviews, latitude,
        longitude and zip_code, sorted by num_of_reviews and then avg_rating,
        both descending.

    Raises:
        HTTPException: 422 when `horario` cannot be parsed or when the query
            vector does not have the number of features the model was fitted with.
    """
    # The query vector must be purely numeric; the raw `horario` string would
    # otherwise make kneighbors fail for inputs such as '12:30'.
    try:
        hora_decimal = _horario_a_horas(horario)
    except ValueError as exc:
        raise HTTPException(
            status_code=422,
            detail=f"horario invalido: {horario!r}; use 'HH:MM' o un numero de horas",
        ) from exc

    # Build the query vector: the time first, then the flags, as one row
    consulta = np.array([[hora_decimal, *condiciones_binarias]], dtype=float)

    # Report a feature-count mismatch as a client error instead of letting
    # scikit-learn raise inside kneighbors (which would surface as a 500).
    # NOTE(review): the model is fitted on the per-day *_open/*_close columns
    # plus the flag columns, while this vector holds a single time value;
    # confirm the intended query layout.
    esperadas = getattr(modelo_knn, 'n_features_in_', None)
    if esperadas is not None and consulta.shape[1] != esperadas:
        raise HTTPException(
            status_code=422,
            detail=(
                f"la consulta tiene {consulta.shape[1]} valores y el modelo "
                f"espera {esperadas}"
            ),
        )

    # Find the nearest neighbours (distances are not needed here).
    # NOTE(review): the returned indices are positions in the training matrix,
    # which had NaN rows dropped before fitting; they only line up with
    # df_binarias.iloc if df_binarias contains exactly the rows used for
    # fitting. Verify.
    _, indices = modelo_knn.kneighbors(consulta)
    recomendaciones = df_binarias.iloc[indices[0]]

    # Sort by num_of_reviews and avg_rating, both descending
    recomendaciones = recomendaciones.sort_values(by=['num_of_reviews', 'avg_rating'], ascending=[False, False])
    return recomendaciones[['name', 'avg_rating', 'num_of_reviews', 'latitude', 'longitude','zip_code']].to_dict(orient='records')
