In [1]:
import pickle
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
# user id to recommend restaurants for
USER_ID = "u00000"

In [3]:
# load model from pickle file
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# open model files that come from a trusted source.
with open("../model.pkl", "rb") as f:
    model: NearestNeighbors = pickle.load(f)

Security note: unpickling can execute arbitrary code, so load only model files you trust. See https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Log the model with MLflow
import mlflow
# NOTE(review): the local file:// tracking URI below is left disabled. The
# MlflowException recorded under this cell reports that the 'file' tracking
# scheme is rejected together with the proxied mlflow-artifacts store (only
# http/https are allowed) — presumably that is why; confirm the intended server URI.
# mlflow.set_tracking_uri("file:///Users/kuangsmacbook/Desktop/Works/LINEMAN\ MLE/attachment/server/mlruns")
# Open a run and store the loaded model under the artifact path "recommend".
with mlflow.start_run():
    mlflow.sklearn.log_model(model, "recommend")



MlflowException: The configured tracking uri scheme: 'file' is invalid for use with the proxy mlflow-artifact scheme. The allowed tracking schemes are: {'http', 'https'}

In [8]:
# Read the user feature table and the restaurant table from parquet.
# The restaurant table is keyed by its "index" column so that rows can later
# be fetched by label with .loc.
user_df = pd.read_parquet("user.small.parquet")
restaurant_df = (
    pd.read_parquet("restaurant.parquet")
    .set_index("index")
)

In [9]:
user_df.head()

Unnamed: 0,user_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
0,u00000,0.193018,1.235771,0.478129,0.056068,0.79887,0.942501,-0.186956,-0.186927,-0.137488,...,0.28967,0.529229,0.658945,0.313935,1.343077,-0.160149,1.79604,0.078384,0.460348,0.22815
1,u00001,0.542445,0.061676,-0.172018,0.366714,0.690263,0.565948,0.030949,0.47896,0.033966,...,-0.564581,1.279485,-0.600161,-0.370372,1.984003,1.414539,1.877632,1.124837,2.27337,1.369663
2,u00002,0.501249,1.194607,0.750967,-0.078167,0.361179,0.124189,0.736151,1.427973,-0.676132,...,-0.185844,-0.247236,1.687951,0.849775,0.671021,-0.654202,-0.11887,0.419687,0.391202,-0.808181
3,u00003,0.613723,-0.652035,1.161827,0.122698,0.828054,-0.29604,0.563028,0.711285,0.097485,...,0.720368,0.070521,0.338198,0.885493,0.889271,0.778233,1.164597,0.572195,0.062597,-0.822071
4,u00004,0.17965,0.166768,0.337015,0.594367,0.897039,0.858783,-0.523686,0.813889,0.517246,...,0.404695,0.707832,0.700675,1.538892,0.114889,0.793582,-0.071324,1.282532,-0.052153,0.395376


In [10]:
user_df.shape

(10, 1001)

In [11]:
restaurant_df.head()

Unnamed: 0_level_0,restaurant_id,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,r0000,13.844851141246949,100.3086836106867
1,r0001,13.78182589654106,100.84977532273388
2,r0002,13.883572268183674,100.3085459374542
3,r0003,14.06672545266528,100.7256760943531
4,r0004,13.961173933525885,100.39580661948798


In [12]:
model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'radius': 1.0}

In [13]:
model.effective_metric_

'euclidean'

In [14]:
user_df[user_df["user_id"] == USER_ID].drop(columns="user_id")

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
0,0.193018,1.235771,0.478129,0.056068,0.79887,0.942501,-0.186956,-0.186927,-0.137488,0.413889,...,0.28967,0.529229,0.658945,0.313935,1.343077,-0.160149,1.79604,0.078384,0.460348,0.22815


In [15]:
# Ask the model for the 20 nearest neighbors of the selected user's feature
# vector; these neighbors are the restaurants to recommend.
# `difference` receives the distances and `ind` the neighbor indices.
difference, ind = model.kneighbors(
    user_df.loc[user_df["user_id"] == USER_ID].drop(columns="user_id"),
    n_neighbors=20,
)

In [16]:
difference

array([[23.66758003, 23.80081254, 23.83828907, 24.07205225, 24.16645254,
        24.18236662, 24.22441798, 24.23519259, 24.23628432, 24.26967065,
        24.27378404, 24.28877668, 24.29956383, 24.30405023, 24.30567711,
        24.33704798, 24.34151423, 24.34254504, 24.35122189, 24.35462439]])

In [12]:
ind

array([[1737, 2116,  862, 1060, 5083, 4855, 7495, 7182, 9669, 8749, 2316,
        2159, 4255, 7596, 7523,  782, 7075, 2751, 3382, 9548]])

In [13]:
ind.shape

(1, 20)

In [14]:
difference.shape

(1, 20)

In [30]:
# Look up the restaurant rows for the indices returned by the model.
# `ind` holds one row per queried user (a single user here), so ind[0] is that
# user's list of neighbor indices; .loc matches them against the frame's
# "index" labels.
recommend_df = restaurant_df.loc[ind[0]]

In [31]:
recommend_df

Unnamed: 0_level_0,restaurant_id,latitude,longitude
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1737,r1737,13.629531575765236,100.75870051163372
2116,r2116,13.73119735587392,100.56817065841396
862,r0862,13.919599015060994,100.72259887691092
1060,r1060,13.920823054026256,100.43834135327354
5083,r5083,14.056428831746032,100.84463908830617
4855,r4855,13.59175326881862,100.81991308469668
7495,r7495,13.948798704378673,100.81459459201476
7182,r7182,13.772837613569491,100.32849058764408
9669,r9669,13.922821220460014,100.58332900886523
8749,r8749,14.023339052429405,100.7799235369726


In [32]:
# Use the model's distance as the restaurant score.
# difference[0] is assigned positionally, in the same order as the ind[0]
# labels used to build recommend_df.
recommend_df["difference"] = difference[0]

In [33]:
recommend_df

Unnamed: 0_level_0,restaurant_id,latitude,longitude,difference
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1737,r1737,13.629531575765236,100.75870051163372,23.66758
2116,r2116,13.73119735587392,100.56817065841396,23.800813
862,r0862,13.919599015060994,100.72259887691092,23.838289
1060,r1060,13.920823054026256,100.43834135327354,24.072052
5083,r5083,14.056428831746032,100.84463908830617,24.166453
4855,r4855,13.59175326881862,100.81991308469668,24.182367
7495,r7495,13.948798704378673,100.81459459201476,24.224418
7182,r7182,13.772837613569491,100.32849058764408,24.235193
9669,r9669,13.922821220460014,100.58332900886523,24.236284
8749,r8749,14.023339052429405,100.7799235369726,24.269671


In [34]:
# Print the result in JSON format: one object per recommended restaurant,
# keeping only the restaurant id and its distance score.
print(recommend_df[["restaurant_id", "difference"]].to_json(orient="records", indent=2))

[
  {
    "restaurant_id":"r1737",
    "difference":23.6675800283
  },
  {
    "restaurant_id":"r2116",
    "difference":23.8008125402
  },
  {
    "restaurant_id":"r0862",
    "difference":23.8382890655
  },
  {
    "restaurant_id":"r1060",
    "difference":24.0720522494
  },
  {
    "restaurant_id":"r5083",
    "difference":24.166452545
  },
  {
    "restaurant_id":"r4855",
    "difference":24.182366617
  },
  {
    "restaurant_id":"r7495",
    "difference":24.2244179801
  },
  {
    "restaurant_id":"r7182",
    "difference":24.2351925876
  },
  {
    "restaurant_id":"r9669",
    "difference":24.2362843205
  },
  {
    "restaurant_id":"r8749",
    "difference":24.2696706463
  },
  {
    "restaurant_id":"r2316",
    "difference":24.2737840441
  },
  {
    "restaurant_id":"r2159",
    "difference":24.2887766768
  },
  {
    "restaurant_id":"r4255",
    "difference":24.2995638295
  },
  {
    "restaurant_id":"r7596",
    "difference":24.3040502318
  },
  {
    "restaurant_id":"r7523",
 

In [105]:
# Load the request table (columns as displayed below: user_id, latitude,
# longitude, size, sort_dis, max_dis).
request = pd.read_parquet("request.parquet")

In [106]:
request

Unnamed: 0,user_id,latitude,longitude,size,sort_dis,max_dis
0,u83153,14.068817,100.646536,50,0.0,5000.0
1,u45712,14.109562,100.698690,50,1.0,5000.0
2,u52829,13.727387,100.830825,50,1.0,5000.0
3,u11570,13.921809,100.468203,20,1.0,5000.0
4,u99991,13.804917,100.682749,50,,5000.0
...,...,...,...,...,...,...
1995,u26540,13.924127,100.611641,50,0.0,10000.0
1996,u39495,13.710599,100.714934,50,,
1997,u62643,13.995389,100.403283,50,1.0,5000.0
1998,u07949,13.683828,100.343557,50,0.0,5000.0


In [107]:
# Store the requested result size as a 64-bit integer column.
request['size'] = request['size'].astype("int64")

In [144]:
import numpy as np

# Take the first request whose max_dis is missing and report whether that value
# is present (prints False for a NaN). The column is selected by name rather
# than by position so the check keeps reading max_dis even if the column order
# of `request` changes.
first_missing_max_dis = request.loc[request["max_dis"].isna(), "max_dis"].iloc[0]
print(not np.isnan(first_missing_max_dis))

False


In [50]:
import mlflow
import uuid

In [88]:
# Turn on MLflow's automatic logging for scikit-learn.
mlflow.sklearn.autolog()

In [89]:
# Replace the pickle-loaded model with the copy stored under the "recommend"
# artifact path of an MLflow run.
# NOTE(review): the run id is hard-coded, so this only resolves against the
# tracking store that contains that run.
model = mlflow.sklearn.load_model("runs:/20667a37d29345be907ccc4007dd9978/recommend")

In [90]:
# Repeat the 20-nearest-neighbor query for the selected user, this time with
# the model loaded from MLflow. `difference` receives the distances and `ind`
# the neighbor indices.
difference, ind = model.kneighbors(
    user_df.loc[user_df["user_id"] == USER_ID].drop(columns="user_id"),
    n_neighbors=20,
)

In [60]:
difference

array([[23.66758003, 23.80081254, 23.83828907, 24.07205225, 24.16645254,
        24.18236662, 24.22441798, 24.23519259, 24.23628432, 24.26967065,
        24.27378404, 24.28877668, 24.29956383, 24.30405023, 24.30567711,
        24.33704798, 24.34151423, 24.34254504, 24.35122189, 24.35462439]])

In [92]:
import geopy

In [93]:
# Step 4: Calculate the displacement between user and restaurants
# Import the submodule explicitly: `geodesic` lives in `geopy.distance`, and a
# bare `import geopy` is not guaranteed to load that submodule on a fresh kernel.
from geopy.distance import geodesic

# (latitude, longitude) the displacement is measured from.
# NOTE(review): these values equal the first row of the `request` table
# (user u83153), while the neighbors were computed for USER_ID — confirm that
# mixing the two is intended.
coords_user = (14.068817, 100.646536)
# Restaurants farther than this many metres from the user are left out.
max_displacement_m = 30000

recommended_restaurants = []
for idx, diff in zip(ind[0], difference[0]):
    # .loc raises KeyError for an index that is not in restaurant_df, so the
    # row is always populated here and needs no separate emptiness check.
    restaurant = restaurant_df.loc[idx]
    coords_restaurant = (restaurant.latitude, restaurant.longitude)
    # Geodesic distance in metres, truncated to a whole number.
    displacement = int(geodesic(coords_user, coords_restaurant).m)

    if displacement <= max_displacement_m:
        recommended_restaurants.append(
            {
                # NOTE(review): this is the row index (e.g. "862"), not the
                # restaurant_id column value (e.g. "r0862") — confirm which
                # one consumers expect.
                "id": str(idx),
                "difference": float(diff),
                "displacement": displacement,
            }
        )

In [94]:
recommended_restaurants

[{'id': '862', 'difference': 23.838289065463954, 'displacement': 18441},
 {'id': '1060', 'difference': 24.072052249370532, 'displacement': 27821},
 {'id': '5083', 'difference': 24.166452544984395, 'displacement': 21439},
 {'id': '7495', 'difference': 24.224417980096238, 'displacement': 22493},
 {'id': '9669', 'difference': 24.23628432046833, 'displacement': 17536},
 {'id': '8749', 'difference': 24.269670646297847, 'displacement': 15260},
 {'id': '2159', 'difference': 24.288776676754686, 'displacement': 19121},
 {'id': '4255', 'difference': 24.299563829541146, 'displacement': 8144},
 {'id': '7596', 'difference': 24.304050231792964, 'displacement': 19722},
 {'id': '2751', 'difference': 24.34254503906854, 'displacement': 9673}]

In [162]:
# Switch the estimator's parameters to the KD-tree algorithm and to n_jobs=-1
# (use all available processors).
# NOTE(review): `model` is already fitted; presumably `algorithm` only takes
# effect once the estimator is fitted again — confirm before relying on it.
model.set_params(algorithm="kd_tree", n_jobs=-1)

In [163]:
model.get_params()

{'algorithm': 'kd_tree',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': -1,
 'n_neighbors': 5,
 'p': 2,
 'radius': 1.0}