In [3]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

# Make the project root (the parent of the notebook's working directory)
# importable, adding it to sys.path only once.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)


In [4]:
# Presumably the seed for the stochastic steps of the analysis — TODO confirm
# where it is applied.
SEED = 2021

# Input CSV locations, given relative to the notebook's working directory.
USERPROFILE_DATASET_PATH = "../data/restaurant_data/userprofile.csv"
USERCUISINE_DATASET_PATH = "../data/restaurant_data/usercuisine.csv"
CHEFMOZCUISINE_DATASET_PATH = "../data/restaurant_data/chefmozcuisine.csv"
# Note: ratings are read from "restaurant_data_reformatted", not from the
# "restaurant_data" directory used by the three files above.
RATINGS_DATASET_PATH = "../data/restaurant_data_reformatted/ratings.csv"

## Load and transform restaurant data

In [29]:
def prepare_user_profile_df(df):
    """Clean the raw user-profile table.

    Drops the ``latitude`` and ``longitude`` columns, converts the ``"?"``
    placeholder into a missing value, and back-fills each missing value
    from the next row that has one.

    Args:
        df: Raw user-profile DataFrame; must contain ``latitude`` and
            ``longitude`` columns.

    Returns:
        A new, cleaned DataFrame. The input frame is not modified.
    """
    df = df.drop(columns=["latitude", "longitude"])
    df = df.replace("?", pd.NA)
    # DataFrame.bfill() is the supported spelling of the deprecated
    # fillna(method="bfill"). Back-filling cannot fill a gap that has no
    # valid value below it, so missing values in trailing rows remain.
    return df.bfill()


def prepare_user_cuisine_df(df):
    """One-hot encode user cuisine preferences into one row per user.

    Args:
        df: DataFrame with a ``userID`` column and an ``Rcuisine`` column,
            one row per (user, cuisine) pair.

    Returns:
        A DataFrame indexed by ``userID`` with one column per distinct
        ``Rcuisine`` value; each cell is the number of distinct input rows
        for that user and cuisine.
    """
    # drop_duplicates() returns a new frame. The result used to be thrown
    # away, so repeated (userID, Rcuisine) rows were summed more than once.
    df = df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop(columns="Rcuisine")
    return df.groupby("userID").sum()


def load_and_clean_users_df():
    """Load the user profile and user cuisine CSVs and merge them per user.

    Both files are read from their module-level paths, cleaned by their
    respective ``prepare_*`` helpers, and joined on ``userID``.

    Returns:
        The merged DataFrame of profile columns and cuisine columns.
    """
    raw_profiles = pd.read_csv(USERPROFILE_DATASET_PATH)
    raw_cuisines = pd.read_csv(USERCUISINE_DATASET_PATH)

    profiles = prepare_user_profile_df(raw_profiles)
    cuisines = prepare_user_cuisine_df(raw_cuisines)

    return profiles.merge(cuisines, on="userID")


def load_and_prepare_rest_cuisine_df():
    """Load the restaurant cuisine CSV and one-hot encode it per restaurant.

    Returns:
        A DataFrame indexed by ``placeID`` with one column per distinct
        ``Rcuisine`` value, summed over that restaurant's de-duplicated rows.
    """
    cuisine_rows = pd.read_csv(CHEFMOZCUISINE_DATASET_PATH).drop_duplicates()
    cuisine_flags = pd.get_dummies(cuisine_rows["Rcuisine"])
    encoded = cuisine_rows.join(cuisine_flags).drop("Rcuisine", axis=1)
    return encoded.groupby("placeID").sum()



In [30]:
# Build the per-user feature table (cleaned profile columns merged with the
# one-hot cuisine preference columns) and display it.
users_df = load_and_clean_users_df()
users_df

Unnamed: 0,userID,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,...,Swiss,Tapas,Tea_House,Tex-Mex,Thai,Tibetan,Tunisian,Turkish,Vegetarian,Vietnamese
0,U1001,false,abstemious,informal,family,on foot,single,independent,1989,variety,...,0,0,0,0,0,0,0,0,0,0
1,U1002,false,abstemious,informal,family,public,single,independent,1990,technology,...,0,0,0,0,0,0,0,0,0,0
2,U1003,false,social drinker,formal,family,public,single,independent,1989,none,...,0,0,0,0,0,0,0,0,0,0
3,U1004,false,abstemious,informal,family,public,single,independent,1940,variety,...,0,0,0,0,0,0,0,0,0,0
4,U1005,false,abstemious,no preference,family,public,single,independent,1992,none,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,U1134,false,casual drinker,no preference,family,public,single,independent,1991,variety,...,0,0,0,0,0,0,0,0,0,0
134,U1135,false,casual drinker,informal,family,on foot,single,kids,1988,variety,...,1,1,1,1,1,1,1,1,1,1
135,U1136,true,social drinker,no preference,friends,car owner,single,independent,1990,retro,...,0,0,0,0,0,0,0,0,0,0
136,U1137,false,social drinker,formal,family,public,single,independent,1989,eco-friendly,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Build the per-restaurant cuisine indicator table (indexed by placeID) and
# display it.
rests_df = load_and_prepare_rest_cuisine_df()
rests_df

Unnamed: 0_level_0,Afghan,African,American,Armenian,Asian,Bagels,Bakery,Bar,Bar_Pub_Brewery,Barbecue,...,Soup,Southern,Southwestern,Spanish,Steaks,Sushi,Thai,Turkish,Vegetarian,Vietnamese
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Load the user/restaurant ratings table and display it.
ratings_df = pd.read_csv(RATINGS_DATASET_PATH)
ratings_df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2
...,...,...,...,...,...
1156,U1043,132630,1,1,1
1157,U1011,132715,1,1,0
1158,U1068,132733,1,1,0
1159,U1068,132594,1,1,1


In [33]:
# Attach each rating's user features, then the rated restaurant's cuisine
# indicators.
real_dataset = pd.merge(ratings_df, users_df, on="userID")
# The second merge must continue from the frame built on the previous line.
# It used to read `merged_df`, a name no cell in this notebook defines, so it
# only worked with leftover kernel state and fails on Restart & Run All.
real_dataset = pd.merge(real_dataset, rests_df, on="placeID")
# Halve the overall rating (the loaded ratings shown above are 0, 1, 2, so
# this maps them to 0.0, 0.5, 1.0).
real_dataset["rating"] = real_dataset["rating"] / 2
real_dataset

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,...,Soup,Southern,Southwestern,Spanish,Steaks,Sushi,Thai,Turkish,Vegetarian,Vietnamese
0,U1077,135085,1.0,2,2,false,social drinker,elegant,family,public,...,0,0,0,0,0,0,0,0,0,0
1,U1108,135085,0.5,2,1,false,abstemious,informal,solitary,public,...,0,0,0,0,0,0,0,0,0,0
2,U1081,135085,0.5,2,1,false,casual drinker,informal,family,public,...,0,0,0,0,0,0,0,0,0,0
3,U1001,135085,0.0,1,1,false,abstemious,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
4,U1056,135085,1.0,2,2,false,social drinker,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,U1006,132922,1.0,1,2,true,social drinker,no preference,friends,car owner,...,0,0,0,0,0,0,0,0,0,0
869,U1003,132937,1.0,2,1,false,social drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0
870,U1027,132937,0.5,1,1,true,social drinker,no preference,family,public,...,0,0,0,0,0,0,0,0,0,0
871,U1029,132937,0.5,1,1,true,casual drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0


## Create rating matrix

In [36]:
# Long-format (rating, user_id, item_id) table with users and restaurants
# re-indexed as dense integer category codes.
# Building a fresh DataFrame, instead of assigning columns onto a slice of
# `real_dataset`, avoids pandas' SettingWithCopyWarning and leaves
# `real_dataset` untouched. Column order and row index match the previous
# result.
long_table = pd.DataFrame({
    "rating": real_dataset["rating"],
    "user_id": real_dataset["userID"].astype("category").cat.codes,
    "item_id": real_dataset["placeID"].astype("category").cat.codes,
})
long_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,rating,user_id,item_id
0,1.0,76,89
1,0.5,107,89
2,0.5,80,89
3,0.0,0,89
4,1.0,55,89
...,...,...,...
868,1.0,5,44
869,1.0,2,46
870,0.5,26,46
871,0.5,28,46


In [37]:
# Reshape the long table into a wide matrix: one row per user_id, one column
# per item_id, cells holding the rating. (user_id, item_id) pairs without a
# rating become NaN. DataFrame.pivot raises ValueError if any
# (user_id, item_id) pair occurs more than once.
rating_matrix = long_table.pivot(index="user_id", columns="item_id", values="rating")

In [38]:
# Display the pivoted matrix while unrated pairs are still NaN.
rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,0.0,,,,,
1,,,,,,,,,,,...,,,,,0.5,,,,0.5,
2,,,,,,,,,,,...,,,1.0,1.0,,,,,,
3,,,,,,,,,,,...,,,,,,,,,1.0,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,,0.0,,,,,,,,,...,,1.0,1.0,0.5,1.0,,,,,
134,,,,,,,,,,,...,,,,,0.0,,,,0.0,
135,,,,,,,,,,,...,,0.5,,,,,,,,
136,,,,,,,,,,,...,,,1.0,,1.0,,,,,


In [39]:
# Replace missing (unrated) entries with 0.
# NOTE(review): 0.0 is also a legitimate rating value after the division by 2
# above, so once filled, "not rated" and "rated 0" are indistinguishable in
# this matrix. Confirm that the trainers are meant to treat 0 as "no rating".
rating_matrix = rating_matrix.fillna(0)
rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.5,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.5,1.0,0.0,0.0,0.0,0.0,0.0
134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Evaluate

In [40]:
# NOTE(review): `os`/`sys` and the sys.path setup below repeat the notebook's
# first cell; consider moving all imports into that cell.
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
# Project-local modules, importable because of the sys.path entry above.
from modules import models, evaluator, trainers, utils
# Re-import the modules so that edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): `utils` is imported but not reloaded — confirm this is intended.
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)


<module 'modules.trainers' from '/Users/vldpro/Workspace/university/recsys/modules/trainers.py'>

In [41]:
import datetime
import time

def evaluate_on_real_dataset(rating_matrix, test_size, sample_size, n_iterations=1, trainer_list=None):
    """Run each trainer on the rating matrix and collect its error metrics.

    Every trainer is called ``n_iterations`` times as
    ``trainer(rating_matrix, test_size=..., sample_size=...)``. The first two
    items of the value it returns are recorded as RMSE and MAE. Each result is
    printed as soon as it is available.

    Args:
        rating_matrix: Passed through unchanged to every trainer.
        test_size: Passed through to every trainer as ``test_size``.
        sample_size: Passed through to every trainer as ``sample_size``.
        n_iterations: Number of times each trainer is run.
        trainer_list: Optional iterable of trainer callables, each exposing a
            ``model_name`` attribute. Defaults to the KNN, SVD and AutoRec
            (``{"epoch": 50}``) executors from ``trainers``.

    Returns:
        A DataFrame with one row per (trainer, iteration) and the columns
        ``model_name``, ``rmse``, ``mae``, ``duration``, ``iteration``.

    Raises:
        Any exception raised by a trainer propagates to the caller; results
        collected up to that point are not returned.
    """
    if trainer_list is None:
        trainer_list = [
            trainers.KnnTrainTestExecutor(),
            trainers.SvdTrainTestExecutor(),
            trainers.AutoRecTrainTestExecutor(config={"epoch": 50}),
        ]
    errors = []
    for trainer in trainer_list:
        for iteration in range(n_iterations):
            # perf_counter() is monotonic, so the measured duration cannot be
            # skewed by wall-clock adjustments (and utcnow() is deprecated).
            started = time.perf_counter()
            error = trainer(rating_matrix, test_size=test_size, sample_size=sample_size)
            duration = datetime.timedelta(seconds=time.perf_counter() - started)
            eval_result = {"model_name": trainer.model_name, "rmse": error[0], "mae": error[1], "duration": duration, "iteration": iteration}
            errors.append(eval_result)
            print(f"{eval_result}")
    return pd.DataFrame(errors)
        

In [42]:
# Seed NumPy's global RNG before the stochastic evaluation so repeated runs
# start from the same state; SEED was defined but never applied.
# NOTE(review): this only helps for trainers that draw from NumPy's global
# RNG — confirm against the `trainers` module and seed any other RNG it uses.
npr.seed(SEED)
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: row index exceeds matrix dimensions" after the knn and svd
# results were printed, so no result frame was produced.
eval_results = evaluate_on_real_dataset(rating_matrix, test_size=0.1, sample_size=0.01)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.0535
MAE:  0.0516
{'model_name': 'knn', 'rmse': 0.053534672869968415, 'mae': 0.05158730158730159, 'duration': datetime.timedelta(microseconds=38027), 'iteration': 0}
RMSE: 0.0622
MAE:  0.0408
{'model_name': 'svd', 'rmse': 0.06222170273427087, 'mae': 0.04076126739630613, 'duration': datetime.timedelta(microseconds=27174), 'iteration': 0}


ValueError: row index exceeds matrix dimensions

In [56]:
# NOTE(review): the saved output below (columns rmse, mae only; execution
# count 56) does not match the frame evaluate_on_real_dataset builds
# (model_name, rmse, mae, duration, iteration), and the evaluation cell above
# ended in an error. Re-run from a fresh kernel to refresh this output.
eval_results

Unnamed: 0,rmse,mae
0,0.020488,0.000735
1,0.06384,0.034684
2,0.089257,0.017506
