In [6]:
import os
import sys
import numpy as np
import pandas as pd
from numpy import random as npr

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)



In [7]:
SEED = 2021
SYNTHETC_DATASET_SIZE = 10_000
SAMPLE_SIZES = [0.1]
ALPHA_SAMPLE_RATE = 2

USERPROFILE_DATASET_PATH = "../../data/restaurants/userprofile.csv"
USERCUISINE_DATASET_PATH = "../data/restaurants/usercuisine.csv"
CHEFMOZCUISINE_DATASET_PATH = "../../data/restaurants/chefmozcuisine.csv"
RATINGS_DATASET_PATH = "../../restaurant_data_reformatted/ratings.csv"

## Load and transform restaurants data

Helper functions 

In [3]:
def prepare_user_profile_df(df):
    df = df.drop(["latitude", "longitude"], axis=1)
    df = df.replace("?", pd.NA)
    df = df.fillna(method="bfill")
    return df


def prepare_user_cuisine_df(df):
    df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop("Rcuisine", axis=1)
    df = df.groupby("userID").sum()
    return df


def load_and_clean_users_df():
    user_profile_df = pd.read_csv(USERPROFILE_DATASET_PATH)
    user_cuisine_df = pd.read_csv(USERCUISINE_DATASET_PATH)
    user_profile_df = prepare_user_profile_df(user_profile_df)
    user_cuisine_df = prepare_user_cuisine_df(user_cuisine_df)

    users_df = pd.merge(user_profile_df, user_cuisine_df, on="userID")
    return users_df


def load_and_prepare_rest_cuisine_df():
    df = pd.read_csv(CHEFMOZCUISINE_DATASET_PATH)
    df = df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop("Rcuisine", axis=1)
    df = df.groupby("placeID").sum()
    return df



In [8]:
users_df = load_and_clean_users_df()
users_df

FileNotFoundError: [Errno 2] No such file or directory: '../data/restaurants/usercuisine.csv'

In [None]:
rests_df = load_and_prepare_rest_cuisine_df()
rests_df

In [None]:
ratings_df = pd.read_csv(RATINGS_DATASET_PATH)
ratings_df

Merge ratings dataset, users dataset, and restaurants dataset into single tablem

In [None]:
merged_df = pd.merge(ratings_df, users_df, on="userID")
merged_df = pd.merge(merged_df, rests_df, on="placeID")
merged_df

## Generate synthetic data

In [None]:
from sdv.tabular import GaussianCopula


def fit_syn_generator(df):
    model = GaussianCopula()
    df = df.drop(["userID", "placeID"], axis=1) # Drop ids
    df = df.astype("int64", errors="ignore") # Convert all numbers to int64
    model.fit(df.copy())
    return model



In [None]:
npr.seed(SEED)
syn_data_generator = fit_syn_generator(merged_df)

In [None]:
npr.seed(SEED)
syn_merged_df = syn_data_generator.sample(SYNTHETIC_DATSET_SIZE)
syn_merged_df.head()

## Fit DeepFM model

In [None]:
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names

from modules.models import DeepFmModel

In [None]:
dense_feat_names = ["height", "weight", "birth_year"]
sparse_feat_names = [
    c for c in list(syn_merged_df.columns) if c not in [
        "rating",
        "food_rating",
        "service_rating",
        "weight",
        "height",
        "birth_year"
    ]
]

In [None]:
class DeepFMDataLoader:
    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features
        
    def load(self, dataset):
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]
        
        for feat in self._sparse_feats:
            encoder = LabelEncoder()
            nn_input[feat] = encoder.fit_transform(nn_input[feat])
            
        mms = MinMaxScaler(feature_range=(0,1))
        nn_input[self._dense_feats] = mms.fit_transform(nn_input[self._dense_feats])
        
        # problems may be here
        sparse_feature_columns = [
            SparseFeat(feat, vocabulary_size=nn_input[feat].nunique(), embedding_dim=4) 
            for i, feat in enumerate(self._sparse_feats)
        ]

        dense_feature_columns = [DenseFeat(feat, 1,) for feat in self._dense_feats]
        
        dnn_feat_cols = sparse_feature_columns + dense_feature_columns
        linear_feat_cols = sparse_feature_columns + dense_feature_columns
        
        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        return nn_input, dnn_feat_cols, linear_feat_cols, feat_names


data_loader = DeepFMDataLoader(sparse_features=sparse_feat_names, dense_features=dense_feat_names)
nn_train_input, dnn_feats, lin_feats, feat_names = data_loader.load(syn_merged_df)

In [None]:
def _cross_join(df1, df2):
    df1["_join_key"] = 0
    df2["_join_key"] = 0
    merged_df = df1.merge(df2, on="_join_key")
    merged_df = merged_df.drop("_join_key", axis=1)
    return merged_df


def nn_prepare_data_for_rating_matrix(users_df, rests_df):
    users_df = users_df.drop("userID", axis=1)
    users_df["user_id"] = range(0, len(users_df))
    rests_df["item_id"] = range(0, len(rests_df))
    user_rest_long_table = _cross_join(users_df, rests_df)
    return user_rest_long_table
    

user_rest_long_table = nn_prepare_data_for_rating_matrix(users_df.copy(), rests_df.copy())

In [None]:
nn_user_rest_long_table, _dnn_feats, _lin_feats, _feat_names = data_loader.load(user_rest_long_table)

In [None]:
def merge_feats(feats_a, feats_b):
    assert len(feats_a) == len(feats_b)
    merged = []
    for feat_a, feat_b in zip(feats_a, feats_b):
        if isinstance(feat_a, DenseFeat):
            continue
        if feat_a.vocabulary_size >= feat_b.vocabulary_size:
            merged.append(feat_a)
        else:
            merged.append(feat_b)
    return merged


_merged_feats = merge_feats(dnn_feats, _dnn_feats)

In [None]:
def train_deepfm(feats, feat_names, x, y):
    deepfm = DeepFmModel(feats, feats, feat_names)
    train_set, test_set = train_test_split(x, test_size=0.2)
    deepfm.train(train_set, target_values=y[:len(train_set)])
    return deepfm
    

In [None]:
npr.seed(SEED)
deepfm = train_deepfm(_merged_feats, feat_names, x=nn_train_input, y=syn_merged_df["rating"].values)

In [None]:
np.random.seed(SEED)

class NNModelWrapper:
    def __init__(self, trained_nn):
        self._nn = trained_nn

    def predict_rating_matrix(self, nn_input, merged_df):
        y = self._nn.predict(nn_input)
        result = pd.DataFrame()
        result["rating"] = y.reshape((len(y),))
        result["user_id"] = merged_df["user_id"]
        result["item_id"] = merged_df["item_id"]
        output_matrix = result.pivot(index="user_id", columns="item_id", values="rating")
        return output_matrix

model_wrapper = NNModelWrapper(deepfm)
deepfm_rating_matrix = model_wrapper.predict_rating_matrix(nn_user_rest_long_table, user_rest_long_table)
deepfm_rating_matrix

In [None]:
deepfm_rating_matrix = np.around(deepfm_rating_matrix)
deepfm_rating_matrix

In [None]:
def make_deepfm_rating_matrix(syn_df, users_df, rests_df):
    dense_feat_names = ["height", "weight", "birth_year"]
    sparse_feat_names = [
        c for c in list(syn_df.columns) if c not in [
            "rating",
            "food_rating",
            "service_rating",
            "weight",
            "height",
            "birth_year"
        ]
    ]

    data_loader = DeepFMDataLoader(sparse_features=sparse_feat_names, dense_features=dense_feat_names)
    nn_train_input, dnn_feats, lin_feats, feat_names = data_loader.load(syn_df)
    user_rest_long_table = nn_prepare_data_for_rating_matrix(users_df.copy(), rests_df.copy())
    
    nn_user_rest_long_table, _dnn_feats, _lin_feats, _feat_names = data_loader.load(user_rest_long_table)
    _merged_feats = merge_feats(dnn_feats, _dnn_feats)
    deepfm = train_deepfm(_merged_feats, feat_names, x=nn_train_input, y=syn_df["rating"].values)

    model_wrapper = NNModelWrapper(deepfm)
    deepfm_rating_matrix = model_wrapper.predict_rating_matrix(nn_user_rest_long_table, user_rest_long_table)
    return deepfm_rating_matrix

In [None]:
np.random.seed(SEED)
_mtx = make_deepfm_rating_matrix(syn_merged_df, users_df, rests_df)

## Make similarity matrix

Creating a similarity matrix between users and restaurants based on users' food preference and restaurants' cuisines.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def make_similarity_matrix(users_df, rests_df):
    users_food_pref_df = users_df[rests_df.columns]
    return cosine_similarity(users_food_pref_df, rests_df)

In [None]:
sim_matrix = make_similarity_matrix(users_df, rests_df)
sim_matrix = sim_matrix * 2 # Rating is in range [0, 2]
sim_matrix

## Experiments

In [None]:
import os
import sys
import importlib
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from modules import models, evaluator, trainers, utils
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)

In [None]:
class ResponseFunction:
    def __init__(self, heu_matrix, nn_matrix, noise_matrix):
        assert heu_matrix.shape == nn_matrix.shape
        self._heu_matrix = heu_matrix
        self._nn_matrix = nn_matrix
        self._noise_matrix = noise_matrix
        
    def __call__(self, a1: float, a2: float):
        a3 = max(0.0, 1 - a1 - a2)
        return (
            a1 * self._heu_matrix
            + a2 * self._nn_matrix
            + a3 * npr.normal(0, 1, size=self._heu_matrix.shape)
        )   

In [None]:
_resp_fn_config = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction,
    args=[sim_matrix, deepfm_rating_matrix, npr.normal(0, 1, size=sim_matrix.shape)]
)

_evaluators = [
    evaluator.TrainTestExecutorConfig(
        factory=trainers.AutoRecTrainTestExecutor,
        args={"config": {"epoch": 50}},
        model_name="autorec"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.SvdTrainTestExecutor,
        args={},
        model_name="svd"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.KnnTrainTestExecutor,
        args={},
        model_name="knn"
    )
]

np.random.seed(SEED)
_evaluator = evaluator.Evaluator(_resp_fn_config, n_proc=4)
results = _evaluator.evaluate(
    _evaluators, 
    a_sample_rate=ALPHA_SAMPLE_RATE,
    test_size=0.1, 
    sample_sizes=SAMPLE_SIZES
)

In [None]:
results = utils.group_points_by_minimum_error(results)
results

In [None]:
results = results.rename(
    columns={
        "a1": "α1",
        "a2": "α2",
        "rmse": "RMSE",
        "model_name": "Model"
    }
)

results["Model"] = results["Model"].map(
    {
        "knn": "kNN",
        "svd": "SVD",
        "autorec": "AutoRec"
    }
)

In [None]:
results

In [None]:
from plotly import graph_objects as go


def visualize_3d_plot(results, sample_sizes):
    name_to_color = {
        "kNN": "yellow",
        "SVD": "red",
        "AutoRec": "blue"
    }
    for sample_size in sample_sizes:
        data = results[results["sample_size"] == sample_size]
        fig = go.Figure(
            layout=go.Layout(
                height=600,
                width=800,
                font=dict(size=16),
                title=dict(
                    text=f"Model RMSE depending on α1 and α2 values. \n{sample_size}"
                ),
                margin=dict(l=20, r=20, t=20, b=20),
                scene=dict(
                    xaxis = dict(title="α1"),
                    yaxis = dict(title="α2"),
                    zaxis = dict(title="RMSE")
                ),
                scene_camera=dict(
                    up=dict(x=0, y=0, z=1),
                    center=dict(x=0, y=0, z=0),
                    eye=dict(x=1.2, y=1.8, z=1.0)
                )
            ),
            data=[
                go.Scatter3d(
                    name=model_name,
                    x=data[data["Model"] == model_name]["α1"],
                    y=data[data["Model"] == model_name]["α2"],
                    z=data[data["Model"] == model_name]["RMSE"],
                    mode="markers",
                    marker=dict(
                        size=6,
                        color=name_to_color[model_name],
                        line=dict(width=1, color='DarkSlateGrey')
                    )
                ) for model_name in ["AutoRec", "SVD", "kNN"]
            ]
        )

        fig.show("notebook")
        
        
visualize_3d_plot(results, SAMPLE_SIZES)