In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install numpy==1.20.1 \
             sdv==0.8.0 \
             pandas==1.1.4 \
             scipy==1.6.2 \
             deepctr-torch==0.2.6 \
             scikit-surprise==1.1.1


Collecting deepctr-torch==0.2.6
  Using cached deepctr_torch-0.2.6-py3-none-any.whl (63 kB)
Collecting scikit-surprise==1.1.1
  Using cached scikit-surprise-1.1.1.tar.gz (11.8 MB)
Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting tensorflow
  Using cached tensorflow-2.4.1-cp38-cp38-macosx_10_11_x86_64.whl (173.9 MB)
Collecting h5py~=2.10.0
  Using cached h5py-2.10.0-cp38-cp38-macosx_10_9_x86_64.whl (3.0 MB)
Collecting tensorflow-estimator<2.5.0,>=2.4.0
  Using cached tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)
Collecting flatbuffers~=1.12.0
  Using cached flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras-preprocessing~=1.1.2
  Using cached Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
Collecting tensorboard~=2.4
  Using cached tensorboard-2.5.0-py3-none-any.whl (6.0 MB)
Collecting google-pasta~=0.2
  Using cached google_pasta-0.2.0-py3-none-any.whl (57 kB)
Collecting absl-py~=0.10
  Using cached absl_py-0.12.0-py3-none-any.whl (

Collecting termcolor>=1.1.0
  Using cached termcolor-1.1.0.tar.gz (3.9 kB)
Collecting wrapt>=1.11.1
  Using cached wrapt-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl
Collecting tensorflow
  Using cached tensorflow-2.3.1-cp38-cp38-macosx_10_14_x86_64.whl (165.2 MB)
  Using cached tensorflow-2.3.0-cp38-cp38-macosx_10_11_x86_64.whl (165.2 MB)
  Using cached tensorflow-2.2.2-cp38-cp38-macosx_10_11_x86_64.whl (175.4 MB)
Collecting tensorflow-estimator<2.3.0,>=2.2.0
  Using cached tensorflow_estimator-2.2.0-py2.py3-none-any.whl (454 kB)
Collecting tensorboard<2.3.0,>=2.2.0
  Using cached tensorboard-2.2.2-py3-none-any.whl (3.0 MB)
Collecting tensorflow
  Using cached tensorflow-2.2.1-cp38-cp38-macosx_10_14_x86_64.whl (175.4 MB)
  Using cached tensorflow-2.2.0-cp38-cp38-macosx_10_11_x86_64.whl (175.4 MB)
INFO: pip is looking at multiple versions of sklearn to determine which version is compatible with other requirements. This could take a while.
INFO: pip is looking at multiple versions of pyyaml 

  Using cached tqdm-4.46.1-py2.py3-none-any.whl (63 kB)
  Using cached tqdm-4.46.0-py2.py3-none-any.whl (63 kB)
  Using cached tqdm-4.45.0-py2.py3-none-any.whl (60 kB)
  Using cached tqdm-4.44.1-py2.py3-none-any.whl (60 kB)
  Using cached tqdm-4.44.0-py2.py3-none-any.whl (60 kB)
  Using cached tqdm-4.43.0-py2.py3-none-any.whl (59 kB)
  Using cached tqdm-4.42.1-py2.py3-none-any.whl (59 kB)
  Using cached tqdm-4.42.0-py2.py3-none-any.whl (59 kB)
  Using cached tqdm-4.41.1-py2.py3-none-any.whl (56 kB)
  Using cached tqdm-4.41.0-py2.py3-none-any.whl (56 kB)
  Using cached tqdm-4.40.2-py2.py3-none-any.whl (55 kB)
  Using cached tqdm-4.40.1-py2.py3-none-any.whl (55 kB)
  Using cached tqdm-4.40.0-py2.py3-none-any.whl (54 kB)
  Using cached tqdm-4.39.0-py2.py3-none-any.whl (53 kB)
  Using cached tqdm-4.38.0-py2.py3-none-any.whl (53 kB)
  Using cached tqdm-4.37.0-py2.py3-none-any.whl (53 kB)
  Using cached tqdm-4.36.1-py2.py3-none-any.whl (52 kB)
  Using cached tqdm-4.36.0-py2.py3-none-any.whl 

In [None]:
import numpy as np
import numpy as npxm  # NOTE(review): alias looks like a typo for `np`; kept so no existing import is removed
import pandas as pd
from numpy import random as npr
from numpy import random as npra  # NOTE(review): alias looks like a typo for `npr`; kept for the same reason
from sdv.tabular import GaussianCopula



## Helper classes and functions

In [None]:
class ResponseFunction:
    """Blend a heuristic matrix, a model matrix and a noise matrix.

    Calling the instance with weights ``a1`` and ``a2`` returns
    ``a1 * heu_matrix + a2 * nn_matrix + a3 * noise_matrix`` with
    ``a3 = max(0, 1 - a1 - a2)``.

    The noise term is the ``noise_matrix`` supplied at construction time, so
    repeated calls (and calls made from different worker processes that were
    handed the same matrix) see the same noise realisation.

    Parameters
    ----------
    heu_matrix, nn_matrix, noise_matrix
        Objects exposing ``.shape`` and element-wise arithmetic; all three
        must have the same shape.
    """

    def __init__(self, heu_matrix, nn_matrix, noise_matrix):
        assert heu_matrix.shape == nn_matrix.shape
        assert noise_matrix.shape == heu_matrix.shape
        self._heu_matrix = heu_matrix
        self._nn_matrix = nn_matrix
        self._noise_matrix = noise_matrix

    def __call__(self, a1: float, a2: float):
        """Return the weighted blend for the weights ``a1`` and ``a2``."""
        # The noise weight is whatever is left of the unit budget, never negative.
        a3 = max(0.0, 1 - a1 - a2)
        return (
            a1 * self._heu_matrix
            + a2 * self._nn_matrix
            + a3 * self._noise_matrix
        )
    

class DeepFMDataLoader:
    """Prepare a feature table and the matching feature columns for DeepFM."""

    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features

    def load(self, dataset):
        """Encode ``dataset`` and describe its features.

        Sparse columns are label-encoded and dense columns are min-max scaled
        to ``[0, 1]``. Both transformations are fitted on ``dataset`` itself.

        Returns a tuple ``(nn_input, dnn_feat_cols, linear_feat_cols,
        feat_names)``.
        """
        sparse_names = self._sparse_feats
        dense_names = self._dense_feats

        nn_input = pd.DataFrame()
        nn_input[sparse_names] = dataset[sparse_names]
        nn_input[dense_names] = dataset[dense_names]

        # NOTE(review): a new encoder is fitted on every call, so the integer
        # codes produced by two separate calls are not guaranteed to agree.
        for name in sparse_names:
            nn_input[name] = LabelEncoder().fit_transform(nn_input[name])

        scaler = MinMaxScaler(feature_range=(0, 1))
        nn_input[dense_names] = scaler.fit_transform(nn_input[dense_names])

        # Vocabulary size is the number of distinct codes seen in this dataset.
        sparse_feature_columns = []
        for name in sparse_names:
            sparse_feature_columns.append(
                SparseFeat(name, vocabulary_size=nn_input[name].nunique(), embedding_dim=4)
            )
        dense_feature_columns = [DenseFeat(name, 1) for name in dense_names]

        dnn_feat_cols = sparse_feature_columns + dense_feature_columns
        linear_feat_cols = sparse_feature_columns + dense_feature_columns

        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        return nn_input, dnn_feat_cols, linear_feat_cols, feat_names
      
        
def merge_feats(feats_a, feats_b):
    """Pick, position by position, the feature with the larger vocabulary.

    Both lists must have the same length. For each pair the feature whose
    ``vocabulary_size`` is larger is kept (``feats_a`` wins ties).

    NOTE(review): pairs whose ``feats_a`` entry is a ``DenseFeat`` are left
    out of the result entirely, so the returned list holds no dense
    features. Confirm that this is intended.
    """
    assert len(feats_a) == len(feats_b)
    return [
        feat_a if feat_a.vocabulary_size >= feat_b.vocabulary_size else feat_b
        for feat_a, feat_b in zip(feats_a, feats_b)
        if not isinstance(feat_a, DenseFeat)
    ]
            

class NNModelWrapper:
    """Adapter that turns a trained model's predictions into a rating matrix."""

    def __init__(self, trained_nn):
        self._nn = trained_nn

    def predict_rating_matrix(self, nn_input, merged_df):
        """Predict a rating per row of ``nn_input`` and pivot to user x item.

        ``merged_df`` supplies the ``user_id`` and ``item_id`` columns; they are
        attached to the predictions by index alignment.
        """
        predictions = self._nn.predict(nn_input)

        long_table = pd.DataFrame()
        # Flatten the prediction array to one value per row.
        long_table["rating"] = predictions.reshape(len(predictions))
        for key in ("user_id", "item_id"):
            long_table[key] = merged_df[key]

        return long_table.pivot(index="user_id", columns="item_id", values="rating")
    

def _cross_join(df1, df2):
    df1["_join_key"] = 0
    df2["_join_key"] = 0
    merged_df = df1.merge(df2, on="_join_key")
    merged_df = merged_df.drop("_join_key", axis=1)
    return merged_df


def rating_matrix_to_long_table(rating_matrix):
    """Convert a user x item rating matrix to ``(user_id, item_id, rating)`` rows.

    Row labels become ``user_id`` and column labels become ``item_id``.
    The input is copied first so that adding the ``user_id`` column cannot
    leak into a DataFrame passed by the caller.
    """
    df = pd.DataFrame(rating_matrix).copy()
    df["user_id"] = df.index
    return df.melt(id_vars=["user_id"], var_name="item_id", value_name="rating")



## Plan

- Load and clean the data;
- Generate a synthetic dataset;
- Fit and evaluate DeepFM model on synthetic data;
- Create a similarity function between user and item;
- Create a rating matrix based on similarity function output;
- Create a rating matrix based on DeepFM output;
- Fit an SVD model to the matrix generated by `a1 * sim(u, i) + a2 * deepfm(u, i) + a3 * N(0, 1)`;
- Display plots;

# AutoRec model

## Data loading and cleaning

In [None]:
def prepare_user_profile_df(df):
    """Clean the user profile table.

    Drops the ``latitude`` and ``longitude`` columns, treats ``"?"`` as a
    missing value and fills missing values from the next row (backward fill).
    Missing values with no later value below them stay missing.

    Returns a new DataFrame; ``df`` is not modified.
    """
    return (
        df.drop(["latitude", "longitude"], axis=1)
        .replace("?", pd.NA)
        # `.bfill()` is the non-deprecated spelling of fillna(method="bfill").
        .bfill()
    )


def prepare_user_cuisine_df(df):
    """One-hot encode ``Rcuisine`` and aggregate to one row per ``userID``.

    Exact duplicate rows are removed first so that a repeated
    (user, cuisine) pair is counted once. The result is indexed by
    ``userID`` with one column per cuisine.
    """
    # drop_duplicates returns a new frame; the result must be kept.
    df = df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop("Rcuisine", axis=1)
    return df.groupby("userID").sum()


def load_and_clean_users_df():
    """Read the user profile and user cuisine CSVs, clean both, and join them.

    The two cleaned tables are merged on ``userID``.
    """
    profile_raw = pd.read_csv("../data/restaurant_data/userprofile.csv")
    cuisine_raw = pd.read_csv("../data/restaurant_data/usercuisine.csv")

    profile = prepare_user_profile_df(profile_raw)
    cuisine = prepare_user_cuisine_df(cuisine_raw)
    return profile.merge(cuisine, on="userID")



In [None]:
# Build the per-user feature table (profile columns plus cuisine indicators)
# and display it.
users_df = load_and_clean_users_df()
users_df

In [None]:
def load_and_prepare_rest_cuisine_df():
    """Read the restaurant cuisine CSV and one-hot encode it per ``placeID``.

    Duplicate rows are dropped, ``Rcuisine`` is expanded into indicator
    columns, and the indicators are summed for each ``placeID``.
    """
    cuisine = pd.read_csv("../data/restaurant_data/chefmozcuisine.csv").drop_duplicates()
    indicators = pd.get_dummies(cuisine["Rcuisine"])
    return (
        cuisine.join(indicators)
        .drop("Rcuisine", axis=1)
        .groupby("placeID")
        .sum()
    )
    


In [None]:
# Build the per-restaurant cuisine indicator table (indexed by placeID)
# and display it.
rests_df = load_and_prepare_rest_cuisine_df()
rests_df

In [None]:
# Load the ratings table from the reformatted data directory and display it.
ratings_df = pd.read_csv("../data/restaurant_data_reformatted/ratings.csv")
ratings_df

In [None]:
# Attach user features and then restaurant features to every rating.
# pd.merge defaults to an inner join, so ratings whose userID or placeID has
# no matching feature row are dropped.
merged_df = pd.merge(ratings_df, users_df, on="userID")
merged_df = pd.merge(merged_df, rests_df, on="placeID")
merged_df

In [None]:
# Inspect the column dtypes of the user feature table.
users_df.dtypes

## Generate syn data

In [None]:
def fit_syn_generator(df):
    """Fit a ``GaussianCopula`` generator on ``df`` without its id columns.

    ``userID`` and ``placeID`` are dropped, and a cast to ``int64`` is
    attempted with ``errors="ignore"`` so data that cannot be cast is left
    as it is. The model is fitted on a copy of the prepared frame.
    """
    generator = GaussianCopula()
    features = (
        df.drop(["userID", "placeID"], axis=1)  # ids are not features
        .astype("int64", errors="ignore")
    )
    generator.fit(features.copy())
    return generator



In [None]:
# Fit the synthetic-data generator on the merged ratings/features table.
syn_data_generator = fit_syn_generator(merged_df)

In [None]:
# Draw 10,000 synthetic rows from the fitted generator and preview them.
# NOTE(review): no random seed is set in the visible cells, so the sample
# presumably differs between runs - confirm.
syn_merged_df = syn_data_generator.sample(10_000)
syn_merged_df.head()

## Fit DeepFM

In [None]:
from deepctr_torch.models import DeepFM


class DeepFmModel:
    """Thin wrapper around a ``DeepFM`` network compiled with Adam and MSE."""

    def __init__(self, linear_feature_columns, dnn_feature_columns, feature_names):
        self._linear_feature_columns = linear_feature_columns
        self._dnn_feature_columns = dnn_feature_columns
        self._feature_names = feature_names

        network = DeepFM(
            linear_feature_columns,
            dnn_feature_columns,
            task="multiclass",
            device="cpu",
        )
        network.compile("adam", "mse", metrics=["mse"])
        self._deepfm = network

    def _model_input(self, frame):
        """Map every configured feature name to its column in ``frame``."""
        return {name: frame[name] for name in self._feature_names}

    def train(self, train_set, target_values):
        """Fit on ``train_set`` against ``target_values``; return ``fit``'s result.

        Uses 10 epochs, batch size 256 and a 20% validation split.
        """
        return self._deepfm.fit(
            self._model_input(train_set),
            target_values,
            batch_size=256,
            epochs=10,
            verbose=2,
            validation_split=0.2,
        )

    def predict(self, test_set):
        """Return the network's predictions for the rows of ``test_set``."""
        return self._deepfm.predict(self._model_input(test_set), batch_size=256)

    #TODO: add evaluate() method
    


In [None]:
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names



In [None]:
# Continuous columns are scaled; every other non-target column is treated as
# a categorical (sparse) feature.
dense_feat_names = ["height", "weight", "birth_year"]
sparse_feat_names = [
    col
    for col in syn_merged_df.columns
    if col not in {"rating", "food_rating", "service_rating", "weight", "height", "birth_year"}
]

In [None]:
# Encode the synthetic training table and derive the DeepFM feature columns.
data_loader = DeepFMDataLoader(sparse_features=sparse_feat_names, dense_features=dense_feat_names)
nn_train_input, dnn_feats, lin_feats, feat_names = data_loader.load(syn_merged_df)

In [None]:
def nn_prepare_data_for_rating_matrix(users_df, rests_df):
    """Build one row per (user, restaurant) pair with positional ids.

    ``userID`` is dropped from the user table; users and restaurants are
    numbered ``0..n-1`` in row order as ``user_id`` and ``item_id``. The two
    tables are then cross-joined.

    Neither argument is modified: the id columns are added to new frames.
    """
    users = users_df.drop("userID", axis=1)
    users = users.assign(user_id=range(len(users)))
    rests = rests_df.assign(item_id=range(len(rests_df)))
    return _cross_join(users, rests)

user_rest_long_table = nn_prepare_data_for_rating_matrix(users_df.copy(), rests_df.copy())

In [None]:
# Encode the full user x restaurant table with the same loader configuration.
# The loader refits its encoders and scaler on this table.
nn_user_rest_long_table, _dnn_feats, _lin_feats, _feat_names = data_loader.load(user_rest_long_table)

In [None]:
# For each sparse feature keep the definition with the larger vocabulary, so
# the embeddings can hold the codes of both the training and prediction tables.
_merged_feats = merge_feats(dnn_feats, _dnn_feats)

In [None]:
def train_deepfm(feats, feat_names, x, y):
    """Train a ``DeepFmModel`` on a random 80% of ``(x, y)`` and return it.

    ``x`` and ``y`` are split together, so every training row keeps its own
    target. Taking the first ``len(train_set)`` targets after shuffling only
    the features would pair rows with unrelated labels.

    Parameters
    ----------
    feats : feature columns used for both the linear and the DNN part.
    feat_names : names of the input columns fed to the model.
    x : feature table, one row per sample.
    y : targets aligned row-by-row with ``x``.
    """
    # Imported locally because this notebook also binds the name
    # `train_test_split` to the Surprise function of the same name.
    from sklearn.model_selection import train_test_split as split_rows

    deepfm = DeepFmModel(feats, feats, feat_names)
    x_train, _x_holdout, y_train, _y_holdout = split_rows(x, y, test_size=0.2)
    deepfm.train(x_train, target_values=y_train)
    return deepfm
    

In [None]:
# Train DeepFM on the synthetic data, using the synthetic "rating" as target.
deepfm = train_deepfm(_merged_feats, feat_names, x=nn_train_input, y=syn_merged_df["rating"].values)

In [None]:
# Predict a rating for every (user, restaurant) pair and pivot the result
# into a user x item matrix.
model_wrapper = NNModelWrapper(deepfm)
deepfm_rating_matrix = model_wrapper.predict_rating_matrix(nn_user_rest_long_table, user_rest_long_table)
deepfm_rating_matrix

In [None]:
# Round the predicted ratings to whole numbers.
deepfm_rating_matrix = np.around(deepfm_rating_matrix)
deepfm_rating_matrix

## Make similarity matrix

Creating a similarity matrix between users and restaurants based on users' food preference and restaurants' cuisines.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


def make_similarity_matrix(users_df, rests_df):
    """Cosine similarity between user cuisine preferences and restaurants.

    The user table is projected onto the columns of ``rests_df``. A cuisine
    that no user listed is absent from ``users_df``; it is treated as an
    all-zero preference column instead of raising ``KeyError``.

    Returns an array with one row per user and one column per restaurant.
    """
    users_food_pref_df = users_df.reindex(columns=rests_df.columns, fill_value=0)
    return cosine_similarity(users_food_pref_df, rests_df)



In [None]:
# User x restaurant cuisine similarity, multiplied by 2.
# NOTE(review): the factor presumably stretches the similarity onto the 0-2
# rating scale used by the Surprise readers below - confirm.
sim_matrix = make_similarity_matrix(users_df, rests_df)
sim_matrix = sim_matrix * 2

## Experiment

In [None]:
import itertools
import progressbar
from sklearn.model_selection import KFold
from surprise import Dataset, Reader, SVD, accuracy, KNNBasic
from surprise.model_selection import cross_validate, train_test_split



In [None]:
def _fit_and_score(algo, data, sample_frac):
    """Fit a Surprise algorithm on a random split of ``data`` and score it.

    ``sample_frac`` is the fraction of ratings used for training; the rest is
    the test set. Returns ``{"test_rmse": [rmse]}``.
    """
    # Imported locally because this notebook also binds the name
    # `train_test_split` to scikit-learn's function of the same name.
    from surprise.model_selection import train_test_split as surprise_split

    dataset = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], Reader(rating_scale=(0, 2)))
    train_set, test_set = surprise_split(dataset, test_size=1.0 - sample_frac)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    return {"test_rmse": [accuracy.rmse(predictions)]}


def _svd_train(data, sample_frac):
    """Train and evaluate an ``SVD`` model; see ``_fit_and_score``."""
    return _fit_and_score(SVD(), data, sample_frac)


def _knn_train(data, sample_frac):
    """Train and evaluate a ``KNNBasic`` model; see ``_fit_and_score``."""
    return _fit_and_score(KNNBasic(), data, sample_frac)



def _transform_long_table_to_sparse_matrix(self, df, test_size):
    """Split a long rating table and return train/test sparse matrices.

    Parameters
    ----------
    self : unused.
        NOTE(review): looks like a leftover from a method version of this
        function; kept so the signature is unchanged.
    df : table with ``user_id`` and ``item_id`` attributes; the first three
        columns are read positionally as user, item and rating.
    test_size : forwarded to scikit-learn's ``train_test_split``.

    Returns
    -------
    ``(train_matrix, test_matrix, n_users, n_items)`` with both matrices in
    DOK format and shape ``(n_users, n_items)``.
    """
    # Imported locally: `csr_matrix` is not imported anywhere in the notebook,
    # and the global `train_test_split` is rebound to the Surprise function,
    # which cannot split a DataFrame.
    from scipy.sparse import csr_matrix
    from sklearn.model_selection import train_test_split as split_rows

    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    train_data, test_data = split_rows(df, test_size=test_size)

    def _to_sparse(rows):
        """Build a (n_users, n_items) CSR matrix from long-format rows."""
        user_idx, item_idx, ratings = [], [], []
        # itertuples yields (index, col1, col2, col3, ...).
        for line in pd.DataFrame(rows).itertuples():
            # NOTE(review): the "- 1" assumes 1-based ids; with 0-based ids
            # it would produce a negative index - confirm against the caller.
            user_idx.append(line[1] - 1)
            item_idx.append(line[2] - 1)
            ratings.append(line[3])
        return csr_matrix((ratings, (user_idx, item_idx)), shape=(n_users, n_items))

    train_matrix = _to_sparse(train_data)
    test_matrix = _to_sparse(test_data)
    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix.todok(), test_matrix.todok(), n_users, n_items


def _train_autorec(data, sample_frac):
    """Train an ``IAutoRec`` model on a random split of ``data``.

    ``sample_frac`` is the fraction of rows used for training. Always
    returns ``None``.

    NOTE(review): ``tf``, ``config`` and ``IAutoRec`` are not defined or
    imported in any visible cell; this function cannot run until they are.
    """
    # The helper's first parameter is an unused `self` slot, so a placeholder
    # is passed for it; omitting it raises TypeError for the missing `df`.
    train_matrix, test_matrix, n_users, n_items = _transform_long_table_to_sparse_matrix(
        None, data, test_size=1.0 - sample_frac
    )
    with tf.Session(config=config) as sess:
        model = IAutoRec(sess, n_users, n_items)
        model.build_network()
        model.execute(train_matrix, test_matrix)
    return None
    


In [None]:
import typing as t
import itertools
import collections
import functools
from multiprocessing import Pool
import datetime


# Bundles the three matrices that ResponseFunction is constructed from, in
# the order of its constructor arguments, so they can be passed as one value.
ResponseFunctionParams = collections.namedtuple(
    "ResponseFunctionParams",
    ["sim_matrix", "deepfm_rating_matrix", "noise_matrix"]
)


# Grid resolution: integer weights are divided by COUNT, giving a1 and a2 in
# steps of 1 / COUNT.
COUNT = 10


def _iterate_a2(args, *, train_fn):
    """Run every ``a2`` grid point for one fixed ``a1``.

    ``args`` is a tuple ``(a1, sample_frac, resp_fn_params)`` where ``a1`` is
    the integer grid index. For each ``a2`` in ``0 .. COUNT - a1 - 1`` the
    blended rating matrix is built, converted to a long table and handed to
    ``train_fn``.

    Returns a list of ``(a1 / COUNT, a2 / COUNT, train_fn result)`` tuples.
    """
    a1, sample_frac, resp_fn_params = args
    a1_normalized = a1 / COUNT
    response_function = ResponseFunction(*resp_fn_params)

    results = []
    for a2 in range(COUNT - a1):
        a2_normalized = a2 / COUNT
        gt_long_table = rating_matrix_to_long_table(
            response_function(a1_normalized, a2_normalized)
        )
        train_error_log = train_fn(gt_long_table, sample_frac=sample_frac)
        results.append((a1_normalized, a2_normalized, train_error_log))
        print(f"-- Experiment: ({a1_normalized}, {a2_normalized})")
    return results


def _experiment(resp_fn_params, *, train_fn, sample_frac=0.5, n_processes=4):
    """Evaluate ``train_fn`` over the whole ``(a1, a2)`` grid in parallel.

    One task per ``a1`` index in ``0 .. COUNT - 1`` is mapped over a pool of
    ``n_processes`` workers. The wall-clock duration is printed, and the
    per-``a1`` result lists are flattened into a single list.
    """
    worker = functools.partial(_iterate_a2, train_fn=train_fn)
    procs_args = [(a1, sample_frac, resp_fn_params) for a1 in range(COUNT)]

    start_time = datetime.datetime.utcnow()
    with Pool(n_processes) as pool:
        per_a1_results = pool.map(worker, procs_args)
    calc_duration = datetime.datetime.utcnow() - start_time
    print(f"Total calcucation duration: {calc_duration}")

    return [row for chunk in per_a1_results for row in chunk]



In [None]:
def _run(train_fn):
    """Run the full grid experiment for ``train_fn`` and return its results.

    Draws one standard-normal noise matrix shaped like ``sim_matrix`` and
    passes it on together with ``sim_matrix`` and ``deepfm_rating_matrix``.
    """
    resp_fn_params = ResponseFunctionParams(
        sim_matrix,
        deepfm_rating_matrix,
        npr.normal(0, 1, size=sim_matrix.shape),
    )
    return _experiment(resp_fn_params, train_fn=train_fn)



In [None]:
# Run the weight-grid experiment with the SVD recommender.
svd_exp_results = _run(_svd_train)



In [None]:
# Run the weight-grid experiment with the KNN recommender.
knn_exp_results = _run(_knn_train)

In [None]:
# Collapse each KNN result to its mean test RMSE and tabulate it by (a1, a2).
_results = [(a1, a2, np.mean(res["test_rmse"])) for a1, a2, res in knn_exp_results]
knn_results_df = pd.DataFrame(_results, columns=["a1", "a2", "rmse"])

In [None]:
# Tag the rows with the model name so both result sets can share one plot.
knn_results_df["type"] = "knn"

In [None]:
# Display the KNN results table.
knn_results_df

In [None]:
# Collapse each SVD result to its mean test RMSE and tabulate it by (a1, a2).
_results = [(a1, a2, np.mean(res["test_rmse"])) for a1, a2, res in svd_exp_results]
svd_results_df = pd.DataFrame(_results, columns=["a1", "a2", "rmse"])

In [None]:
# Tag the rows with the model name so both result sets can share one plot.
svd_results_df["type"] = "svd"

In [None]:
# Display the SVD results table.
svd_results_df

In [None]:
# Stack the KNN and SVD results; the "type" column tells them apart.
final_df = pd.concat([knn_results_df, svd_results_df])

In [None]:
import plotly.express as px

# 3-D scatter of RMSE over the (a1, a2) weight grid, one colour per model.
# NOTE(review): "type" holds the strings "knn"/"svd", so the colouring is
# presumably discrete and `color_continuous_scale` has no effect; likewise
# `size_max` without a `size` column - confirm against the plotly docs.
fig = px.scatter_3d(
    final_df, 
    x='a1', 
    y='a2', 
    z='rmse', 
    size_max=8, 
    opacity=1,
    color="type",
    color_continuous_scale=px.colors.sequential.thermal[::-1]
)

# Tighten the margins around the figure.
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
)

fig.show()

In [None]:
# Stand-alone run of the grid experiment with a fresh noise matrix.
__noise_matrix = npr.normal(0, 1, size=sim_matrix.shape)
__resp_fn_params = ResponseFunctionParams(sim_matrix, deepfm_rating_matrix, __noise_matrix)
# `train_fn` is a required keyword-only argument of `_experiment`; calling
# without it raises TypeError.
# NOTE(review): `_svd_train` is assumed here - confirm which trainer this
# cell was meant to use.
_experiment_results = _experiment(__resp_fn_params, train_fn=_svd_train)