In [5]:
!pip install numpy==1.20.1 \
             sdv==0.8.0 \
             pandas \
             scipy \
             deepctr-torch \
             scikit-surprise






In [6]:
from sdv.tabular import GaussianCopula
import numpy as np
import pandas as pd
from numpy import random as npr




## Helper classes and functions

In [7]:
class ResponseFunction:
    """Weighted blend of a heuristic matrix, a model matrix and a noise matrix.

    Calling the instance with weights ``a1`` and ``a2`` returns
    ``a1 * heu_matrix + a2 * nn_matrix + a3 * noise`` where
    ``a3 = max(0, 1 - a1 - a2)``.

    Args:
        heu_matrix: heuristic scores; must have the same shape as ``nn_matrix``.
        nn_matrix: model scores.
        noise_matrix: noise term used for every call. Pass ``None`` to draw a
            fresh standard-normal sample on each call instead.
    """

    def __init__(self, heu_matrix, nn_matrix, noise_matrix):
        assert heu_matrix.shape == nn_matrix.shape
        self._heu_matrix = heu_matrix
        self._nn_matrix = nn_matrix
        self._noise_matrix = noise_matrix

    def __call__(self, a1: float, a2: float):
        """Return the blended matrix for the weights ``a1`` and ``a2``."""
        # Whatever weight is left after a1 and a2 goes to the noise term.
        a3 = max(0.0, 1 - a1 - a2)
        if self._noise_matrix is None:
            noise = npr.normal(0, 1, size=self._heu_matrix.shape)
        else:
            # Use the matrix supplied by the caller so that repeated calls
            # (and every worker process) see the same noise.
            noise = self._noise_matrix
        return (
            a1 * self._heu_matrix
            + a2 * self._nn_matrix
            + a3 * noise
        )
    

class DeepFMDataLoader:
    """Prepare a table for DeepFM: encoded/scaled inputs plus feature descriptors."""

    def __init__(self, *, sparse_features, dense_features):
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features

    def load(self, dataset):
        """Encode ``dataset`` and describe its features.

        Sparse columns are label-encoded and dense columns are min-max scaled
        to [0, 1]; both encoders are fitted on ``dataset`` itself.

        Returns:
            ``(nn_input, dnn_feat_cols, linear_feat_cols, feat_names)``
        """
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]

        # Categorical values -> integer codes, one fresh encoder per column.
        for name in self._sparse_feats:
            nn_input[name] = LabelEncoder().fit_transform(nn_input[name])

        scaler = MinMaxScaler(feature_range=(0,1))
        nn_input[self._dense_feats] = scaler.fit_transform(nn_input[self._dense_feats])

        # Original author's note: "problems may be here".
        # The vocabulary size is the number of distinct codes seen in *this* dataset.
        sparse_feature_columns = []
        for name in self._sparse_feats:
            sparse_feature_columns.append(
                SparseFeat(name, vocabulary_size=nn_input[name].nunique(), embedding_dim=4)
            )
        dense_feature_columns = [DenseFeat(name, 1) for name in self._dense_feats]

        # Two separate lists with the same content, one per model component.
        linear_feat_cols = sparse_feature_columns + dense_feature_columns
        dnn_feat_cols = sparse_feature_columns + dense_feature_columns

        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        return nn_input, dnn_feat_cols, linear_feat_cols, feat_names
      
        
def merge_feats(feats_a, feats_b):
    """Pick, position by position, the feature with the larger vocabulary.

    Positions where ``feats_a`` holds a ``DenseFeat`` are skipped, so the
    result contains no dense features. On a tie the entry from ``feats_a``
    is kept. Both lists must have the same length.
    """
    assert len(feats_a) == len(feats_b)
    return [
        left if left.vocabulary_size >= right.vocabulary_size else right
        for left, right in zip(feats_a, feats_b)
        if not isinstance(left, DenseFeat)
    ]
            

class NNModelWrapper:
    """Adapter that turns a trained model's predictions into a user x item matrix."""

    def __init__(self, trained_nn):
        self._nn = trained_nn

    def predict_rating_matrix(self, nn_input, merged_df):
        """Predict for ``nn_input`` and pivot the scores by user and item.

        ``merged_df`` supplies the ``user_id`` and ``item_id`` of each
        prediction; they are matched to the predictions by index label.
        """
        scores = self._nn.predict(nn_input)
        long_table = pd.DataFrame({"rating": scores.reshape((len(scores),))})
        long_table["user_id"] = merged_df["user_id"]
        long_table["item_id"] = merged_df["item_id"]
        return long_table.pivot(index="user_id", columns="item_id", values="rating")
    

def _cross_join(df1, df2):
    df1["_join_key"] = 0
    df2["_join_key"] = 0
    merged_df = df1.merge(df2, on="_join_key")
    merged_df = merged_df.drop("_join_key", axis=1)
    return merged_df


def rating_matrix_to_long_table(rating_matrix):
    """Unpivot a user x item matrix into ``user_id`` / ``item_id`` / ``rating`` rows.

    Row labels become ``user_id`` and column labels become ``item_id``.
    ``assign`` returns a new frame, so a DataFrame passed in as
    ``rating_matrix`` is never given an extra ``user_id`` column (wrapping an
    existing DataFrame with ``pd.DataFrame(...)`` alone may share its data).
    """
    wide = pd.DataFrame(rating_matrix).assign(user_id=lambda frame: frame.index)
    return wide.melt(id_vars=["user_id"], var_name="item_id", value_name="rating")



## Plan

- Load and clean the data;
- Generate a synthetic dataset;
- Fit and evaluate DeepFM model on synthetic data;
- Create a similarity function between user and item;
- Create a rating matrix based on similarity function output;
- Create a rating matrix based on DeepFM output;
- Fit a SVD model to the matrix generated by `a1 * sim(u, i) + a2 * deepfm(u, i) + a3 * N(0, 1)`;
- Display plots;

# AutoRec model

## Data loading and cleaning

In [8]:
def prepare_user_profile_df(df):
    """Clean the raw user-profile table.

    Drops the ``latitude`` / ``longitude`` columns, treats ``"?"`` as missing
    and back-fills missing values from the next row. Values missing at the
    very end of a column have no following row and therefore stay missing.
    """
    df = df.drop(["latitude", "longitude"], axis=1)
    df = df.replace("?", pd.NA)
    # DataFrame.bfill() is the non-deprecated spelling of fillna(method="bfill").
    df = df.bfill()
    return df


def prepare_user_cuisine_df(df):
    """One-hot encode ``Rcuisine`` and aggregate to one row per ``userID``.

    Exact duplicate rows are removed first so that a repeated
    (user, cuisine) pair does not push a flag above 1 when summed.
    """
    # drop_duplicates() returns a new frame; the result has to be kept.
    df = df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop("Rcuisine", axis=1)
    df = df.groupby("userID").sum()
    return df


def load_and_clean_users_df():
    """Read the user profile and user cuisine CSVs, clean both and join them on ``userID``."""
    profiles_raw = pd.read_csv("../data/restaurant_data/userprofile.csv")
    cuisines_raw = pd.read_csv("../data/restaurant_data/usercuisine.csv")

    profiles = prepare_user_profile_df(profiles_raw)
    cuisines = prepare_user_cuisine_df(cuisines_raw)

    return profiles.merge(cuisines, on="userID")



In [9]:
users_df = load_and_clean_users_df()
users_df

Unnamed: 0,userID,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,...,Swiss,Tapas,Tea_House,Tex-Mex,Thai,Tibetan,Tunisian,Turkish,Vegetarian,Vietnamese
0,U1001,false,abstemious,informal,family,on foot,single,independent,1989,variety,...,0,0,0,0,0,0,0,0,0,0
1,U1002,false,abstemious,informal,family,public,single,independent,1990,technology,...,0,0,0,0,0,0,0,0,0,0
2,U1003,false,social drinker,formal,family,public,single,independent,1989,none,...,0,0,0,0,0,0,0,0,0,0
3,U1004,false,abstemious,informal,family,public,single,independent,1940,variety,...,0,0,0,0,0,0,0,0,0,0
4,U1005,false,abstemious,no preference,family,public,single,independent,1992,none,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,U1134,false,casual drinker,no preference,family,public,single,independent,1991,variety,...,0,0,0,0,0,0,0,0,0,0
134,U1135,false,casual drinker,informal,family,on foot,single,kids,1988,variety,...,1,1,1,1,1,1,1,1,1,1
135,U1136,true,social drinker,no preference,friends,car owner,single,independent,1990,retro,...,0,0,0,0,0,0,0,0,0,0
136,U1137,false,social drinker,formal,family,public,single,independent,1989,eco-friendly,...,0,0,0,0,0,0,0,0,0,0


In [10]:
def load_and_prepare_rest_cuisine_df():
    """Read the restaurant cuisine CSV and return one row of cuisine flags per ``placeID``."""
    cuisines = pd.read_csv("../data/restaurant_data/chefmozcuisine.csv").drop_duplicates()
    one_hot = pd.get_dummies(cuisines["Rcuisine"])
    flagged = cuisines.join(one_hot).drop(columns="Rcuisine")
    return flagged.groupby("placeID").sum()
    


In [11]:
rests_df = load_and_prepare_rest_cuisine_df()
rests_df

Unnamed: 0_level_0,Afghan,African,American,Armenian,Asian,Bagels,Bakery,Bar,Bar_Pub_Brewery,Barbecue,...,Soup,Southern,Southwestern,Spanish,Steaks,Sushi,Thai,Turkish,Vegetarian,Vietnamese
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
ratings_df = pd.read_csv("../data/restaurant_data_reformatted/ratings.csv")
ratings_df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2
...,...,...,...,...,...
1156,U1043,132630,1,1,1
1157,U1011,132715,1,1,0
1158,U1068,132733,1,1,0
1159,U1068,132594,1,1,1


In [13]:
# Attach the user features to every rating row (matched on userID) ...
merged_df = pd.merge(ratings_df, users_df, on="userID")
# ... then the restaurant cuisine flags (matched on placeID). Column names that
# exist on both sides are disambiguated by pandas with _x / _y suffixes.
merged_df = pd.merge(merged_df, rests_df, on="placeID")
merged_df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,...,Soup_y,Southern_y,Southwestern_y,Spanish_y,Steaks_y,Sushi_y,Thai_y,Turkish_y,Vegetarian_y,Vietnamese_y
0,U1077,135085,2,2,2,false,social drinker,elegant,family,public,...,0,0,0,0,0,0,0,0,0,0
1,U1108,135085,1,2,1,false,abstemious,informal,solitary,public,...,0,0,0,0,0,0,0,0,0,0
2,U1081,135085,1,2,1,false,casual drinker,informal,family,public,...,0,0,0,0,0,0,0,0,0,0
3,U1001,135085,0,1,1,false,abstemious,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
4,U1056,135085,2,2,2,false,social drinker,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,U1006,132922,2,1,2,true,social drinker,no preference,friends,car owner,...,0,0,0,0,0,0,0,0,0,0
869,U1003,132937,2,2,1,false,social drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0
870,U1027,132937,1,1,1,true,social drinker,no preference,family,public,...,0,0,0,0,0,0,0,0,0,0
871,U1029,132937,1,1,1,true,casual drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0


In [14]:
users_df.dtypes

userID              object
smoker              object
drink_level         object
dress_preference    object
ambience            object
                     ...  
Tibetan              uint8
Tunisian             uint8
Turkish              uint8
Vegetarian           uint8
Vietnamese           uint8
Length: 120, dtype: object

## Generate syn data

In [15]:
def fit_syn_generator(df):
    """Fit a GaussianCopula on ``df`` without its id columns and return the fitted model.

    ``userID`` and ``placeID`` are dropped and the remaining columns are cast
    to int64 where the cast succeeds (``errors="ignore"``). The model is
    fitted on a copy of that table.
    """
    features = df.drop(columns=["userID", "placeID"])
    features = features.astype("int64", errors="ignore")
    model = GaussianCopula()
    model.fit(features.copy())
    return model



In [16]:
syn_data_generator = fit_syn_generator(merged_df)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  improvement from the last five Jacobian evaluations.
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale


In [17]:
syn_merged_df = syn_data_generator.sample(10_000)
syn_merged_df.head()

Unnamed: 0,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,...,Soup_y,Southern_y,Southwestern_y,Spanish_y,Steaks_y,Sushi_y,Thai_y,Turkish_y,Vegetarian_y,Vietnamese_y
0,0,0,0,False,social drinker,elegant,family,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,False,abstemious,formal,family,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,False,social drinker,elegant,friends,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
3,1,2,2,False,social drinker,elegant,family,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
4,2,2,2,False,social drinker,elegant,family,car owner,single,independent,...,0,0,0,0,0,0,0,0,0,0


## Fit DeepFM

In [18]:
from deepctr_torch.models import DeepFM


class DeepFmModel:
    """Small convenience wrapper around a ``DeepFM`` network compiled with Adam and MSE."""

    def __init__(self, linear_feature_columns, dnn_feature_columns, feature_names):
        self._linear_feature_columns = linear_feature_columns
        self._dnn_feature_columns = dnn_feature_columns
        self._feature_names = feature_names
        self._deepfm = DeepFM(
            self._linear_feature_columns,
            self._dnn_feature_columns,
            task='multiclass',
            device='cpu'
        )
        self._deepfm.compile("adam", "mse", metrics=['mse'], )

    def _as_model_input(self, table):
        """Map each configured feature name to its column of ``table``."""
        return {name: table[name] for name in self._feature_names}

    def train(self, train_set, target_values):
        """Fit for 10 epochs (batch size 256, 20% validation split) and return the fit history."""
        return self._deepfm.fit(
            self._as_model_input(train_set),
            target_values,
            batch_size=256,
            epochs=10,
            verbose=2,
            validation_split=0.2
        )

    def predict(self, test_set):
        """Return the network's predictions for ``test_set``."""
        return self._deepfm.predict(self._as_model_input(test_set), batch_size=256)

    #TODO: add evaluate() method
    


In [19]:
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names



In [20]:
# Columns fed to the network as continuous values.
dense_feat_names = ["height", "weight", "birth_year"]
# Every other column is treated as categorical, except the three rating targets.
sparse_feat_names = [
    column
    for column in syn_merged_df.columns
    if column not in (
        "rating",
        "food_rating",
        "service_rating",
        "weight",
        "height",
        "birth_year",
    )
]

In [21]:
data_loader = DeepFMDataLoader(sparse_features=sparse_feat_names, dense_features=dense_feat_names)
nn_train_input, dnn_feats, lin_feats, feat_names = data_loader.load(syn_merged_df)

In [22]:
def nn_prepare_data_for_rating_matrix(users_df, rests_df):
    """Build the table of every (user, restaurant) pair.

    ``userID`` is dropped; users and restaurants each receive a positional id
    (``user_id`` / ``item_id`` counting from 0). The id columns are added with
    ``assign``, so neither input frame is modified.
    """
    users = users_df.drop("userID", axis=1).assign(user_id=range(0, len(users_df)))
    rests = rests_df.assign(item_id=range(0, len(rests_df)))
    user_rest_long_table = _cross_join(users, rests)
    return user_rest_long_table

user_rest_long_table = nn_prepare_data_for_rating_matrix(users_df.copy(), rests_df.copy())

In [23]:
nn_user_rest_long_table, _dnn_feats, _lin_feats, _feat_names = data_loader.load(user_rest_long_table)

In [24]:
_merged_feats = merge_feats(dnn_feats, _dnn_feats)

In [25]:
def train_deepfm(feats, feat_names, x, y):
    """Train a DeepFmModel on a random 80% of ``x`` / ``y`` and return it.

    Args:
        feats: feature columns, used for both the linear and the DNN part.
        feat_names: names of the input columns to feed to the model.
        x: feature table, one row per sample.
        y: targets, aligned with the rows of ``x``.
    """
    # Imported locally under an alias: this notebook later rebinds the global
    # name ``train_test_split`` to Surprise's function of the same name.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    deepfm = DeepFmModel(feats, feats, feat_names)
    # Split features and targets together so each shuffled row keeps its own
    # label (slicing ``y[:len(train_set)]`` would pair rows with wrong targets).
    x_train, _x_test, y_train, _y_test = sk_train_test_split(x, y, test_size=0.2)
    deepfm.train(x_train, target_values=y_train)
    return deepfm
    

In [26]:
deepfm = train_deepfm(_merged_feats, feat_names, x=nn_train_input, y=syn_merged_df["rating"].values)

cpu
Train on 6400 samples, validate on 1600 samples, 25 steps per epoch
Epoch 1/10
9s - loss:  0.9045 - mse:  0.9045 - val_mse:  0.7194
Epoch 2/10
8s - loss:  0.7265 - mse:  0.7265 - val_mse:  0.6924
Epoch 3/10
8s - loss:  0.7195 - mse:  0.7195 - val_mse:  0.6918
Epoch 4/10
7s - loss:  0.7178 - mse:  0.7178 - val_mse:  0.7041
Epoch 5/10
10s - loss:  0.7194 - mse:  0.7194 - val_mse:  0.6959
Epoch 6/10
9s - loss:  0.7178 - mse:  0.7178 - val_mse:  0.6929
Epoch 7/10
9s - loss:  0.7162 - mse:  0.7162 - val_mse:  0.6924
Epoch 8/10
9s - loss:  0.7151 - mse:  0.7151 - val_mse:  0.6933
Epoch 9/10
8s - loss:  0.7172 - mse:  0.7172 - val_mse:  0.6930
Epoch 10/10
9s - loss:  0.7147 - mse:  0.7147 - val_mse:  0.6935


In [27]:
model_wrapper = NNModelWrapper(deepfm)
deepfm_rating_matrix = model_wrapper.predict_rating_matrix(nn_user_rest_long_table, user_rest_long_table)
deepfm_rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.650213,0.686209,0.703338,0.686209,0.679130,0.650213,0.650205,0.703338,0.703338,0.643245,...,0.650093,0.650093,0.643243,0.591489,0.657173,0.652611,0.657173,0.650322,0.616905,0.650314
1,0.736312,0.772736,0.789632,0.772736,0.765628,0.736312,0.736308,0.789632,0.789632,0.729325,...,0.736184,0.736184,0.729306,0.677303,0.743295,0.738885,0.743295,0.736418,0.701916,0.736424
2,0.759646,0.796144,0.812999,0.796144,0.789031,0.759646,0.759643,0.812999,0.812999,0.752655,...,0.759516,0.759516,0.752634,0.700587,0.766633,0.762249,0.766633,0.759751,0.725060,0.759760
3,0.639487,0.675151,0.692461,0.675151,0.668095,0.639487,0.639476,0.692461,0.692461,0.632534,...,0.639374,0.639374,0.632545,0.580984,0.646429,0.641749,0.646429,0.639600,0.607025,0.639580
4,0.731609,0.767911,0.784873,0.767911,0.760811,0.731609,0.731604,0.784873,0.784873,0.724627,...,0.731483,0.731483,0.724613,0.672682,0.738586,0.734132,0.738586,0.731716,0.697523,0.731718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,0.718732,0.755068,0.772012,0.755068,0.747966,0.718732,0.718727,0.772012,0.772012,0.711749,...,0.718605,0.718605,0.711733,0.659781,0.725710,0.721269,0.725710,0.718838,0.684557,0.718841
134,0.854365,0.889953,0.907305,0.889953,0.882902,0.854365,0.854353,0.907305,0.907305,0.847415,...,0.854253,0.854253,0.847428,0.795912,0.861303,0.856596,0.861303,0.854478,0.822096,0.854456
135,0.746169,0.782586,0.799485,0.782586,0.775479,0.746169,0.746165,0.799485,0.799485,0.739183,...,0.746042,0.746042,0.739164,0.687166,0.753152,0.748740,0.753152,0.746275,0.711790,0.746281
136,0.843034,0.879548,0.896395,0.879548,0.872434,0.843034,0.843031,0.896395,0.896395,0.836043,...,0.842904,0.842904,0.836021,0.783965,0.850023,0.845644,0.850023,0.843139,0.808409,0.843149


In [28]:
# Round the predicted scores to the nearest whole number.
deepfm_rating_matrix = np.around(deepfm_rating_matrix)
deepfm_rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
134,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
135,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
136,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Make similarity matrix

Creating a similarity matrix between users and restaurants based on users' food preference and restaurants' cuisines.

In [29]:
from sklearn.metrics.pairwise import cosine_similarity


def make_similarity_matrix(users_df, rests_df):
    """Cosine similarity between every user and every restaurant.

    Users are described by the columns of ``users_df`` that carry the same
    names as the columns of ``rests_df``. Rows of the result correspond to
    users, columns to restaurants.
    """
    cuisine_columns = rests_df.columns
    users_food_pref_df = users_df.loc[:, cuisine_columns]
    return cosine_similarity(users_food_pref_df, rests_df)



In [30]:
sim_matrix = make_similarity_matrix(users_df, rests_df)
# Double the similarities — presumably to stretch them towards the 0-2 rating
# scale used by the Reader objects further down; TODO confirm.
sim_matrix = sim_matrix * 2

## Experiment

In [31]:
import itertools
from sklearn.model_selection import KFold
from surprise import Dataset, Reader, SVD, accuracy, KNNBasic
from surprise.model_selection import cross_validate, train_test_split



In [32]:
def _surprise_holdout_rmse(algo, data, sample_frac):
    """Fit ``algo`` on a ``sample_frac`` share of ``data`` and report RMSE on the rest.

    ``data`` must provide ``user_id``, ``item_id`` and ``rating`` columns; the
    ratings are declared to Surprise as lying on a 0-2 scale.

    Returns:
        ``{"test_rmse": [rmse]}``
    """
    # Imported locally under an alias: the global name ``train_test_split`` is
    # bound to scikit-learn's function in one cell of this notebook and to
    # Surprise's in another, so it depends on which cell ran last.
    from surprise.model_selection import train_test_split as surprise_train_test_split

    dataset = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], Reader(rating_scale=(0, 2)))
    train_set, test_set = surprise_train_test_split(dataset, test_size=1.0 - sample_frac)
    algo.fit(train_set)
    predictions = algo.test(test_set)
    return {"test_rmse": [accuracy.rmse(predictions)]}


def _svd_train(data, sample_frac):
    """Hold-out RMSE of Surprise's SVD with default settings."""
    return _surprise_holdout_rmse(SVD(), data, sample_frac)


def _knn_train(data, sample_frac):
    """Hold-out RMSE of Surprise's KNNBasic with default settings."""
    return _surprise_holdout_rmse(KNNBasic(), data, sample_frac)



def _transform_long_table_to_sparse_matrix(self, df, test_size):
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    train_data, test_data = train_test_split(df, test_size=test_size)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    train_row = []
    train_col = []
    train_rating = []

    for line in train_data.itertuples():
        u = line[1] - 1
        i = line[2] - 1
        train_row.append(u)
        train_col.append(i)
        train_rating.append(line[3])
    train_matrix = csr_matrix((train_rating, (train_row, train_col)), shape=(n_users, n_items))

    test_row = []
    test_col = []
    test_rating = []
    for line in test_data.itertuples():
        test_row.append(line[1] - 1)
        test_col.append(line[2] - 1)
        test_rating.append(line[3])
    test_matrix = csr_matrix((test_rating, (test_row, test_col)), shape=(n_users, n_items))
    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix.todok(), test_matrix.todok(), n_users, n_items


def _train_autorec(data, sample_frac):
    """Split ``data`` and run IAutoRec on the resulting train/test matrices.

    ``sample_frac`` is the share of rows used for training. Always returns
    ``None``.

    NOTE(review): ``tf``, ``config`` and ``IAutoRec`` are not defined or
    imported in the visible part of this notebook — confirm where they are
    expected to come from.
    """
    # The helper's first parameter is an unused placeholder, hence the None.
    train_matrix, test_matrix, n_users, n_items = _transform_long_table_to_sparse_matrix(
        None, data, test_size=1.0 - sample_frac
    )
    with tf.Session(config=config) as sess:
        model = IAutoRec(sess, n_users, n_items)
        model.build_network()
        model.execute(train_matrix, test_matrix)
    return None
    


In [33]:
import typing as t
import itertools
import collections
import functools
from multiprocessing import Pool
import datetime


# Bundle of the three matrices that are unpacked into ResponseFunction(...).
ResponseFunctionParams = collections.namedtuple(
    "ResponseFunctionParams",
    ["sim_matrix", "deepfm_rating_matrix", "noise_matrix"]
)


# Grid resolution: the weights a1 and a2 are swept in steps of 1 / COUNT.
COUNT = 10


def _iterate_a2(args, *, train_fn):
    a1, sample_frac, resp_fn_params = args
    response_function = ResponseFunction(*resp_fn_params)
    results = []
    a1_normalized = a1 / COUNT
    for a2 in range(0, COUNT - a1):
        a2_normalized = a2 / COUNT
        ground_truth_matrix = response_function(a1_normalized, a2_normalized)
        gt_long_table = rating_matrix_to_long_table(ground_truth_matrix)
        train_error_log = train_fn(gt_long_table, sample_frac=sample_frac)
        
        results.append((a1_normalized, a2_normalized, train_error_log))
        print(f"-- Experiment: ({a1_normalized}, {a2_normalized})")
    return results


def _experiment(resp_fn_params, *, train_fn, sample_frac=0.5, n_processes=4):
    """Sweep the whole (a1, a2) grid with a process pool, one task per ``a1`` value.

    Prints the elapsed wall-clock time and returns the per-``a1`` result lists
    concatenated in ``a1`` order.
    """
    worker = functools.partial(_iterate_a2, train_fn=train_fn)
    procs_args = []
    for a1 in range(0, COUNT):
        procs_args.append((a1, sample_frac, resp_fn_params))

    start_time = datetime.datetime.utcnow()
    with Pool(n_processes) as pool:
        per_a1_results = pool.map(worker, procs_args)
    calc_duration = datetime.datetime.utcnow() - start_time
    print(f"Total calcucation duration: {calc_duration}")

    flattened = []
    for chunk in per_a1_results:
        flattened.extend(chunk)
    return flattened



## Test

In [34]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [86]:
import importlib
from modules import evaluator, models, trainers
import logging
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)

logging.basicConfig(level=logging.INFO)


# How the evaluator should build the response function: the ResponseFunction
# class plus its three constructor arguments (similarity matrix, DeepFM
# rating matrix and a freshly sampled standard-normal noise matrix).
_resp_fn_config = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction,
    args=[sim_matrix, deepfm_rating_matrix, npr.normal(0, 1, size=sim_matrix.shape)]
)

# Which train/test executor to run and how to label its results.
_train_test_executor_config = evaluator.TrainTestExecutorConfig(
    factory=trainers.AutoRecTrainTestExecutor,
    args=[{"epoch": 50}],
    model_name="autorec",
    config={}
)

_evaluator = evaluator.Evaluator(_resp_fn_config, n_proc=3)
_evaluator.evaluate(_train_test_executor_config, a_sample_rate=3, test_size=0.1)


Subprocess started.
Subprocess started.
Subprocess started.
Load data finished. Number of users:Load data finished. Number of users:  138138Load data finished. Number of users:   Number of items:Number of items:138   769769Number of items:

 769
IAutoRec.IAutoRec.IAutoRec.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Epoch: 0000; Epoch: 0000; Epoch: 0000; RMSE:0.6209295764509759; MAE:0.47829822176518644
RMSE:0.7214849470116564; MAE:0.5776631520570956RMSE:1.0091746320566461; MAE:0.8059792197281895

Epoch: 0003; Epoch: 0003; Epoch: 0003; RMSE:1.002675972373643; MAE:0.7973678230191994
RMSE:0.7172579309038003; MAE:0.5728795917183135
RMSE:0.6130045790910237; MAE:0.5135279826394118
Epoch: 0006; Epoch: 0006; Epoch: 0006; RMSE:0.9977089069189582; MAE:0.7949084968238793
RMSE:0.5687543309924747; MAE:0.4625056652773813
Epoch: 0009; Epoch: 0009; RMSE:0.707113960596351; MAE:0.5641183473209193
Epoch: 0009; RMSE:0.998292445937665; MAE:0.7962049567800802
RMSE:0.548324660721418; MAE:0.43296695229476284
Epoch: 0012; Epoch: 0012; RMSE:0.7056804018132014; MAE:0.5638046964138792
Epoch: 0012; RMSE:0.9976847120849648; MAE:0.7941745191710744
RMSE:0.5173875975533352; MAE:0.4147480139634921
Epoch: 0015; Epoch: 0015; RMSE:0.6988395246145456; MAE:0.558056239619473
Epoch: 0015; RMSE:0.99637607102545; MAE:0.7933553069639708
RMS

INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.0, train_test_log=[{'epoch': 0, 'rmse': 1.0091746320566461, 'mae': 0.8059792197281895}, {'epoch': 3, 'rmse': 1.002675972373643, 'mae': 0.7973678230191994}, {'epoch': 6, 'rmse': 0.9977089069189582, 'mae': 0.7949084968238793}, {'epoch': 9, 'rmse': 0.998292445937665, 'mae': 0.7962049567800802}, {'epoch': 12, 'rmse': 0.9976847120849648, 'mae': 0.7941745191710744}, {'epoch': 15, 'rmse': 0.99637607102545, 'mae': 0.7933553069639708}, {'epoch': 18, 'rmse': 0.9969651651167589, 'mae': 0.7945771466630592}, {'epoch': 21, 'rmse': 0.9963813709771275, 'mae': 0.7937988176004388}, {'epoch': 24, 'rmse': 0.9964795854233994, 'mae': 0.7935351361922357}, {'epoch': 27, 'rmse': 0.9961098202849923, 'mae': 0.79345566011358}, {'epoch': 30, 'rmse': 0.9962770696413109, 'mae': 0.7937840610215121}, {'epoch': 33, 'rmse': 0.9963848480928427, 'mae': 0.7937032174704809}, {'epoch': 36, 'rmse': 0.9964027564799852, 'mae': 0.7936299714668639}, {'epoch': 39, 'rmse': 0.9966447

RMSE:0.6688281968806853; MAE:0.5333557683628565
RMSE:0.3536376050173006; MAE:0.27771978540421305


INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.0, train_test_log=[{'epoch': 0, 'rmse': 0.7214849470116564, 'mae': 0.5776631520570956}, {'epoch': 3, 'rmse': 0.7172579309038003, 'mae': 0.5728795917183135}, {'epoch': 6, 'rmse': 0.707113960596351, 'mae': 0.5641183473209193}, {'epoch': 9, 'rmse': 0.7056804018132014, 'mae': 0.5638046964138792}, {'epoch': 12, 'rmse': 0.6988395246145456, 'mae': 0.558056239619473}, {'epoch': 15, 'rmse': 0.6923357868947594, 'mae': 0.5529161816257541}, {'epoch': 18, 'rmse': 0.6861508780986555, 'mae': 0.5482582934806114}, {'epoch': 21, 'rmse': 0.6792843530868649, 'mae': 0.5425774398250748}, {'epoch': 24, 'rmse': 0.6737529813717897, 'mae': 0.5377006447353877}, {'epoch': 27, 'rmse': 0.6702116133156137, 'mae': 0.5344173465199576}, {'epoch': 30, 'rmse': 0.6701529271437001, 'mae': 0.534149115725382}, {'epoch': 33, 'rmse': 0.6716517570437686, 'mae': 0.5352264501841643}, {'epoch': 36, 'rmse': 0.671296768880246, 'mae': 0.5345912307440458}, {'epoch': 39, 

Load data finished. Number of users: 138 Number of items: 769
IAutoRec.Load data finished. Number of users:
 138 Number of items: 769
IAutoRec.
Epoch: 0000; Epoch: 0000; RMSE:0.6776479688830166; MAE:0.5402199023522973
Epoch: 0003; RMSE:0.4971967740412901; MAE:0.3876901882433194
Epoch: 0003; RMSE:0.6962997185501392; MAE:0.5567702921377784
Epoch: 0006; RMSE:0.48070143331095094; MAE:0.39401996057119826
Epoch: 0006; RMSE:0.6711857106965515; MAE:0.5346757455812894
Epoch: 0009; RMSE:0.4567361690566636; MAE:0.3691715185680308
Epoch: 0009; RMSE:0.6760166268437913; MAE:0.5386782299857276
Epoch: 0012; RMSE:0.4321002904609687; MAE:0.3431565302576195
Epoch: 0012; RMSE:0.6716271981024303; MAE:0.5354206946499
Epoch: 0015; RMSE:0.43496745004803095; MAE:0.34227452619199095
Epoch: 0015; RMSE:0.6706534711159501; MAE:0.5348771863281
Epoch: 0018; RMSE:0.4231548845760664; MAE:0.335730006935069
Epoch: 0018; RMSE:0.6696682321755875; MAE:0.5340738742117552
Epoch: 0021; RMSE:0.40980497712889624; MAE:0.32882720

INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.3333333333333333, train_test_log=[{'epoch': 0, 'rmse': 0.6776479688830166, 'mae': 0.5402199023522973}, {'epoch': 3, 'rmse': 0.6962997185501392, 'mae': 0.5567702921377784}, {'epoch': 6, 'rmse': 0.6711857106965515, 'mae': 0.5346757455812894}, {'epoch': 9, 'rmse': 0.6760166268437913, 'mae': 0.5386782299857276}, {'epoch': 12, 'rmse': 0.6716271981024303, 'mae': 0.5354206946499}, {'epoch': 15, 'rmse': 0.6706534711159501, 'mae': 0.5348771863281}, {'epoch': 18, 'rmse': 0.6696682321755875, 'mae': 0.5340738742117552}, {'epoch': 21, 'rmse': 0.6680362685699176, 'mae': 0.5327228266952608}, {'epoch': 24, 'rmse': 0.6681349556078922, 'mae': 0.5325275943030018}, {'epoch': 27, 'rmse': 0.6673739480895117, 'mae': 0.5319594465217764}, {'epoch': 30, 'rmse': 0.6671960604075601, 'mae': 0.5320231003684084}, {'epoch': 33, 'rmse': 0.6671614861180014, 'mae': 0.5320528954893889}, {'epoch': 36, 'rmse': 0.666902839521067, 'mae': 0.5317925868931295}, {'epoch': 39, 'rm

RMSE:0.3483779460614694; MAE:0.2775829540445555


INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.3333333333333333, train_test_log=[{'epoch': 0, 'rmse': 0.4971967740412901, 'mae': 0.3876901882433194}, {'epoch': 3, 'rmse': 0.48070143331095094, 'mae': 0.39401996057119826}, {'epoch': 6, 'rmse': 0.4567361690566636, 'mae': 0.3691715185680308}, {'epoch': 9, 'rmse': 0.4321002904609687, 'mae': 0.3431565302576195}, {'epoch': 12, 'rmse': 0.43496745004803095, 'mae': 0.34227452619199095}, {'epoch': 15, 'rmse': 0.4231548845760664, 'mae': 0.335730006935069}, {'epoch': 18, 'rmse': 0.40980497712889624, 'mae': 0.32882720765335416}, {'epoch': 21, 'rmse': 0.40478523778481507, 'mae': 0.3255017333323468}, {'epoch': 24, 'rmse': 0.39690815632841575, 'mae': 0.3179123928563093}, {'epoch': 27, 'rmse': 0.38887592963109247, 'mae': 0.31004818686072355}, {'epoch': 30, 'rmse': 0.38200025017861217, 'mae': 0.3043875456533075}, {'epoch': 33, 'rmse': 0.3744929029817799, 'mae': 0.2986923538309723}, {'epoch': 36, 'rmse': 0.36859391998449687, 'mae': 0.294

Load data finished. Number of users: 138 Number of items: 769
IAutoRec.
Epoch: 0000; RMSE:0.5464539463059988; MAE:0.45610340926869913
Epoch: 0003; RMSE:0.41757971576116937; MAE:0.3390796672028306
Epoch: 0006; RMSE:0.43672531470652054; MAE:0.35553237050075576
Epoch: 0009; RMSE:0.35593621147070126; MAE:0.28462157192149723
Epoch: 0012; RMSE:0.37756840562057686; MAE:0.30146857001310895
Epoch: 0015; RMSE:0.3775129474717222; MAE:0.3014870789137788
Epoch: 0018; RMSE:0.353138043379402; MAE:0.2817187086117489
Epoch: 0021; RMSE:0.3505089322350133; MAE:0.28032555507753826
Epoch: 0024; RMSE:0.35432506217197235; MAE:0.2837358555546433
Epoch: 0027; RMSE:0.34628661203035266; MAE:0.2774344950634135
Epoch: 0030; RMSE:0.3442600241434685; MAE:0.27523820111101827
Epoch: 0033; RMSE:0.34628790397422626; MAE:0.27629038823672997
Epoch: 0036; RMSE:0.34387439255452695; MAE:0.27443938803686413
Epoch: 0039; RMSE:0.3422236294516138; MAE:0.2733790856995451
Epoch: 0042; RMSE:0.3431618577061516; MAE:0.274367228388853

INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.6666666666666666, train_test_log=[{'epoch': 0, 'rmse': 0.5464539463059988, 'mae': 0.45610340926869913}, {'epoch': 3, 'rmse': 0.41757971576116937, 'mae': 0.3390796672028306}, {'epoch': 6, 'rmse': 0.43672531470652054, 'mae': 0.35553237050075576}, {'epoch': 9, 'rmse': 0.35593621147070126, 'mae': 0.28462157192149723}, {'epoch': 12, 'rmse': 0.37756840562057686, 'mae': 0.30146857001310895}, {'epoch': 15, 'rmse': 0.3775129474717222, 'mae': 0.3014870789137788}, {'epoch': 18, 'rmse': 0.353138043379402, 'mae': 0.2817187086117489}, {'epoch': 21, 'rmse': 0.3505089322350133, 'mae': 0.28032555507753826}, {'epoch': 24, 'rmse': 0.35432506217197235, 'mae': 0.2837358555546433}, {'epoch': 27, 'rmse': 0.34628661203035266, 'mae': 0.2774344950634135}, {'epoch': 30, 'rmse': 0.3442600241434685, 'mae': 0.27523820111101827}, {'epoch': 33, 'rmse': 0.34628790397422626, 'mae': 0.27629038823672997}, {'epoch': 36, 'rmse': 0.34387439255452695, 'mae': 0.274439388036864

[{'duration': datetime.timedelta(seconds=46, microseconds=906388),
  'model_name': 'autorec',
  'model_config': {},
  'results': [EvaluationResult(a1=0.0, a2=0.0, train_test_log=[{'epoch': 0, 'rmse': 1.0091746320566461, 'mae': 0.8059792197281895}, {'epoch': 3, 'rmse': 1.002675972373643, 'mae': 0.7973678230191994}, {'epoch': 6, 'rmse': 0.9977089069189582, 'mae': 0.7949084968238793}, {'epoch': 9, 'rmse': 0.998292445937665, 'mae': 0.7962049567800802}, {'epoch': 12, 'rmse': 0.9976847120849648, 'mae': 0.7941745191710744}, {'epoch': 15, 'rmse': 0.99637607102545, 'mae': 0.7933553069639708}, {'epoch': 18, 'rmse': 0.9969651651167589, 'mae': 0.7945771466630592}, {'epoch': 21, 'rmse': 0.9963813709771275, 'mae': 0.7937988176004388}, {'epoch': 24, 'rmse': 0.9964795854233994, 'mae': 0.7935351361922357}, {'epoch': 27, 'rmse': 0.9961098202849923, 'mae': 0.79345566011358}, {'epoch': 30, 'rmse': 0.9962770696413109, 'mae': 0.7937840610215121}, {'epoch': 33, 'rmse': 0.9963848480928427, 'mae': 0.7937032174

In [68]:
# Scratch check: print 0..9 from a 4-worker pool to see how the output of
# parallel processes interleaves.
def fn(arg):
    print(arg)
    
import multiprocessing

with multiprocessing.Pool(4) as p:
    p.map(fn, [i for i in range(10)])

102

3
4
56
7

8
9



## Test

In [None]:
E## 

def _run(train_fn):
    noise_matrix = npr.normal(0, 1, size=sim_matrix.shape)
    resp_fn_params = ResponseFunctionParams(sim_matrix, deepfm_rating_matrix, noise_matrix)
    experiment_results = _experiment(resp_fn_params, train_fn=train_fn)
    return experiment_results



In [None]:
svd_exp_results = _run(_svd_train)



In [None]:
knn_exp_results = _run(_knn_train)

In [None]:
# One row per (a1, a2) pair with the mean of the reported test RMSE values (KNN runs).
_results = [(a1, a2, np.mean(res["test_rmse"])) for a1, a2, res in knn_exp_results]
knn_results_df = pd.DataFrame(_results, columns=["a1", "a2", "rmse"])

In [None]:
knn_results_df["type"] = "knn"

In [None]:
knn_results_df

In [None]:
# One row per (a1, a2) pair with the mean of the reported test RMSE values (SVD runs).
_results = [(a1, a2, np.mean(res["test_rmse"])) for a1, a2, res in svd_exp_results]
svd_results_df = pd.DataFrame(_results, columns=["a1", "a2", "rmse"])

In [None]:
svd_results_df["type"] = "svd"

In [None]:
svd_results_df

In [None]:
final_df = pd.concat([knn_results_df, svd_results_df])

In [None]:
import plotly.express as px

# 3-D scatter of RMSE over the (a1, a2) weight grid, one colour per value of
# the "type" column.
fig = px.scatter_3d(
    final_df, 
    x='a1', 
    y='a2', 
    z='rmse', 
    size_max=8, 
    opacity=1,
    color="type",
    color_continuous_scale=px.colors.sequential.thermal[::-1]
)

# Tighten the margins around the plot area.
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
)

fig.show()

In [None]:
# Ad-hoc run of the experiment outside of _run().
__noise_matrix = npr.normal(0, 1, size=sim_matrix.shape)
__resp_fn_params = ResponseFunctionParams(sim_matrix, deepfm_rating_matrix, __noise_matrix)
# train_fn is a required keyword-only argument of _experiment; SVD is the
# model named in the plan at the top of the notebook.
# NOTE(review): confirm SVD (rather than KNN) is the intended trainer here.
_experiment_results = _experiment(__resp_fn_params, train_fn=_svd_train)