In [1]:
!pip install numpy==1.20.1 \
             sdv==0.8.0 \
             pandas \
             scipy \
             deepctr-torch \
             scikit-surprise \
             plotly==4.14.3






In [2]:
from sdv.tabular import GaussianCopula
import numpy as np
import pandas as pd
from numpy import random as npr



## Helper classes and functions

In [3]:
class ResponseFunction:
    """Blend two rating matrices with Gaussian noise into a synthetic "ground truth".

    Calling the instance with weights ``a1`` and ``a2`` returns
    ``a1 * heu_matrix + a2 * nn_matrix + a3 * noise`` where
    ``a3 = max(0.0, 1 - a1 - a2)``.
    """

    def __init__(self, heu_matrix, nn_matrix, noise_matrix):
        """Store the three components of the response.

        Args:
            heu_matrix: first component matrix (weighted by ``a1``).
            nn_matrix: second component matrix (weighted by ``a2``); must have
                the same shape as ``heu_matrix``.
            noise_matrix: pre-sampled noise with the same shape as
                ``heu_matrix``, or ``None`` to draw fresh N(0, 1) noise on
                every call.

        Raises:
            AssertionError: if the shapes do not match.
        """
        assert heu_matrix.shape == nn_matrix.shape
        if noise_matrix is not None:
            assert np.shape(noise_matrix) == tuple(heu_matrix.shape)
        self._heu_matrix = heu_matrix
        self._nn_matrix = nn_matrix
        self._noise_matrix = noise_matrix

    def __call__(self, a1: float, a2: float):
        """Return the blended matrix for weights ``a1`` and ``a2``."""
        # Whatever weight is left over goes to the noise term (never negative).
        a3 = max(0.0, 1 - a1 - a2)
        if self._noise_matrix is None:
            noise = npr.normal(0, 1, size=self._heu_matrix.shape)
        else:
            # Use the noise supplied by the caller so that repeated calls (and
            # calls made from forked worker processes) see the same noise
            # instead of silently ignoring the constructor argument.
            noise = self._noise_matrix
        return (
            a1 * self._heu_matrix
            + a2 * self._nn_matrix
            + a3 * noise
        )
    

class DeepFMDataLoader:
    """Encode a feature table and describe its columns for a DeepFM model."""

    def __init__(self, *, sparse_features, dense_features):
        # Column names handled as categorical (sparse) and numeric (dense).
        self._sparse_feats = sparse_features
        self._dense_feats = dense_features

    def load(self, dataset):
        """Build the model input frame and the feature-column descriptors.

        Sparse columns are label-encoded and dense columns are min-max scaled
        to [0, 1]; both transformers are fitted on ``dataset`` itself.

        Returns:
            Tuple ``(nn_input, dnn_feat_cols, linear_feat_cols, feat_names)``.
        """
        nn_input = pd.DataFrame()
        nn_input[self._sparse_feats] = dataset[self._sparse_feats]
        nn_input[self._dense_feats] = dataset[self._dense_feats]

        for column in self._sparse_feats:
            nn_input[column] = LabelEncoder().fit_transform(nn_input[column])

        scaler = MinMaxScaler(feature_range=(0, 1))
        nn_input[self._dense_feats] = scaler.fit_transform(nn_input[self._dense_feats])

        # problems may be here
        # The vocabulary size is taken from the values present in this
        # particular dataset, so it can differ between two loaded datasets.
        sparse_columns = []
        for column in self._sparse_feats:
            cardinality = nn_input[column].nunique()
            sparse_columns.append(
                SparseFeat(column, vocabulary_size=cardinality, embedding_dim=4)
            )
        dense_columns = [DenseFeat(column, 1) for column in self._dense_feats]

        linear_feat_cols = sparse_columns + dense_columns
        dnn_feat_cols = sparse_columns + dense_columns

        feat_names = get_feature_names(linear_feat_cols + dnn_feat_cols)
        return nn_input, dnn_feat_cols, linear_feat_cols, feat_names
      
        
def merge_feats(feats_a, feats_b):
    """Pick, position by position, the feature with the larger vocabulary.

    The two lists are walked in parallel. Positions whose ``feats_a`` entry is
    a ``DenseFeat`` are left out of the result; for the others the entry with
    the larger ``vocabulary_size`` wins (``feats_a`` wins ties).

    Raises:
        AssertionError: if the lists differ in length.
    """
    assert len(feats_a) == len(feats_b)
    sparse_pairs = (
        (left, right)
        for left, right in zip(feats_a, feats_b)
        if not isinstance(left, DenseFeat)
    )
    return [
        left if left.vocabulary_size >= right.vocabulary_size else right
        for left, right in sparse_pairs
    ]
            

class NNModelWrapper:
    """Reshape a trained model's flat predictions into a user-by-item matrix."""

    def __init__(self, trained_nn):
        # Any object exposing ``predict(nn_input)`` that returns an array.
        self._nn = trained_nn

    def predict_rating_matrix(self, nn_input, merged_df):
        """Predict ratings and pivot them into a ``user_id`` x ``item_id`` frame.

        Args:
            nn_input: input passed unchanged to the wrapped model's ``predict``.
            merged_df: frame providing the ``user_id`` and ``item_id`` columns;
                they are matched to the predictions by index label.

        Returns:
            DataFrame indexed by ``user_id`` with one column per ``item_id``.
        """
        predictions = self._nn.predict(nn_input)
        long_table = pd.DataFrame(
            {"rating": predictions.reshape((len(predictions),))}
        )
        for id_column in ("user_id", "item_id"):
            long_table[id_column] = merged_df[id_column]
        return long_table.pivot(index="user_id", columns="item_id", values="rating")
    

def _cross_join(df1, df2):
    df1["_join_key"] = 0
    df2["_join_key"] = 0
    merged_df = df1.merge(df2, on="_join_key")
    merged_df = merged_df.drop("_join_key", axis=1)
    return merged_df


def rating_matrix_to_long_table(rating_matrix):
    """Unpivot a user-by-item rating matrix into ``(user_id, item_id, rating)`` rows.

    Args:
        rating_matrix: 2-D array or DataFrame; rows are users, columns are items.

    Returns:
        DataFrame with columns ``user_id`` (taken from the row index),
        ``item_id`` (taken from the column labels) and ``rating``.
    """
    # Copy first: wrapping an existing DataFrame may share its data, and adding
    # the helper column below must not leak into the caller's matrix.
    df = pd.DataFrame(rating_matrix, copy=True)
    df["user_id"] = df.index
    return df.melt(id_vars=["user_id"], var_name="item_id", value_name="rating")



## Plan

- Load and clean the data;
- Generate a synthetic dataset;
- Fit and evaluate DeepFM model on synthetic data;
- Create a similarity function between user and item;
- Create a rating matrix based on similarity function output;
- Create a rating matrix based on DeepFM output;
- Fit an SVD model to the matrix generated by `a1 * sim(u, i) + a2 * deepfm(u, i) + a3 * N(0, 1)`;
- Display plots;

# AutoRec model

## Data loading and cleaning

In [4]:
def prepare_user_profile_df(df):
    """Clean the raw user-profile table.

    Drops the ``latitude``/``longitude`` columns, treats ``"?"`` as a missing
    value and fills missing values from neighbouring rows.

    Args:
        df: raw user-profile frame; it is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    cleaned = df.drop(["latitude", "longitude"], axis=1).replace("?", pd.NA)
    # Back-fill first (the original behaviour), then forward-fill so that
    # missing values in the last rows, which have no later row to copy from,
    # are filled as well.
    return cleaned.bfill().ffill()


def prepare_user_cuisine_df(df):
    """One-hot encode each user's preferred cuisines, one row per user.

    Args:
        df: frame with ``userID`` and ``Rcuisine`` columns, one row per
            (user, cuisine) pair; it is not modified.

    Returns:
        DataFrame indexed by ``userID`` with one count column per cuisine.
    """
    # Keep the de-duplicated frame: the result used to be discarded, so a
    # repeated (user, cuisine) row was counted more than once in the sum below.
    df = df.drop_duplicates()
    df = df.join(pd.get_dummies(df["Rcuisine"]))
    df = df.drop("Rcuisine", axis=1)
    return df.groupby("userID").sum()


def load_and_clean_users_df():
    """Read the user profile and user cuisine CSV files and join them on ``userID``.

    Returns:
        The cleaned profile columns merged with the per-user cuisine columns.
    """
    raw_profiles = pd.read_csv("../data/restaurant_data/userprofile.csv")
    raw_cuisines = pd.read_csv("../data/restaurant_data/usercuisine.csv")

    return pd.merge(
        prepare_user_profile_df(raw_profiles),
        prepare_user_cuisine_df(raw_cuisines),
        on="userID",
    )



In [5]:
users_df = load_and_clean_users_df()
users_df

Unnamed: 0,userID,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,birth_year,interest,...,Swiss,Tapas,Tea_House,Tex-Mex,Thai,Tibetan,Tunisian,Turkish,Vegetarian,Vietnamese
0,U1001,false,abstemious,informal,family,on foot,single,independent,1989,variety,...,0,0,0,0,0,0,0,0,0,0
1,U1002,false,abstemious,informal,family,public,single,independent,1990,technology,...,0,0,0,0,0,0,0,0,0,0
2,U1003,false,social drinker,formal,family,public,single,independent,1989,none,...,0,0,0,0,0,0,0,0,0,0
3,U1004,false,abstemious,informal,family,public,single,independent,1940,variety,...,0,0,0,0,0,0,0,0,0,0
4,U1005,false,abstemious,no preference,family,public,single,independent,1992,none,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,U1134,false,casual drinker,no preference,family,public,single,independent,1991,variety,...,0,0,0,0,0,0,0,0,0,0
134,U1135,false,casual drinker,informal,family,on foot,single,kids,1988,variety,...,1,1,1,1,1,1,1,1,1,1
135,U1136,true,social drinker,no preference,friends,car owner,single,independent,1990,retro,...,0,0,0,0,0,0,0,0,0,0
136,U1137,false,social drinker,formal,family,public,single,independent,1989,eco-friendly,...,0,0,0,0,0,0,0,0,0,0


In [6]:
def load_and_prepare_rest_cuisine_df():
    """Read the restaurant cuisine CSV and one-hot encode it, one row per place.

    Returns:
        DataFrame indexed by ``placeID`` with one count column per cuisine.
    """
    cuisines = pd.read_csv("../data/restaurant_data/chefmozcuisine.csv").drop_duplicates()
    one_hot = pd.get_dummies(cuisines["Rcuisine"])
    encoded = cuisines.join(one_hot).drop("Rcuisine", axis=1)
    return encoded.groupby("placeID").sum()
    


In [7]:
rests_df = load_and_prepare_rest_cuisine_df()
rests_df

Unnamed: 0_level_0,Afghan,African,American,Armenian,Asian,Bagels,Bakery,Bar,Bar_Pub_Brewery,Barbecue,...,Soup,Southern,Southwestern,Spanish,Steaks,Sushi,Thai,Turkish,Vegetarian,Vietnamese
placeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135105,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135106,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
ratings_df = pd.read_csv("../data/restaurant_data_reformatted/ratings.csv")
ratings_df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating
0,U1077,135085,2,2,2
1,U1077,135038,2,2,1
2,U1077,132825,2,2,2
3,U1077,135060,1,2,2
4,U1068,135104,1,1,2
...,...,...,...,...,...
1156,U1043,132630,1,1,1
1157,U1011,132715,1,1,0
1158,U1068,132733,1,1,0
1159,U1068,132594,1,1,1


In [9]:
merged_df = pd.merge(ratings_df, users_df, on="userID")
merged_df = pd.merge(merged_df, rests_df, on="placeID")
merged_df

Unnamed: 0,userID,placeID,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,...,Soup_y,Southern_y,Southwestern_y,Spanish_y,Steaks_y,Sushi_y,Thai_y,Turkish_y,Vegetarian_y,Vietnamese_y
0,U1077,135085,2,2,2,false,social drinker,elegant,family,public,...,0,0,0,0,0,0,0,0,0,0
1,U1108,135085,1,2,1,false,abstemious,informal,solitary,public,...,0,0,0,0,0,0,0,0,0,0
2,U1081,135085,1,2,1,false,casual drinker,informal,family,public,...,0,0,0,0,0,0,0,0,0,0
3,U1001,135085,0,1,1,false,abstemious,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
4,U1056,135085,2,2,2,false,social drinker,informal,family,on foot,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,U1006,132922,2,1,2,true,social drinker,no preference,friends,car owner,...,0,0,0,0,0,0,0,0,0,0
869,U1003,132937,2,2,1,false,social drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0
870,U1027,132937,1,1,1,true,social drinker,no preference,family,public,...,0,0,0,0,0,0,0,0,0,0
871,U1029,132937,1,1,1,true,casual drinker,formal,family,public,...,0,0,0,0,0,0,0,0,0,0


In [10]:
users_df.dtypes

userID              object
smoker              object
drink_level         object
dress_preference    object
ambience            object
                     ...  
Tibetan              uint8
Tunisian             uint8
Turkish              uint8
Vegetarian           uint8
Vietnamese           uint8
Length: 120, dtype: object

## Generate synthetic data

In [11]:
def fit_syn_generator(df):
    """Fit a ``GaussianCopula`` synthesizer on ``df`` without its id columns.

    Args:
        df: frame containing ``userID`` and ``placeID`` columns plus features.

    Returns:
        The fitted ``GaussianCopula`` model.
    """
    generator = GaussianCopula()
    features = (
        df.drop(["userID", "placeID"], axis=1)  # Drop ids
        .astype("int64", errors="ignore")  # Convert all numbers to int64
    )
    generator.fit(features.copy())
    return generator



In [12]:
syn_data_generator = fit_syn_generator(merged_df)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  improvement from the last five Jacobian evaluations.
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale


In [13]:
syn_merged_df = syn_data_generator.sample(10_000)
syn_merged_df.head()

Unnamed: 0,rating,food_rating,service_rating,smoker,drink_level,dress_preference,ambience,transport,marital_status,hijos,...,Soup_y,Southern_y,Southwestern_y,Spanish_y,Steaks_y,Sushi_y,Thai_y,Turkish_y,Vegetarian_y,Vietnamese_y
0,2,2,2,False,social drinker,elegant,family,car owner,single,independent,...,0,0,0,0,0,0,0,0,0,0
1,1,2,2,False,social drinker,elegant,family,car owner,single,dependent,...,0,0,0,0,0,0,0,0,0,0
2,1,3,1,False,social drinker,no preference,friends,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,False,social drinker,elegant,family,public,single,independent,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,False,social drinker,informal,friends,public,single,independent,...,0,0,0,0,0,0,0,0,0,0


## Fit DeepFM

In [14]:
from deepctr_torch.models import DeepFM


class DeepFmModel:
    """Small convenience wrapper around the ``DeepFM`` model."""

    def __init__(self, linear_feature_columns, dnn_feature_columns, feature_names):
        """Create and compile the underlying model.

        Args:
            linear_feature_columns: feature columns for the linear part.
            dnn_feature_columns: feature columns for the deep part.
            feature_names: names of the input columns fed to the model.
        """
        self._linear_feature_columns = linear_feature_columns
        self._dnn_feature_columns = dnn_feature_columns
        self._feature_names = feature_names
        # NOTE(review): the task is 'multiclass' while the loss and metric are
        # "mse" -- confirm whether a regression task is what is intended.
        self._deepfm = DeepFM(
            self._linear_feature_columns,
            self._dnn_feature_columns,
            task='multiclass',
            device='cpu'
        )
        self._deepfm.compile("adam", "mse", metrics=['mse'], )

    def _select_inputs(self, frame):
        """Map each configured feature name to its column in ``frame``."""
        return {name: frame[name] for name in self._feature_names}

    def train(self, train_set, target_values):
        """Fit the model on ``train_set`` and return the fit history."""
        return self._deepfm.fit(
            self._select_inputs(train_set),
            target_values,
            batch_size=256,
            epochs=10,
            verbose=2,
            validation_split=0.2,
        )

    def predict(self, test_set):
        """Return the model's predictions for ``test_set``."""
        return self._deepfm.predict(self._select_inputs(test_set), batch_size=256)

    #TODO: add evaluate() method
    


In [15]:
import torch
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names



In [16]:
# Numeric columns that are scaled rather than label-encoded.
dense_feat_names = ["height", "weight", "birth_year"]
# Every remaining column, except the three rating targets and the dense
# columns, is treated as a sparse (categorical) feature.
sparse_feat_names = [
    column
    for column in syn_merged_df.columns
    if column not in (
        "rating",
        "food_rating",
        "service_rating",
        "weight",
        "height",
        "birth_year",
    )
]

In [17]:
data_loader = DeepFMDataLoader(sparse_features=sparse_feat_names, dense_features=dense_feat_names)
nn_train_input, dnn_feats, lin_feats, feat_names = data_loader.load(syn_merged_df)

In [18]:
def nn_prepare_data_for_rating_matrix(users_df, rests_df):
    """Build the long table of every (user, restaurant) pair.

    Users and restaurants each receive a positional id (``user_id`` and
    ``item_id``, counting from 0), and the two frames are then cross-joined.

    Args:
        users_df: user features including a ``userID`` column, which is dropped.
        rests_df: restaurant features.

    Returns:
        DataFrame with one row per (user, restaurant) combination.
    """
    # Work on derived frames so that neither argument is modified; the id
    # column used to be written straight into the caller's ``rests_df``.
    users = users_df.drop("userID", axis=1).assign(user_id=range(len(users_df)))
    rests = rests_df.assign(item_id=range(len(rests_df)))
    return _cross_join(users, rests)

user_rest_long_table = nn_prepare_data_for_rating_matrix(users_df.copy(), rests_df.copy())

In [19]:
nn_user_rest_long_table, _dnn_feats, _lin_feats, _feat_names = data_loader.load(user_rest_long_table)

In [20]:
_merged_feats = merge_feats(dnn_feats, _dnn_feats)

In [21]:
def train_deepfm(feats, feat_names, x, y):
    """Train a ``DeepFmModel`` on a random 80% of the samples.

    Args:
        feats: feature columns, used for both the linear and the deep part.
        feat_names: names of the input columns fed to the model.
        x: feature table, one row per sample.
        y: target values, aligned row by row with ``x``.

    Returns:
        The trained ``DeepFmModel``.
    """
    # Imported locally because a later cell rebinds the module-level name
    # ``train_test_split`` to the scikit-surprise function of the same name.
    from sklearn.model_selection import train_test_split as sk_train_test_split

    deepfm = DeepFmModel(feats, feats, feat_names)
    # Split features and targets together. The split shuffles the rows, so
    # taking the first len(train) targets would pair features with the wrong
    # labels. The held-out 20% is not used here.
    x_train, _x_test, y_train, _y_test = sk_train_test_split(x, y, test_size=0.2)
    deepfm.train(x_train, target_values=y_train)
    return deepfm
    

In [22]:
deepfm = train_deepfm(_merged_feats, feat_names, x=nn_train_input, y=syn_merged_df["rating"].values)

cpu
Train on 6400 samples, validate on 1600 samples, 25 steps per epoch
Epoch 1/10
12s - loss:  0.8689 - mse:  0.8689 - val_mse:  0.7387
Epoch 2/10
11s - loss:  0.7134 - mse:  0.7134 - val_mse:  0.7233
Epoch 3/10
10s - loss:  0.7088 - mse:  0.7088 - val_mse:  0.7229
Epoch 4/10
11s - loss:  0.7102 - mse:  0.7102 - val_mse:  0.7224
Epoch 5/10
8s - loss:  0.7072 - mse:  0.7072 - val_mse:  0.7226
Epoch 6/10
11s - loss:  0.7061 - mse:  0.7061 - val_mse:  0.7224
Epoch 7/10
8s - loss:  0.7047 - mse:  0.7047 - val_mse:  0.7226
Epoch 8/10
8s - loss:  0.7036 - mse:  0.7036 - val_mse:  0.7219
Epoch 9/10
9s - loss:  0.7086 - mse:  0.7086 - val_mse:  0.7224
Epoch 10/10
9s - loss:  0.7041 - mse:  0.7041 - val_mse:  0.7235


In [23]:
model_wrapper = NNModelWrapper(deepfm)
deepfm_rating_matrix = model_wrapper.predict_rating_matrix(nn_user_rest_long_table, user_rest_long_table)
deepfm_rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.555873,1.472042,1.546815,1.472042,1.465081,1.555873,1.555748,1.546815,1.546815,1.548782,...,1.515305,1.515305,1.508346,1.424263,1.522421,1.535065,1.522421,1.555844,1.521894,1.555670
1,1.448242,1.366669,1.440115,1.366669,1.359772,1.448242,1.448111,1.440115,1.440115,1.441212,...,1.413386,1.413386,1.406473,1.321023,1.420452,1.428397,1.420452,1.448192,1.415637,1.448040
2,1.397222,1.316377,1.389406,1.316377,1.309500,1.397222,1.397091,1.389406,1.389406,1.390212,...,1.364115,1.364115,1.357216,1.271390,1.371166,1.377684,1.371166,1.397166,1.365050,1.397021
3,1.410257,1.327333,1.401578,1.327333,1.320397,1.410257,1.410130,1.401578,1.401578,1.403191,...,1.371922,1.371922,1.364981,1.280397,1.379018,1.389834,1.379018,1.410220,1.376825,1.410054
4,1.519070,1.436971,1.510726,1.436971,1.430059,1.519070,1.518941,1.510726,1.510726,1.512026,...,1.482884,1.482884,1.475960,1.390829,1.489962,1.499001,1.489962,1.519025,1.486146,1.518868
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,1.370093,1.289429,1.362351,1.289429,1.282557,1.370093,1.369961,1.362351,1.362351,1.363088,...,1.337455,1.337455,1.330560,1.244614,1.344502,1.350633,1.344502,1.370035,1.338032,1.369892
134,0.833193,0.753605,0.825979,0.753605,0.746755,0.833193,0.833059,0.825979,0.825979,0.826217,...,0.802013,0.802013,0.795142,0.709538,0.809050,0.814151,0.809050,0.833127,0.801683,0.832984
135,1.488807,1.407052,1.480606,1.407052,1.400150,1.488807,1.488677,1.480606,1.480606,1.481772,...,1.453480,1.453480,1.446563,1.361231,1.460550,1.468884,1.460550,1.488759,1.456090,1.488605
136,1.374025,1.294025,1.366556,1.294025,1.287171,1.374025,1.373891,1.366556,1.366556,1.367037,...,1.343099,1.343099,1.336217,1.249841,1.350132,1.354848,1.350132,1.373961,1.342369,1.373824


In [24]:
deepfm_rating_matrix = np.around(deepfm_rating_matrix)
deepfm_rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
134,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
135,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
136,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Make similarity matrix

Creating a similarity matrix between users and restaurants based on users' food preference and restaurants' cuisines.

In [25]:
from sklearn.metrics.pairwise import cosine_similarity


def make_similarity_matrix(users_df, rests_df):
    """Cosine similarity between users' cuisine preferences and restaurants' cuisines.

    Only the columns present in ``rests_df`` are selected from ``users_df``.

    Returns:
        Array with one row per user and one column per restaurant.
    """
    cuisine_columns = rests_df.columns
    user_preferences = users_df[cuisine_columns]
    return cosine_similarity(user_preferences, rests_df)



In [26]:
sim_matrix = make_similarity_matrix(users_df, rests_df)
# NOTE(review): presumably rescales the similarities to the 0-2 rating scale
# used by the recommenders below (Reader(rating_scale=(0, 2))) -- confirm.
sim_matrix = sim_matrix * 2

## Experiment

In [27]:
import itertools
from sklearn.model_selection import KFold
from surprise import Dataset, Reader, SVD, accuracy, KNNBasic
from surprise.model_selection import cross_validate, train_test_split



In [28]:
def _svd_train(data, sample_frac):
    """Fit an SVD recommender on a fraction of ``data`` and score it on the rest.

    Args:
        data: long table with ``user_id``, ``item_id`` and ``rating`` columns.
        sample_frac: fraction of the rows used for fitting; the remainder is
            the test set.

    Returns:
        ``{"test_rmse": [rmse]}`` for the held-out rows.
    """
    reader = Reader(rating_scale=(0, 2))
    ratings = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], reader)
    fit_part, held_out = train_test_split(ratings, test_size=1.0 - sample_frac)

    model = SVD()
    model.fit(fit_part)
    rmse = accuracy.rmse(model.test(held_out))
    return {"test_rmse": [rmse]}

def _knn_train(data, sample_frac):
    """Fit a basic KNN recommender on a fraction of ``data`` and score it on the rest.

    Args:
        data: long table with ``user_id``, ``item_id`` and ``rating`` columns.
        sample_frac: fraction of the rows used for fitting; the remainder is
            the test set.

    Returns:
        ``{"test_rmse": [rmse]}`` for the held-out rows.
    """
    reader = Reader(rating_scale=(0, 2))
    ratings = Dataset.load_from_df(data[['user_id', 'item_id', 'rating']], reader)
    fit_part, held_out = train_test_split(ratings, test_size=1.0 - sample_frac)

    model = KNNBasic()
    model.fit(fit_part)
    rmse = accuracy.rmse(model.test(held_out))
    return {"test_rmse": [rmse]}



def _transform_long_table_to_sparse_matrix(self, df, test_size):
    """Split a long rating table and build train/test sparse matrices.

    NOTE(review): this is a module-level function, yet its first parameter is
    ``self``; the caller in ``_train_autorec`` passes only ``data`` and
    ``test_size`` -- confirm the intended signature.
    NOTE(review): ``csr_matrix`` is not imported in the visible cells
    (presumably ``scipy.sparse.csr_matrix``), and the ``train_test_split`` in
    scope at this point comes from ``surprise.model_selection`` -- confirm it
    accepts a DataFrame.

    Args:
        self: unused.
        df: table with ``user_id`` and ``item_id`` columns; the loops below
            read the first three columns positionally (assumed to be user id,
            item id, rating -- TODO confirm).
        test_size: forwarded to ``train_test_split``.

    Returns:
        ``(train_matrix, test_matrix, n_users, n_items)``; both matrices are
        converted with ``.todok()``.
    """
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]

    train_data, test_data = train_test_split(df, test_size=test_size)
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    train_row = []
    train_col = []
    train_rating = []

    # itertuples() puts the index at position 0, so positions 1-3 are the
    # first three columns of the frame.
    for line in train_data.itertuples():
        # NOTE(review): the "- 1" assumes 1-based ids; ids produced elsewhere
        # in this notebook start at 0 -- confirm.
        u = line[1] - 1
        i = line[2] - 1
        train_row.append(u)
        train_col.append(i)
        train_rating.append(line[3])
    train_matrix = csr_matrix((train_rating, (train_row, train_col)), shape=(n_users, n_items))

    test_row = []
    test_col = []
    test_rating = []
    for line in test_data.itertuples():
        test_row.append(line[1] - 1)
        test_col.append(line[2] - 1)
        test_rating.append(line[3])
    test_matrix = csr_matrix((test_rating, (test_row, test_col)), shape=(n_users, n_items))
    print("Load data finished. Number of users:", n_users, "Number of items:", n_items)
    return train_matrix.todok(), test_matrix.todok(), n_users, n_items


def _train_autorec(data, sample_frac):
    """Build train/test matrices from ``data`` and run an ``IAutoRec`` model on them.

    NOTE(review): ``tf``, ``config`` and ``IAutoRec`` are not defined in the
    visible cells, and the helper below is declared with a leading ``self``
    parameter that this call does not supply -- confirm before relying on it.

    Args:
        data: long rating table passed to the matrix helper.
        sample_frac: fraction kept for training; ``1.0 - sample_frac`` is
            passed on as the test size.

    Returns:
        Always ``None``.
    """
    train_matrix, test_matrix, n_users, n_items = _transform_long_table_to_sparse_matrix(data, test_size=1.0 - sample_frac)
    with tf.Session(config=config) as sess:
        model = IAutoRec(sess, n_users, n_items)
        model.build_network()
        model.execute(train_matrix, test_matrix)
    return None
    


In [29]:
import typing as t
import itertools
import collections
import functools
from multiprocessing import Pool
import datetime


ResponseFunctionParams = collections.namedtuple(
    "ResponseFunctionParams",
    ["sim_matrix", "deepfm_rating_matrix", "noise_matrix"]
)


COUNT = 10


def _iterate_a2(args, *, train_fn):
    """Run one experiment per ``a2`` value for a fixed ``a1``.

    Args:
        args: tuple ``(a1, sample_frac, resp_fn_params)`` where ``a1`` is an
            integer step in ``[0, COUNT)`` and ``resp_fn_params`` are the
            positional arguments for ``ResponseFunction``.
        train_fn: callable ``train_fn(long_table, sample_frac=...)`` returning
            an error log.

    Returns:
        List of ``(a1 / COUNT, a2 / COUNT, error_log)`` tuples for every
        ``a2`` in ``range(COUNT - a1)``.
    """
    a1, sample_frac, resp_fn_params = args
    blend = ResponseFunction(*resp_fn_params)
    a1_normalized = a1 / COUNT

    results = []
    for a2 in range(COUNT - a1):
        a2_normalized = a2 / COUNT
        long_table = rating_matrix_to_long_table(blend(a1_normalized, a2_normalized))
        error_log = train_fn(long_table, sample_frac=sample_frac)

        results.append((a1_normalized, a2_normalized, error_log))
        print(f"-- Experiment: ({a1_normalized}, {a2_normalized})")
    return results


def _experiment(resp_fn_params, *, train_fn, sample_frac=0.5, n_processes=4):
    """Run ``_iterate_a2`` for every ``a1`` step in a process pool.

    Args:
        resp_fn_params: positional arguments for ``ResponseFunction``.
        train_fn: training callable forwarded to ``_iterate_a2``.
        sample_frac: fraction of each table used for fitting.
        n_processes: size of the worker pool.

    Returns:
        Flat list of the result tuples produced by all workers.
    """
    worker = functools.partial(_iterate_a2, train_fn=train_fn)
    procs_args = [(a1, sample_frac, resp_fn_params) for a1 in range(COUNT)]

    started_at = datetime.datetime.utcnow()
    with Pool(n_processes) as pool:
        per_a1_results = pool.map(worker, procs_args)
    elapsed = datetime.datetime.utcnow() - started_at

    print(f"Total calcucation duration: {elapsed}")
    return list(itertools.chain.from_iterable(per_a1_results))



## Test

In [None]:
import os
import sys
# Make the parent directory importable so project modules can be found.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
# NOTE(review): `importlib`, `models`, `evaluator` and `trainers` are not
# imported anywhere in the visible notebook, so this cell fails on a fresh
# kernel -- add the imports (presumably from the project's own package).
importlib.reload(models)
importlib.reload(evaluator)
importlib.reload(trainers)


# Response function built from the similarity matrix, the DeepFM rating matrix
# and a freshly sampled N(0, 1) noise matrix of the same shape.
_resp_fn_config = evaluator.ResponseFunctionConfig(
    factory=ResponseFunction,
    args=[sim_matrix, deepfm_rating_matrix, npr.normal(0, 1, size=sim_matrix.shape)]
)

# One train/test executor configuration per recommender being compared.
_evaluators = [
    evaluator.TrainTestExecutorConfig(
        factory=trainers.AutoRecTrainTestExecutor,
        args={"config": {"epoch": 50}},
        model_name="autorec"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.SvdTrainTestExecutor,
        args={},
        model_name="svd"
    ),
    evaluator.TrainTestExecutorConfig(
        factory=trainers.KnnTrainTestExecutor,
        args={},
        model_name="knn"
    )
]

_evaluator = evaluator.Evaluator(_resp_fn_config, n_proc=4)
_res = _evaluator.evaluate(_evaluators, a_sample_rate=3, test_size=0.1, sample_sizes=[0.1, 0.2])




Subprocess started.
Subprocess started.
Subprocess started.
Load data finished. Number of users: 138 Number of items:Load data finished. Number of users: 769
 138 Number of items: 769
Load data finished. Number of users: 138 Number of items: IAutoRec.769
IAutoRec.

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


IAutoRec.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Epoch: 0000; Epoch: 0000; Epoch: 0000; RMSE:0.6279511917564148; MAE:0.48043437443304227RMSE:0.7490426904424297; MAE:0.597639106427221

Epoch: 0003; Epoch: 0003; RMSE:1.0596445372375718; MAE:0.8519142698653626
Epoch: 0003; RMSE:0.6441611180208411; MAE:0.5336489644810757
Epoch: 0006; RMSE:0.7683488364067795; MAE:0.6125098774920176
Epoch: 0006; RMSE:1.0698111440984688; MAE:0.8572679441441261
Epoch: 0006; RMSE:0.6447929787673521; MAE:0.5093822666381428
RMSE:0.7496614297212677; MAE:0.6015019836808169Epoch: 0009; 
Epoch: 0009; RMSE:1.0468749149363907; MAE:0.8416358013790461
Epoch: 0009; RMSE:0.7432461869459421; MAE:0.5928109409493937
RMSE:0.623343928642509; MAE:0.4831526556766919
RMSE:1.051401537115989; MAE:0.8427042355712123
Epoch: 0012; Epoch: 0012; Epoch: 0012; RMSE:0.7457078584783998; MAE:0.5932258041810026RMSE:0.6057952794716989; MAE:0.47475952195125437

RMSE:1.058455115982041; MAE:0.8484241574515456
Epoch: 0015; Epoch: 0015; Epoch: 0015; RMSE:0.6038688100079133; MAE:0.4812740917644097R

INFO:root:autorec - EvaluationResult(a1=0.6666666666666666, a2=0.0, model_name='autorec', sample_size=0.1, rmse=0.555585579406484, mae=0.4400018479948527, test_size=0.1, calculation_time=datetime.timedelta(seconds=7, microseconds=918807))
INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.0, model_name='autorec', sample_size=0.1, rmse=0.7428518484396256, mae=0.5927721701442586, test_size=0.1, calculation_time=datetime.timedelta(seconds=7, microseconds=990871))
INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.0, model_name='autorec', sample_size=0.1, rmse=1.0503909246604604, mae=0.841989109718028, test_size=0.1, calculation_time=datetime.timedelta(seconds=8, microseconds=61493))


Load data finished. Number of users: 138 Number of items:Load data finished. Number of users:  Load data finished. Number of users:769138 
 138Number of items: Number of items: 769 
769
IAutoRec.
IAutoRec.
IAutoRec.
Epoch: 0000; Epoch: 0000; Epoch: 0000; RMSE:0.9943128222731289; MAE:0.7909451194915077
RMSE:0.6160259138077246; MAE:0.47593742284931584
RMSE:0.7107950626649577; MAE:0.5691250799172728
Epoch: 0003; Epoch: 0003; Epoch: 0003; RMSE:0.9833464836099303; MAE:0.7769439007667894
RMSE:0.6242045966646667; MAE:0.5237621050175691
RMSE:0.7167420498411387; MAE:0.5707396127762934
Epoch: 0006; Epoch: 0006; Epoch: 0006; RMSE:0.5920870526489258; MAE:0.47866769382131386RMSE:0.7016120810990327; MAE:0.5612758993607001

RMSE:0.9797970320678985; MAE:0.7779109243717623
Epoch: 0009; Epoch: 0009; Epoch: 0009; RMSE:0.9795587120983231; MAE:0.7784144668224638
RMSE:0.5904148465585588; MAE:0.4592587870879627RMSE:0.7035381203674568; MAE:0.5633686746720311

Epoch: 0012; Epoch: 0012; Epoch: 0012; RMSE:0.5880

INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.0, model_name='autorec', sample_size=0.2, rmse=0.979949811430048, mae=0.7767172280013274, test_size=0.1, calculation_time=datetime.timedelta(seconds=9, microseconds=495445))
INFO:root:autorec - EvaluationResult(a1=0.6666666666666666, a2=0.0, model_name='autorec', sample_size=0.2, rmse=0.40917252745928623, mae=0.3230879098815483, test_size=0.1, calculation_time=datetime.timedelta(seconds=9, microseconds=524479))
INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.0, model_name='autorec', sample_size=0.2, rmse=0.6883512991606868, mae=0.5482514644521287, test_size=0.1, calculation_time=datetime.timedelta(seconds=9, microseconds=525646))


Load data finished. Number of users: 138 Load data finished. Number of users:Number of items: 138 769 
Number of items: 769
IAutoRec.
IAutoRec.
Epoch: 0000; Epoch: 0000; RMSE:0.5110916970054143; MAE:0.3974452934766438RMSE:0.6721950998204012; MAE:0.5340920529273488

Epoch: 0003; Epoch: 0003; RMSE:0.48499730311641626; MAE:0.3971513947165298
RMSE:0.6969995365204589; MAE:0.5561261567729113
Epoch: 0006; Epoch: 0006; RMSE:0.48775861491816347; MAE:0.390550551506635
RMSE:0.6849618849356659; MAE:0.544571956923205
Epoch: 0009; Epoch: 0009; RMSE:0.43981717645718416; MAE:0.34573896626269096
RMSE:0.660506397998605; MAE:0.5229323585715248
Epoch: 0012; Epoch: 0012; RMSE:0.4415033959770522; MAE:0.34455142756847584
Epoch: 0015; RMSE:0.6530352731115375; MAE:0.5209131187461561
Epoch: 0015; RMSE:0.4365569228857979; MAE:0.3408764614043538
RMSE:0.6576088430740187; MAE:0.5229477298935188
Epoch: 0018; Epoch: 0018; RMSE:0.4312403283534241; MAE:0.34068821771100816
RMSE:0.6641836713680604; MAE:0.5255555656450613

INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.3333333333333333, model_name='autorec', sample_size=0.1, rmse=0.4160322156568325, mae=0.3268937151892108, test_size=0.1, calculation_time=datetime.timedelta(seconds=5, microseconds=62442))
INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.3333333333333333, model_name='autorec', sample_size=0.1, rmse=0.6544113672604024, mae=0.5195694553687229, test_size=0.1, calculation_time=datetime.timedelta(seconds=5, microseconds=86504))


Load data finished. Number of users: 138 Number of items: 769
Load data finished. Number of users: 138 Number of items:IAutoRec. 769

IAutoRec.
Epoch: 0000; Epoch: 0000; RMSE:0.5080410318060211; MAE:0.398889403451979
RMSE:0.6839357436430403; MAE:0.5442180157355319
Epoch: 0003; Epoch: 0003; RMSE:0.47852056057807224; MAE:0.38940606371862435
RMSE:0.6947692869291177; MAE:0.5516185842408894
Epoch: 0006; Epoch: 0006; RMSE:0.4703873947676969; MAE:0.378351535801379
RMSE:0.6771283610570161; MAE:0.5337118597408492
Epoch: 0009; Epoch: 0009; RMSE:0.42919474267228624; MAE:0.343417987302028
RMSE:0.664797198320254; MAE:0.5252451781973733
Epoch: 0012; Epoch: 0012; RMSE:0.4378222146960204; MAE:0.3445811678183042
RMSE:0.6630897099436031; MAE:0.5257677540569682
Epoch: 0015; Epoch: 0015; RMSE:0.43351290327101816; MAE:0.34369004055082364
RMSE:0.6608659840691412; MAE:0.5248362083110544
Epoch: 0018; Epoch: 0018; RMSE:0.4214082396017657; MAE:0.33848209795895956
RMSE:0.6620546164498409; MAE:0.5249285402397625


INFO:root:autorec - EvaluationResult(a1=0.3333333333333333, a2=0.3333333333333333, model_name='autorec', sample_size=0.2, rmse=0.40150197911019875, mae=0.3205325405223746, test_size=0.1, calculation_time=datetime.timedelta(seconds=6, microseconds=233484))
INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.3333333333333333, model_name='autorec', sample_size=0.2, rmse=0.655815159527479, mae=0.519167722990877, test_size=0.1, calculation_time=datetime.timedelta(seconds=6, microseconds=229469))


Load data finished. Number of users: 138 Number of items: 769
IAutoRec.
Epoch: 0000; RMSE:0.6257953449625391; MAE:0.5103070722442192
Epoch: 0003; RMSE:0.4252020606096497; MAE:0.34568569673784416
Epoch: 0006; RMSE:0.4913956498475418; MAE:0.40492498860805637
Epoch: 0009; RMSE:0.41304859880679407; MAE:0.32659869584966944
Epoch: 0012; RMSE:0.3817479368239041; MAE:0.29960493203419186
Epoch: 0015; RMSE:0.38460854917620474; MAE:0.30573839325997443
Epoch: 0018; RMSE:0.37436016969071617; MAE:0.2989958016153442
Epoch: 0021; RMSE:0.37625915612229405; MAE:0.30091546023445936
Epoch: 0024; RMSE:0.3792455547849249; MAE:0.304910552551691
Epoch: 0027; RMSE:0.36904536027739304; MAE:0.2961211430664238
Epoch: 0030; RMSE:0.3588185255364097; MAE:0.28529392866448744
Epoch: 0033; RMSE:0.35505641071721566; MAE:0.28189455977148575
Epoch: 0036; RMSE:0.3546789980019841; MAE:0.28276245115559484
Epoch: 0039; RMSE:0.35604177957422456; MAE:0.28458560301854147
Epoch: 0042; RMSE:0.3564739480243675; MAE:0.28532824821036

INFO:root:autorec - EvaluationResult(a1=0.0, a2=0.6666666666666666, model_name='autorec', sample_size=0.1, rmse=0.35445677593776437, mae=0.2834466851466428, test_size=0.1, calculation_time=datetime.timedelta(seconds=3, microseconds=540719))


Load data finished. Number of users: 138 Number of items: 769
IAutoRec.
Epoch: 0000; RMSE:0.6644169694869734; MAE:0.5472485454141942
Epoch: 0003; 

In [88]:
_res

Unnamed: 0,a1,a2,model_name,sample_size,rmse,mae,test_size,calculation_time
0,0.0,0.0,autorec,0.1,0.999702,0.789347,0.1,0 days 00:00:08.298853
1,0.0,0.333333,autorec,0.1,0.695637,0.556281,0.1,0 days 00:00:08.258170
2,0.0,0.666667,autorec,0.1,0.362268,0.284103,0.1,0 days 00:00:05.017108
3,0.333333,0.0,autorec,0.1,0.708863,0.56268,0.1,0 days 00:00:08.645729
4,0.333333,0.333333,autorec,0.1,0.432496,0.346618,0.1,0 days 00:00:08.229426
5,0.666667,0.0,autorec,0.1,0.54328,0.432169,0.1,0 days 00:00:08.896952
6,0.0,0.0,svd,0.1,1.016565,0.80265,0.1,0 days 00:00:01.797992
7,0.0,0.333333,svd,0.1,0.688121,0.547655,0.1,0 days 00:00:01.031336
8,0.0,0.666667,svd,0.1,0.379268,0.302538,0.1,0 days 00:00:00.834340
9,0.333333,0.0,svd,0.1,0.708962,0.561842,0.1,0 days 00:00:01.854811


In [71]:
from modules import utils
# Reload so that recent edits to `modules.utils` take effect in the running kernel.
# NOTE(review): `importlib` must already have been imported by an earlier cell — confirm.
importlib.reload(utils)

# Convert the raw evaluation results held in `_res` into `final_df`.
# NOTE(review): judging by the helper's name it merges them into one DataFrame — confirm in utils.
final_df = utils.reformat_evaluation_results_to_single_dataframe(_res)

In [None]:
import multiprocessing


def fn(arg):
    """Print ``arg``; a trivial worker task used to smoke-test the process pool.

    Returns:
        None
    """
    print(arg)


# Start the pool only when this code is the main module. Inside the notebook
# __name__ is "__main__", so the cell behaves as before, but worker processes
# (or anything else) that import this code will not recursively create pools.
if __name__ == "__main__":
    with multiprocessing.Pool(4) as p:
        # Pool.map accepts any iterable, so the range is passed directly
        # instead of being copied into a list first.
        p.map(fn, range(10))

## Test

In [None]:
def _run(train_fn):
    """Run one experiment with a freshly drawn noise matrix.

    A standard-normal noise matrix with the same shape as the module-level
    ``sim_matrix`` is sampled, bundled with ``sim_matrix`` and
    ``deepfm_rating_matrix`` into a ``ResponseFunctionParams`` object, and
    handed to ``_experiment``.

    Note: the noise is drawn from NumPy's global random state, so results are
    only reproducible if that state was seeded beforehand.

    Args:
        train_fn: training callable forwarded to ``_experiment`` as ``train_fn``.

    Returns:
        Whatever ``_experiment`` returns for these parameters.
    """
    noise_matrix = npr.normal(0, 1, size=sim_matrix.shape)
    resp_fn_params = ResponseFunctionParams(sim_matrix, deepfm_rating_matrix, noise_matrix)
    return _experiment(resp_fn_params, train_fn=train_fn)



In [None]:
# Run the experiment with `_svd_train` as the training function.
svd_exp_results = _run(_svd_train)



In [None]:
# Run the experiment with `_knn_train` as the training function.
knn_exp_results = _run(_knn_train)

In [None]:
# Collapse every (a1, a2, scores) entry of the KNN run into a single row by
# averaging the values stored under "test_rmse".
_results = [
    (a1, a2, np.mean(scores["test_rmse"]))
    for a1, a2, scores in knn_exp_results
]
knn_results_df = pd.DataFrame(data=_results, columns=["a1", "a2", "rmse"])

In [None]:
# Label every row with its model so the rows stay distinguishable once the
# KNN and SVD frames are concatenated.
knn_results_df["type"] = "knn"

In [None]:
# Display the KNN summary table (a1, a2, rmse, type).
knn_results_df

In [None]:
# Collapse every (a1, a2, scores) entry of the SVD run into a single row by
# averaging the values stored under "test_rmse".
_results = [
    (a1, a2, np.mean(scores["test_rmse"]))
    for a1, a2, scores in svd_exp_results
]
svd_results_df = pd.DataFrame(data=_results, columns=["a1", "a2", "rmse"])

In [None]:
# Label every row with its model so the rows stay distinguishable once the
# KNN and SVD frames are concatenated.
svd_results_df["type"] = "svd"

In [None]:
# Display the SVD summary table (a1, a2, rmse, type).
svd_results_df

In [None]:
# Stack the KNN and SVD summaries into one frame.
# NOTE(review): pd.concat keeps each input's own index by default, so row
# labels repeat across the two models — confirm nothing downstream relies on
# unique labels.
final_df = pd.concat([knn_results_df, svd_results_df])


In [73]:
# Display the combined results; the saved output has the columns
# a1, a2, rmse, mae and model.
final_df

Unnamed: 0,a1,a2,rmse,mae,model
0,0.0,0.0,0.998009,0.796176,autorec
1,0.0,0.2,0.801127,0.639381,autorec
2,0.0,0.4,0.602875,0.479668,autorec
3,0.0,0.6,0.414044,0.329826,autorec
4,0.0,0.8,0.255337,0.19152,autorec
5,0.2,0.0,0.80224,0.63976,autorec
6,0.2,0.2,0.604645,0.482815,autorec
7,0.2,0.4,0.422164,0.336559,autorec
8,0.2,0.6,0.265359,0.212605,autorec
9,0.4,0.0,0.604517,0.482237,autorec


In [46]:
# Materialise the rows of `final_df` as a list of named tuples for inspection.
list(final_df.itertuples())

[Pandas(Index=0, a1=0.0, a2=0.0, rmse=0.9949673907664449, mae=0.7923815580430478, model='autorec'),
 Pandas(Index=1, a1=0.0, a2=0.3333333333333333, rmse=0.6786814480047597, mae=0.5426796832198574, model='autorec'),
 Pandas(Index=2, a1=0.0, a2=0.6666666666666666, rmse=0.35709021144901004, mae=0.28372440514684727, model='autorec'),
 Pandas(Index=3, a1=0.3333333333333333, a2=0.0, rmse=0.6683969664402708, mae=0.5316347939844119, model='autorec'),
 Pandas(Index=4, a1=0.3333333333333333, a2=0.3333333333333333, rmse=0.3610617348845148, mae=0.2882630280771858, model='autorec'),
 Pandas(Index=5, a1=0.6666666666666666, a2=0.0, rmse=0.35574720369904816, mae=0.2771563971677428, model='autorec'),
 Pandas(Index=0, a1=0.0, a2=0.0, rmse=1.0224643345983386, mae=0.814381778732094, model='svd'),
 Pandas(Index=1, a1=0.0, a2=0.3333333333333333, rmse=0.6811989030895484, mae=0.5426162775758177, model='svd'),
 Pandas(Index=2, a1=0.0, a2=0.6666666666666666, rmse=0.34974594938183207, mae=0.28077448841469643, mo

In [65]:
from modules import utils
# Reload so that recent edits to `modules.utils` take effect in the running kernel.
# NOTE(review): `importlib` must already have been imported by an earlier cell — confirm.
importlib.reload(utils)

# NOTE(review): judging by the helper's name and the saved output, this keeps,
# for each (a1, a2) point, the row of the model with the lowest error —
# confirm against the implementation in utils.
error_surface = utils.group_points_by_minimum_error(final_df)
error_surface

Unnamed: 0,a1,a2,rmse,mae,model
0,0.0,0.0,0.998009,0.796176,autorec
1,0.0,0.2,0.801127,0.639381,autorec
2,0.0,0.4,0.602875,0.479668,autorec
3,0.0,0.6,0.412921,0.327766,svd
4,0.0,0.8,0.232028,0.180071,svd
5,0.2,0.0,0.80224,0.63976,autorec
6,0.2,0.2,0.604645,0.482815,autorec
7,0.2,0.4,0.413265,0.330328,svd
8,0.2,0.6,0.225136,0.178228,svd
9,0.4,0.0,0.604517,0.482237,autorec


In [67]:
# plotly is already installed (pinned to 4.14.3) by the notebook's first cell,
# so no additional, unpinned in-cell `pip install` is needed here.
import plotly.express as px

# 3-D scatter of the error surface: one marker per row of `error_surface`,
# positioned at (a1, a2, rmse) and sized by rmse.
fig = px.scatter_3d(
    error_surface,
    x="a1",
    y="a2",
    z="rmse",
    size="rmse",
    size_max=18,
    opacity=1,
    # "model" is a categorical column, so plotly express assigns discrete
    # colours per model; a continuous colour scale would have no effect and is
    # therefore not passed.
    color="model",
)

# Tighten the margins so the plot fills the output area.
fig.update_layout(
    margin=dict(l=20, r=20, t=20, b=20),
)

fig.show()



In [None]:
# Draw a standard-normal noise matrix with the same shape as `sim_matrix`.
__noise_matrix = npr.normal(0, 1, size=sim_matrix.shape)
# Bundle the similarity matrix, the DeepFM rating matrix and the noise into
# the parameter object that `_experiment` consumes.
__resp_fn_params = ResponseFunctionParams(sim_matrix, deepfm_rating_matrix, __noise_matrix)
# Run the experiment without passing `train_fn` explicitly.
# NOTE(review): this relies on `_experiment` accepting a call without `train_fn` — confirm.
_experiment_results = _experiment(__resp_fn_params)