# project ai: Easer

by Michiel Téblick and thibaut Van Goethem

In this notebook we look at the EASE^R ("easer") model proposed in https://dl.acm.org/doi/pdf/10.1145/3308558.3313710.

This model is applied to a dataset from Food.com, which contains recipes together with the ratings and reviews that users left on them.

Preprocessing and fold splitting is done ahead of time.


In [1]:
import math

import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import KFold
import time
import pickle
from scipy import sparse
import statistics as st

## Reading and preprocessing the data

In [2]:
use_less_data = False # set this to true for testing purposes

# The three splits of fold 0 are stacked into one frame; `df` is what the
# later cells use to build the id re-indexing and to size the matrices.
df_train = pd.read_csv('../smallfolds/fold_0/train.csv')
df_test = pd.read_csv('../smallfolds/fold_0/test.csv')
df_validate = pd.read_csv('../smallfolds/fold_0/validate.csv')
df = pd.concat([df_train, df_test, df_validate])

print("amount of interactions in the full dataset: ", df.shape[0])
print("amount of recipes in the full dataset: ", df['recipe_id'].unique().size)
print("amount of users in the full dataset: ", df['user_id'].unique().size)

if use_less_data:
    # Keep only interactions whose `count_item` value is at least 10.
    df = df.loc[df['count_item'] >= 10]
    print("amount of recipes in the smaller dataset: ", df['recipe_id'].unique().size)
    print("amount of users in the smaller dataset: ", df['user_id'].unique().size)
# Display only: the result is not assigned, so `df` keeps the index produced by concat.
df.reset_index()


amount of interactions in the full dataset:  197073
amount of recipes in the full dataset:  22386
amount of users in the full dataset:  13141


Unnamed: 0,index,user_id,recipe_id,date,rating,review,count_user,count_item
0,0,185285,127155,2005-08-11,5,This recipe contained ingredients I knew I lik...,133,4
1,1,522099,424415,2010-05-21,5,I really didn't expect to like this rice as mu...,61,6
2,2,171790,424415,2010-05-22,4,What a wonderful aroma while cooking. Dinner g...,259,6
3,3,537179,58758,2008-11-21,4,Nice recipe! I scaled this down for 2. Was nic...,57,3
4,4,235751,116953,2005-08-28,5,What a great idea and recipe! It would be a re...,17,4
...,...,...,...,...,...,...,...,...
197068,38039,836288,205768,2008-11-01,4,This recipe can it be made . on top of the sto...,8,5
197069,38040,86627,55438,2003-10-29,5,If everyone knew how easy and great tasting th...,17,13
197070,38041,8526,34620,2003-10-26,5,I've made 4 loaves of this already and it is a...,16,5
197071,38042,41468,82303,2006-09-01,5,WOW this was great. What I love the most is th...,9,12


Set all ratings to 1 (even negative interactions are seen as interactions)

In [3]:
# Binarise the feedback: every row's rating is overwritten with 1, whatever
# score was originally given.
df.loc[:,'rating'] = 1


### rescaling the id's
The raw recipe and user ids are not consecutive numbers starting at 0, so using them directly as matrix indices would leave many empty rows and columns. To avoid this we re-index both `user_id` and `recipe_id` to the dense ranges 0..(number of users - 1) and 0..(number of recipes - 1).

This is a step we must not forget when entering the data in the model, as we also need to remap our input data using the same remapping that was used here

In [4]:
# Dense, zero-based re-indexing of the raw ids found in `df`:
# raw id -> position of that id when iterating over the corresponding set.
userSet = set(df['user_id'].to_list())
user_transform_dict = {raw_id: position for position, raw_id in enumerate(userSet)}
recipeSet = set(df['recipe_id'].to_list())
recipe_transform_dict = {raw_id: position for position, raw_id in enumerate(recipeSet)}
# Inverse of recipe_transform_dict: position -> raw recipe id.
recipe_dict = {position: raw_id for position, raw_id in enumerate(recipeSet)}

In [5]:
# Raw ids whose mapped value is null; transform_id leaves those as NaN instead
# of falling back to the raw id. The mapped values come from enumerate(), so
# these lists are expected to be empty.
keep_nan_user = [raw_id for raw_id in user_transform_dict
                 if pd.isnull(user_transform_dict[raw_id])]
keep_nan_recipe = [raw_id for raw_id in recipe_transform_dict
                   if pd.isnull(recipe_transform_dict[raw_id])]


def transform_id(dataframe):
    """Replace raw user and recipe ids by their dense re-indexed values.

    The `user_id` and `recipe_id` columns of `dataframe` are overwritten in
    place using `user_transform_dict` / `recipe_transform_dict`. An id that is
    missing from the dictionary keeps its raw value, unless it is listed in
    `keep_nan_user` / `keep_nan_recipe`, in which case it becomes NaN.

    Returns the same (mutated) dataframe object.
    """
    column_specs = (
        ('user_id', user_transform_dict, keep_nan_user),
        ('recipe_id', recipe_transform_dict, keep_nan_recipe),
    )
    for column, mapping, keep_nan in column_specs:
        raw_ids = dataframe[column]
        # Value used wherever the mapping has no entry: the raw id itself,
        # blanked out for ids that must stay NaN.
        fallback = raw_ids.mask(raw_ids.isin(keep_nan))
        dataframe[column] = raw_ids.map(mapping).fillna(fallback)
    return dataframe

def open_csv(filename, use_less_data=False):
    """Read one split from CSV and prepare it for the models.

    Parameters
    ----------
    filename : path of the CSV file to read.
    use_less_data : when True, only rows with `count_item >= 10` are kept.

    Returns
    -------
    DataFrame with re-indexed `user_id` / `recipe_id` (see `transform_id`),
    every `rating` set to 1 and the `review` column removed.
    """
    frame = pd.read_csv(filename)
    if use_less_data:
        # Take an explicit copy: the steps below assign columns, and doing that
        # on a filtered slice is the chained-assignment pattern pandas warns about.
        frame = frame[frame['count_item'] >= 10].copy()
    frame = transform_id(frame)
    frame.loc[:, 'rating'] = 1
    return frame.drop('review', axis=1)


### Creation of the folds


In [6]:
# Number of folds and, per fold, the (train, validate, test) CSV paths.
k = 10
folds = []
for fold_number in range(k):
    directory = "../smallfolds/fold_%d" % fold_number
    fold_paths = tuple(directory + file_name
                       for file_name in ("/train.csv", "/validate.csv", "/test.csv"))
    folds.append(fold_paths)

## Creation model
Here we define the models used for the experiments. Both the easer predictor and a popularity predictor are created. The popularity predictor is used as a baseline.

In [7]:
def split_test(data_set):
    """Split interactions into model input and held-out ground truth.

    For every `user_id` the last row after sorting by `date` becomes the
    ground truth; all remaining rows are returned as the data to predict from.

    Returns
    -------
    (predict, ground_truth) : two DataFrames with the columns of `data_set`.
    """
    chronological = data_set.sort_values('date')
    ground_truth = chronological.groupby('user_id').tail(1)
    # Appending the held-out rows makes them exact duplicates, so dropping
    # every duplicated row (keep=False) removes them from the input.
    stacked = pd.concat([data_set, ground_truth])
    predict = stacked.drop_duplicates(keep=False)
    return predict, ground_truth

def data_frame_to_matrix(dataframe):
    """Build the sparse user x recipe matrix for `dataframe`.

    Rows are indexed by `user_id`, columns by `recipe_id`, values come from
    `rating`. The shape is taken from the module-level `df` (number of unique
    users x number of unique recipes), not from `dataframe` itself, so every
    split yields a matrix of the same size.
    """
    n_users = len(df.user_id.unique())
    n_recipes = len(df.recipe_id.unique())
    coordinates = (dataframe.user_id, dataframe.recipe_id)
    return sparse.csc_matrix((dataframe.rating, coordinates),
                             shape=(n_users, n_recipes),
                             dtype=float)

In [8]:
class popularity:
    """Non-personalised baseline: recommend the most popular recipes.

    After `train`, `predict` returns recipe ids ordered from most to least
    popular, each recipe appearing exactly once.
    """
    def __init__(self):
        pass
    def train(self, data):
        """Rank recipes by their `count_item` value.

        `data` needs the columns `recipe_id` and `count_item`.
        NOTE(review): `count_item` is assumed to be the number of interactions
        of the recipe, as produced by the preprocessing - confirm.
        """
        # Popularity is a property of the recipe, so the ranking key is
        # count_item (count_user describes how active the reviewer is).
        # mergesort is stable, which keeps ties in their input order.
        ranked = data.sort_values('count_item', ascending=False, kind='mergesort')
        # The frame has one row per interaction; keep one entry per recipe so
        # the top-N list holds N distinct recommendations.
        self.pop = ranked['recipe_id'].drop_duplicates().to_numpy()
    def predict(self):
        """Return the ranked recipe ids computed by `train`."""
        return self.pop

In [9]:
class Easer:
    """Item-item linear model with a closed-form solution.

    `train` computes the item-item weight matrix `B` (zero diagonal) from the
    regularised Gram matrix of the interactions; `predict` scores all items by
    multiplying interaction rows with `B`.
    """
    def __init__(self):
        pass

    def train(self, X_train, lambda_=1250):
        """Fit the weight matrix `self.B`.

        Parameters
        ----------
        X_train : user x item interaction matrix (scipy sparse or dense).
        lambda_ : value added to the diagonal of the Gram matrix.
        """
        #Code here is a modified version of the code provided in the paper

        G = X_train.T.dot(X_train)
        if sparse.issparse(G):
            G = G.toarray()
        # Work in floating point: adding a fractional lambda_ to an integer
        # Gram matrix in place would silently truncate it.
        G = np.asarray(G, dtype=float)
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = scipy.linalg.inv(G)
        del G  # the Gram matrix is items x items; release it before building B
        # Column j of P is divided by -P[j, j]; the diagonal is then forced to
        # zero so an item never contributes to its own score.
        self.B = P / (-np.diag(P))
        self.B[diagIndices] = 0


    def predict(self, xu):
        """Return the score matrix `xu @ B` for the interaction rows `xu`."""
        # `@` is a matrix product for sparse matrices and for dense arrays,
        # whereas `*` is element-wise for the latter.
        return xu @ self.B


In [10]:
# Cut-offs for the two recall figures reported by recal_easer, which reads
# them as module-level globals.
# NOTE(review): both are 10, so the two recall values it reports are identical.
K = 10
K2 = 10

def recal_easer(model, predict_data, test_data, ndcg_k=10):
    """Evaluate `model` with recall@K, recall@K2 and NDCG@ndcg_k.

    Parameters
    ----------
    model : object whose `predict(matrix)` returns a dense user x recipe
        score array.
    predict_data : interactions the model sees as input.
    test_data : held-out interactions to be recovered.
    ndcg_k : cut-off for the NDCG figure (default 10).

    Every metric is the sum over the hits divided by `len(test_data)`.
    `K` and `K2` are read from module level.

    Returns
    -------
    (recall@K, recall@K2, ndcg@ndcg_k) as floats.
    """
    total = len(test_data)

    X_train = data_frame_to_matrix(predict_data)
    y_pred = model.predict(X_train)

    X_test = data_frame_to_matrix(test_data)

    # Never recommend what the user already interacted with. Testing "!= 0"
    # rather than "== 1" also covers pairs that occur more than once, whose
    # values are summed when the sparse matrix is built.
    interacted_recipes = X_train.toarray() != 0
    y_pred[interacted_recipes] = -100000

    # Rank deep enough for the largest cut-off in use.
    depth = max(K, K2, ndcg_k)
    idx_top_scores = (-y_pred).argsort()[:, :depth]

    # hits[u, r] is True when the recipe ranked r for user u is in the test set.
    hits = np.take_along_axis(X_test.toarray(), idx_top_scores, axis=1) != 0

    correct_K = int(hits[:, :K].sum())
    correct_K2 = int(hits[:, :K2].sum())
    ndcg_hits = hits[:, :ndcg_k]
    # A hit at zero-based rank r contributes 1 / log2(r + 2).
    discounts = 1.0 / np.log2(np.arange(ndcg_hits.shape[1]) + 2)
    ndcg = float((ndcg_hits * discounts).sum())

    print("easer recall@%s = %s" % (str(K), str(correct_K / total)))
    print("easer recall@%s = %s" % (str(K2), str(correct_K2 / total)))
    print("easer ndcg@%s = %s" % (ndcg_k, str(ndcg / total)), end="\n\n")

    return correct_K/total, correct_K2/total, ndcg/total

## training models + evaluation


In [11]:
#Please enter the path here of where you will place the pickle files (with trailing /)
data_path="D:/results_aiproject_improvement/"
# Per-fold metrics returned by recal_easer: recall@K, recall@K2 and NDCG.
result_list_K = list()
result_list_K2 = list()
result_ndcg = list()

for f_idx, fold_files in enumerate(folds):
    start = time.time()
    # NOTE(review): the training split is always read with the reduced-data
    # filter switched on, independent of `use_less_data` - confirm intended.
    train_data = open_csv(fold_files[0], True)
    #Here we have the user item matrix
    X_train = data_frame_to_matrix(train_data)

    #train models

    # The popularity baseline is trained here but not stored: the pickle dump
    # below is disabled.
    model_pop=popularity()
    model_pop.train(train_data)
    # modelpopfile = open(data_path+"model_pop_fold" + str(f_idx) + ".pkl", mode='wb')
    # pickle.dump(model_pop, modelpopfile)
    # modelpopfile.close()
    # del model_pop

    # fold_files[1] is the validate split of the fold.
    test_data = open_csv(fold_files[1], use_less_data)

    model = Easer()
    model.train(X_train, lambda_=1250)
    interactions, ground_truth = split_test(test_data)

    recall_K, recall_K2, ndcg = recal_easer(model, interactions, ground_truth)

    result_list_K.append(recall_K)
    result_list_K2.append(recall_K2)
    result_ndcg.append(ndcg)

    print("done fold:",str(f_idx))

    end = time.time()
    # The measured time covers loading, training and evaluating the fold.
    print("training took : ", end - start, "s")

# Labels are derived from what was computed: the number of folds processed and
# the NDCG cut-off (recal_easer scores the top 10 by default).
n_folds = len(result_list_K)
ndcg_cutoff = 10
print("mean recall@%s over %s folds: " % (str(K), n_folds), str(st.mean(result_list_K)))
print("mean recall@%s over %s folds: " % (str(K2), n_folds), str(st.mean(result_list_K2)))
print("mean ndcg@%s over %s folds: " % (str(ndcg_cutoff), n_folds), str(st.mean(result_ndcg)), end="\n\n")
print("standard deviation recall@%s over %s folds: " % (str(K), n_folds), str(st.pstdev(result_list_K)))
print("standard deviation recall@%s over %s folds: " % (str(K2), n_folds), str(st.pstdev(result_list_K2)))
print("standard deviation ndcg@%s over %s folds: " % (str(ndcg_cutoff), n_folds), str(st.pstdev(result_ndcg)))



training took :  159.5821979045868 s
training took :  160.7154998779297 s
training took :  157.91159868240356 s
training took :  154.6998155117035 s
training took :  155.2664613723755 s
training took :  159.19303345680237 s
training took :  157.98179507255554 s
training took :  160.0924220085144 s
training took :  160.4871311187744 s
training took :  154.44989609718323 s


StatisticsError: mean requires at least one data point

## Evaluation results of the folds

Here we evaluate the popularity baseline with recall@K and ndcg@K for K = 5, 10 and 20.


In [None]:
#recall score for popularity
result_list_pop_K5=list()
result_list_pop_K10=list()
result_list_pop_K20=list()
result_list_pop_ndcg5=list()
result_list_pop_ndcg10=list()
result_list_pop_ndcg20=list()
for i in range(k):
    # Train the baseline on the fold's training split right here; no cell
    # writes the model_pop_fold*.pkl files, so they cannot be unpickled.
    model = popularity()
    model.train(open_csv(folds[i][0], True))

    test_data = open_csv(folds[i][2], True)
    predict_data, ground_truth = split_test(test_data)

    # Order-preserving de-duplication, so that a top-N list contains N
    # distinct recipes; only the first 150 are considered as candidates.
    pop = pd.unique(model.predict())
    candidates = pop[:150]

    # Recipes each user interacted with in the input part of the split.
    seen_by_user = {user: recipes.to_numpy()
                    for user, recipes in predict_data.groupby('user_id')['recipe_id']}
    no_history = candidates[:0]

    total = len(ground_truth)
    correct_K5 = 0
    correct_K10 = 0
    correct_K20 = 0
    ndcg5 = 0
    ndcg10 = 0
    ndcg20 = 0
    for user, recipe in zip(ground_truth['user_id'], ground_truth['recipe_id']):
        already_interacted_recipes = seen_by_user.get(user, no_history)
        newpop = candidates[~np.isin(candidates, already_interacted_recipes)]

        # Zero-based rank of the held-out recipe within the top 20, if present.
        positions = np.flatnonzero(newpop[:20] == recipe)
        if positions.size == 0:
            continue
        rank = int(positions[0])
        gain = 1/(math.log2(rank+2))
        # Each cut-off counts the hit only when the rank falls inside it.
        if rank < 5:
            correct_K5 += 1
            ndcg5 += gain
        if rank < 10:
            correct_K10 += 1
            ndcg10 += gain
        if rank < 20:
            correct_K20 += 1
            ndcg20 += gain
    result_list_pop_K5.append(correct_K5 / total)
    result_list_pop_K10.append(correct_K10 / total)
    result_list_pop_K20.append(correct_K20 / total)
    result_list_pop_ndcg5.append(ndcg5 / total)
    result_list_pop_ndcg10.append(ndcg10 / total)
    result_list_pop_ndcg20.append(ndcg20 / total)
    print("popularity fold: %s, recall@5,10,20 = %s,%s,%s" % (str(i), str(correct_K5 / total),str(correct_K10 / total),str(correct_K20 / total)))
    print("popularity fold: %s, ndcg@5,10,20 = %s,%s,%s" % (str(i),str(ndcg5 / total),str(ndcg10 / total),str(ndcg20 / total)), end="\n\n")

print("mean recall@5,10,20 over 10 folds: %s,%s,%s" % (str(st.mean(result_list_pop_K5)),str(st.mean(result_list_pop_K10)),str(st.mean(result_list_pop_K20))))
print("mean ndcg@5,10,20 over 10 folds: %s,%s,%s" % (str(st.mean(result_list_pop_ndcg5)),str(st.mean(result_list_pop_ndcg10)),str(st.mean(result_list_pop_ndcg20))), end="\n\n")
print("standard deviation recall@5,10,20 over 10 folds: %s,%s,%s" %(str(st.pstdev(result_list_pop_K5)), str(st.pstdev(result_list_pop_K10)), str(st.pstdev(result_list_pop_K20))))
print("standard deviation ndcg@5,10,20 over 10 folds: %s,%s,%s" % (str(st.pstdev(result_list_pop_ndcg5)),str(st.pstdev(result_list_pop_ndcg10)),str(st.pstdev(result_list_pop_ndcg20))))

The next section is a demonstration that selects a random user and makes a recommendation prediction for this user.

In [None]:
# import random
# # read recipe data and load pre-trained model
# df_recipes = pd.read_csv('../data/RAW_recipes.csv')
# df_recipes.drop(['minutes', 'contributor_id', 'submitted', 'tags',
#                  'nutrition', 'n_steps', 'steps', 'description', 'n_ingredients'], axis=1, inplace=True)
# data = pickle.load(open(data_path+"data_fold0.pkl", mode='rb'))
# model = pickle.load(open(data_path+"model_fold0.pkl", mode='rb'))
# predict_data = data[0]
# ratings = predict_data.rating
# idx = (predict_data.user_id, predict_data.recipe_id)
# x_train = sparse.csc_matrix((ratings, idx), shape=(len(df.user_id.unique()), len(df.recipe_id.unique())), dtype=float)
#
# # get random user and make prediction
# random_user = x_train.getrow(random.randint(0, len(df.user_id.unique())))
# prediction = model.predict(random_user)[0]
# interacted_recipes = []
# for recipe_id in random_user.indices:
#     interacted_recipes.append(recipe_dict[recipe_id])
#     prediction[recipe_id] = -100000
#
#
# top_index = (-prediction).argsort()[:10]
# recommended_recipes = []
# for recipe_id in top_index:
#     recommended_recipes.append(recipe_dict[recipe_id])
#
# # get interacted recipes and recommended recipes
# user_interactions = df_recipes[df_recipes['id'].isin(interacted_recipes)].drop('id', axis=1)
# user_recommendations = df_recipes[df_recipes['id'].isin(recommended_recipes)].drop('id', axis=1)
#
# display(user_interactions)
# display(user_recommendations)
