# project ai: Easer

by Michiel Téblick and thibaut Van Goethem

In this notebook we will test the hypothesis that is easer not good at predicting the long tail
Preprocessing and fold splitting is done ahead of time.


In [1]:
import math

import pandas as pd
import numpy as np
import scipy
from sklearn.model_selection import KFold
import time
import pickle
from scipy import sparse
import statistics as st

## Reading and preprocessing the data

In [2]:
use_less_data = True

df_train = pd.read_csv('../folds/fold_0/train.csv')
df_test = pd.read_csv('../folds/fold_0/test.csv')
df_validate = pd.read_csv('../folds/fold_0/validate.csv')
full_df = pd.concat([df_train, df_test, df_validate])
full_df.reset_index()
full_df.loc[:, 'rating'] = 1
df=full_df

print("amount of interactions in the full dataset: ", len(df))
print("amount of recipes in the full dataset: ", len(df.recipe_id.unique()))
print("amount of users in the full dataset: ", len(df.user_id.unique()))

if use_less_data:
    lesser_cuttoff=15
    df = df[df['count_item'] >= lesser_cuttoff]
    # df_smaller.reset_index()
    print("amount of recipes in the smaller dataset: ", len(df.recipe_id.unique()))
    print("amount of users in the smaller dataset: ", len(df.user_id.unique()))
df.reset_index()


amount of interactions in the full dataset:  733951
amount of recipes in the full dataset:  80511
amount of users in the full dataset:  32635
amount of recipes in the smaller dataset:  9030
amount of users in the smaller dataset:  31075


Unnamed: 0,index,user_id,recipe_id,date,rating,review,count_user,count_item
0,0,56680,79222,2006-11-11,1,"Oh, This was wonderful! Had a soup and salad ...",174,18
1,1,827374,79222,2010-11-29,1,We made this last night and really enjoyed it....,10,18
2,21,89831,33096,2004-03-15,1,Merlot...this is the second time that I made y...,2572,27
3,22,231054,33096,2007-10-25,1,I love this -- and idea behind it. I'm sure y...,266,27
4,23,470894,33096,2010-04-27,1,so simple to put together and very refreshing....,13,27
...,...,...,...,...,...,...,...,...
347051,151892,315805,55438,2011-12-29,1,I had a ham bone leftover from Christmas dinne...,46,17
347052,151900,1423741,39902,2012-07-25,1,I switched out the American Cheese for Sharp C...,7,15
347053,151907,422893,196735,2009-01-02,1,"Yum, Yum, I love potatoes with Rosemary & this...",1130,17
347054,151908,96177,196735,2009-01-12,1,We just loved these tatters. Quick easy and ve...,563,17


Set all ratings to 1 (even negative interactions are seen as interactions)

In [3]:
df.loc[:, 'rating'] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


### rescaling the id's
The recipes and users don't go from 0 to amount so if we were to put this in a matrix we would get empty columns and rows. This is not that handy so we reindex both the user_id and recipe_ids

This is a step we must not forget when entering the data in the model, as we also need to remap our input data using the same remapping that was used here

In [4]:
userSet = set(df['user_id'].to_list())
user_transform_dict = dict(map(reversed, enumerate(userSet)))
recipeSet = set(df['recipe_id'].to_list())
recipe_transform_dict = dict(map(reversed, enumerate(recipeSet)))
recipe_dict = dict(enumerate(recipeSet))

In [5]:
reverse_recipes = {v: k for k, v in recipe_transform_dict.items()}
reverse_users = {v: k for k, v in user_transform_dict.items()}

In [6]:
keep_nan_user = [k for k, v in user_transform_dict.items() if pd.isnull(v)]
keep_nan_recipe = [k for k, v in recipe_transform_dict.items() if pd.isnull(v)]


def transform_id(dataframe):
    userSet = set(dataframe['user_id'].to_list())
    user_transform_dict = dict(map(reversed, enumerate(userSet)))
    tochange = dataframe['user_id']
    dataframe['user_id'] = tochange.map(user_transform_dict).fillna(tochange.mask(tochange.isin(keep_nan_user)))

    tochange = dataframe['recipe_id']
    dataframe['recipe_id'] = tochange.map(recipe_transform_dict).fillna(tochange.mask(tochange.isin(keep_nan_recipe)))
    return dataframe


def open_csv(filename, use_less_data=False):
    df = pd.read_csv(filename)
    if use_less_data:
        df = df[df['count_item'] >= lesser_cuttoff]
    df = transform_id(df)
    df.loc[:, 'rating'] = 1
    df.drop('review', axis=1, inplace=True)
    df['count_user'] = df.groupby(['user_id'])['user_id'].transform('size')
    # df = df.drop(df[(df['count_user'] <= 2)].index)
    df['count_item'] = df.groupby(['recipe_id'])['recipe_id'].transform('size')
    # df = df.drop(df[(df['count_item'] <= 2)].index)
    #df.drop('date', axis=1, inplace=True)
    return df


### Creation of the folds


In [7]:
k = 10
folds = list()
for directory in ["../folds/fold_%d" % i for i in range(k)]:
    folds.append((directory + "/train.csv", directory + "/validate.csv", directory + "/test.csv"))

## Creation model
Here we define the models used for the experiments. Both the easer predictor and a populaliry predictor are created. the popularity predictor is used as a baseline

In [8]:
def split_test(data_set):
    ground_truth = data_set.sort_values('date').groupby('user_id').tail(1)
    predict = pd.concat([data_set, ground_truth]).drop_duplicates(keep=False)
    predict=predict[predict.recipe_id.isin(reverse_recipes)==True]
    predict.reset_index()
    return predict, ground_truth


def data_frame_to_matrix(dataframe):
    ratings = dataframe.rating
    idx = (dataframe.user_id, dataframe.recipe_id)
    return sparse.csc_matrix((ratings, idx), shape=(dataframe.user_id.max()+1, len(df.recipe_id.unique())),
                             dtype=np.float32)

In [9]:
class popularity:
    def __init__(self):
        pass

    def train(self, data):
        data = data.sort_values('count_item', ascending=False)
        self.pop = data[data.columns[1]].unique()

    def predict(self):
        return self.pop

In [10]:
class Easer:
    def __init__(self):
        pass

    def train(self, X_train, lambda_=1250):
        #Code here is a modified version of the code provided in the paper

        G = X_train.T.dot(X_train)
        G = G.toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = scipy.linalg.inv(G)
        del G
        div = -np.diag(P)
        self.B = P / div
        self.B[diagIndices] = 0

    def predict(self, xu):
        return xu * self.B


In [11]:
K = 5
K2 = 10
K3 = 20


def recal_easer(model, predict_data, test_data):
    total = len(test_data)
    print(total)
    X_train = data_frame_to_matrix(predict_data)
    y_pred = model.predict(X_train)
    print(len(test_data[test_data.recipe_id.isin(reverse_recipes)==False]))
    test_data=test_data[test_data.recipe_id.isin(reverse_recipes)==True]
    X_test = data_frame_to_matrix(test_data)

    interacted_recipes = (X_train == 1).toarray()
    y_pred[interacted_recipes] = -100000
    idx_top_scores = (-y_pred).argsort()[:, :100]
    dense_X_test = X_test.toarray()

    correct_K = 0
    correct_K2 = 0
    correct_K3 = 0
    ndcg_K = 0
    ndcg_K2 = 0
    ndcg_K3 = 0

    for idx, row in enumerate(idx_top_scores):
        if(idx>=len(dense_X_test)):continue
        for rank, index in enumerate(row):
            if dense_X_test[idx][index] == 1:
                if rank < K:
                    correct_K += 1
                    ndcg_K += 1/(math.log2(rank+2))
                if rank < K2:
                    correct_K2 += 1
                    ndcg_K2 += 1/(math.log2(rank+2))
                if rank < K3:
                    correct_K3 += 1
                    ndcg_K3 += 1/(math.log2(rank+2))


    print("easer recall@5,10,20 = %s,%s,%s" % (str(correct_K / total),str(correct_K2 / total),str(correct_K3 / total)))
    print("easer ndcg@5,10,20 = %s,%s,%s" % (str(ndcg_K / total),str(ndcg_K2 / total),str(ndcg_K3 / total)), end="\n\n")

    return correct_K/total, correct_K2/total,correct_K3/total, ndcg_K/total,ndcg_K2/total,ndcg_K3/total

## training models + evaluation


In [12]:
#Please enter the path here of where you will place the pickle files (with trailing /)
data_path = "D:/results_aiproject_improvement/"
result_list_K = list()
result_list_K2 = list()
result_ndcg = list()

for f_idx, fold_files in enumerate(folds):
    start = time.time()
    train_data = open_csv(fold_files[0], True)
    print(len(train_data))
    #Here we have the user item matrix
    X_train = data_frame_to_matrix(train_data)

    #train models

    test_data = open_csv(fold_files[1], False)

    model = Easer()
    model.train(X_train, lambda_=1250)
    interactions, ground_truth = split_test(test_data)

    recal_easer(model, interactions, ground_truth)

    # result_list_K.append(recall20)
    # result_list_K2.append(recall50)
    # result_ndcg.append(ndcg)
    #
    # print("done fold:",str(f_idx))
    #
    # print("easer fold: %s, recall@%s = %s" % (str(f_idx), str(K), recall20))
    # print("easer fold: %s, recall@%s = %s" % (str(f_idx), str(K2), recall50))
    # print("easer fold: %s, ndcg@%s = %s" % (str(f_idx), 100, ndcg), end="\n\n")

    end = time.time()
    print("training took : ", end - start, "s")
    break

print("mean recall@%s over 10 folds: " % str(K), str(st.mean(result_list_K)))
print("mean recall@%s over 10 folds: " % str(K2), str(st.mean(result_list_K2)))
print("mean ndcg@%s over 10 folds: " % str(100), str(st.mean(result_ndcg)), end="\n\n")
print("standard deviation recall@%s over 10 folds: " % str(K), str(st.pstdev(result_list_K)))
print("standard deviation recall@%s over 10 folds: " % str(K2), str(st.pstdev(result_list_K2)))
print("standard deviation ndcg@%s over 10 folds: " % str(100), str(st.pstdev(result_ndcg)))


242967
6528
2731
easer recall@5,10,20 = 0.013786764705882353,0.020833333333333332,0.03262867647058824
easer ndcg@5,10,20 = 0.009151513766222055,0.011409928793355017,0.014396394451212014

training took :  20.78832244873047 s


StatisticsError: mean requires at least one data point

In [None]:
test_data = open_csv(folds[0][1], False)
interactions, ground_truth = split_test(test_data)
recal_easer(model, interactions, ground_truth)


In [None]:
print(recipe_transform_dict[67256])
for i in recipe_transform_dict:
    if (recipe_transform_dict[i] in [10785, 35451, 15749, 13017, 28357, 22286, 8752, 10150, 27639, 32783]):
        print(i)
