first of we start with preprocessing the raw data

In [1]:
import pandas as pd
import numpy as np
import scipy

In [2]:
df = pd.read_csv('../data/RAW_interactions.csv')
df.drop('review', axis=1, inplace=True)
df.drop('date', axis=1, inplace=True)
df.reset_index()
df.drop_duplicates(subset=['user_id', 'recipe_id'])

Unnamed: 0,user_id,recipe_id,rating
0,38094,40893,4
1,1293707,40893,5
2,8937,44394,4
3,126440,85009,5
4,57222,85009,5
...,...,...,...
1132362,116593,72730,0
1132363,583662,386618,5
1132364,157126,78003,5
1132365,53932,78003,4


In [3]:
#Set all ratings to 1 (even negative interactions are seen as interactions)
# df.loc[:,'rating'] = 1

In [4]:
#randomly drop a subset of data as we dont have enough resource to run the entire dataset
unique_recipes = df.recipe_id.unique()
subset = np.random.choice(unique_recipes, size=int(len(unique_recipes)/10), replace=False, p=None)
# Keep only the recipes that were in the randomly sampled df
df = df[df['recipe_id'].isin(subset)]
df.reset_index()

Unnamed: 0,index,user_id,recipe_id,rating
0,2,8937,44394,4
1,8,76535,134728,4
2,9,273745,134728,5
3,10,353911,134728,5
4,11,190375,134728,5
...,...,...,...,...
112047,1132311,1630384,421875,4
112048,1132362,116593,72730,0
112049,1132364,157126,78003,5
112050,1132365,53932,78003,4


In [5]:
#Preprocessing step where we remove all recipes that only have a single review from a person that only has a single review
#This is done as these items will never be connected to other items and thus will never be recommended

df['counts'] = df.groupby(['recipe_id'])['recipe_id'].transform('size')
df['counts_user'] = df.groupby(['user_id'])['user_id'].transform('size')
# 1121916 interaction in df after the and drop
# df = df.drop(df[(df['counts'] == 1) & (df['counts_user'] == 1)].index)

# 884607 interactions after the or drop
df=df.drop(df[(df['counts']==1)|(df['counts_user']==1)].index)
df.drop(['counts', 'counts_user'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

The recipes and users don't go from 0 to amount so if we were to put this in a matrix we would get empty columns and rows. This is not that handy so we reindex both the user_id and recipe_ids

This is a step we must not forget when entering the data in the model, as we also need to remap our input data using the same remapping that was used here

In [6]:
userSet = set(df['user_id'].to_list())
user_transform_dict = dict(map(reversed, enumerate(userSet)))
recipeSet = set(df['recipe_id'].to_list())
recipe_transform_dict = dict(map(reversed, enumerate(recipeSet)))

In [7]:
keep_nan = [k for k, v in user_transform_dict.items() if pd.isnull(v)]
tochange = df['user_id']
df['user_id'] = tochange.map(user_transform_dict).fillna(tochange.mask(tochange.isin(keep_nan)))

keep_nan = [k for k, v in recipe_transform_dict.items() if pd.isnull(v)]
tochange = df['recipe_id']
df['recipe_id'] = tochange.map(recipe_transform_dict).fillna(tochange.mask(tochange.isin(keep_nan)))


Creation of the folds for k-fold validation

In [8]:
unique_users = df.user_id.unique()

unique_users.sort(axis=0)

In [9]:
from sklearn.model_selection import train_test_split, KFold

k = 2
kf = KFold(n_splits=k)
kf.get_n_splits(df)
folds = list()
for train_index, test_index in kf.split(unique_users):
    X_train = df.query('user_id in @train_index')
    X_test = df.query('user_id in @test_index')
    folds.append((X_train, X_test))

Here starts the actual programming of the model

In [10]:
class Easer:
    def __init__(self):
        pass

    def train(self, X_train, lambda_=0.5):
        self.X = X_train

        #X_train_t=X_train.copy().transpose()
        G = X_train.T.dot(X_train)  # sparse
        G=G.toarray()
        #G= X_train.T.dot(X_train).toarray() # dense
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = scipy.linalg.inv(G)  # sparse
        #P = np.linalg.inv(G) # dense
        # dense diagonal retrieval (for some reason i cant get this to work on sparse matrixes something with the dimensions being wrong)
        div = -np.diag(P)
        self.B = P / div
        self.B[diagIndices] = 0

        self.pred = self.X * self.B

    def predicts(self, xu):
        #TODO give back the predicted vector given a user interaction vector
        return xu * self.B


In [11]:
import time
import pickle
from scipy import sparse
for f_idx,i in enumerate(folds):
    start = time.time()
    #for fold in folds: # removed for loop for testing
    train_data = i[0]
    ratings = train_data.rating
    idx = (train_data.user_id, train_data.recipe_id)
    print(len(df.user_id.unique()))
    print(len(df.recipe_id.unique()))
    #Here we have the user item matrix
    X_train = sparse.csc_matrix((ratings, idx), shape=(len(df.user_id.unique()), len(df.recipe_id.unique())), dtype=float)
    #train model
    model = Easer()
    model.train(X_train)
    print("done")
    end = time.time()
    print("training took : ",end - start,"s")
    datafile=open("D:\\results_aiproject\\data_fold"+str(f_idx)+".pkl",mode='wb')
    pickle.dump(i,datafile)
    modelfile=open("D:\\results_aiproject\\model_fold"+str(f_idx)+".pkl",mode='wb')
    pickle.dump(model,modelfile)
    datafile.close()
    modelfile.close()
    break

10153
13427
done
training took :  39.408345222473145 s


In [12]:
def remove_single_interaction(data):
    removed_recipes=dict()
    users=data.user_id.unique()
    for user in users:
        user_data=data.loc[(data['user_id']==user)]
        row=user_data.sample()
        removed_recipes[user]=row['recipe_id'].iat[0]
        data=data.drop(data[(data["recipe_id"] == row["recipe_id"].iat[0]) & (data["user_id"] ==row["user_id"].iat[0])].index)
    return (data,removed_recipes)

In [13]:
data=pickle.load(open("D:\\results_aiproject\\data_fold0.pkl",mode='rb'))
model=pickle.load(open("D:\\results_aiproject\\model_fold0.pkl",mode='rb'))


test_data=data[1]
temp_result=remove_single_interaction(test_data)
# test_data.reset_index(drop=True, inplace=True)
removed_test_data=temp_result[0]
removed_recipes=temp_result[1]
ratings = removed_test_data.rating
idx = (removed_test_data.user_id, removed_test_data.recipe_id)
x_test=sparse.csc_matrix((ratings, idx), shape=(len(test_data.user_id.unique()), len(df.recipe_id.unique())), dtype=float)
y_pred = model.predicts(x_test)
print(y_pred)

[[-4.09486739e-02  0.00000000e+00  2.27742384e-01 ...  0.00000000e+00
   2.86319020e-02  0.00000000e+00]
 [ 8.72115303e-04  0.00000000e+00 -1.23126466e-01 ...  0.00000000e+00
   9.50088586e-03  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 2.91849529e-03  0.00000000e+00 -3.23695011e-01 ...  0.00000000e+00
  -2.27084752e-02  0.00000000e+00]
 [ 2.96987310e-05  0.00000000e+00 -1.58024958e-03 ...  0.00000000e+00
   3.85714210e-04  0.00000000e+00]
 [ 4.53880962e-04  0.00000000e+00  4.46710192e-02 ...  0.00000000e+00
   1.15878473e-02  0.00000000e+00]]


In [15]:
#Evaluate recall@k
#Do elementwise multiplication of top K predicts and true interactions
K = 20
total=0
correct=0
wrong=0
for user in test_data.user_id.unique():
    user_data=removed_test_data.loc[(removed_test_data['user_id']==user)]
    already_interacted_recipes=user_data[user_data.columns[1]].to_numpy()
    predicted=y_pred[user]
    np.put(predicted, already_interacted_recipes,  -5)
    ind = (-predicted).argsort()[:K]
    # ratings = removed_test_data.rating
    # idx = (removed_test_data.user_id, removed_test_data.recipe_id)
    # x_test=sparse.csc_matrix((ratings, idx), shape=(len(test_data.user_id.unique()), len(df.recipe_id.unique())), dtype=float)
    # temp = (X_test == 1).toarray()
    # model.pred[temp] = -100000
    # test = (-model.pred).argsort()[:,:10]
    if(removed_recipes[user] in ind):
        correct+=1
    else:
        wrong+=1
    total+=1
print("recall@%s = %s"%(str(K),str(correct/total)))

recall@20 = 0.034666141422099665
