In [37]:
import pandas as pd 
import scipy
import numpy as np
from fastFM import als
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [38]:
ratingsTrainDf = pd.read_csv('data/interactions_train.csv')
recipes_RAW = pd.read_csv('data/RAW_recipes.csv').rename(columns={'id': 'recipe_id'})
recipes_RAW.loc[144074, 'minutes'] = 25
datasetDf = pd.merge(ratingsTrainDf, recipes_RAW, how='left', on='recipe_id')

In [39]:
unique_recipe_contributors = recipes_RAW['contributor_id'].value_counts()
avg_rating_and_unique_recipes = pd.DataFrame(unique_recipe_contributors).merge(right=datasetDf.groupby(['contributor_id'])['rating'].agg(['mean']),right_index=True,left_index=True)

top_500_reviews = datasetDf.groupby('contributor_id')['rating'].agg(['count','mean']).sort_values(by='count')
top_500_reviews = top_500_reviews[top_500_reviews['count'] > 500].index
datasetDf['top_500_reviews'] = datasetDf['user_id'].apply(lambda x: x in top_500_reviews)
top_50_recipes = avg_rating_and_unique_recipes[avg_rating_and_unique_recipes['count'] > 50].index 
datasetDf['top_50_recipes'] = datasetDf['user_id'].apply(lambda x: x in top_50_recipes)

datasetDf = datasetDf.astype({'date':'datetime64[us]','submitted':'datetime64[us]'})

datasetDf['days_difference'] = (datasetDf.date - datasetDf.submitted).dt.days


In [40]:
datasetDf = datasetDf.sample(frac=1)

In [41]:
trainRatio = 0.8
trainSplit = int(trainRatio * len(datasetDf))
n_iter = 50

In [42]:
# Baseline Factorization Machine

In [43]:
dataTrain = datasetDf.to_dict('records')

In [44]:
userIDs,itemIDs = {},{}

for d in dataTrain:
    u,i = d['user_id'],d['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)

nUsers, nItems = len(userIDs),len(itemIDs)

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item

y = np.array([float(d['rating']) for d in dataTrain])


In [45]:
X_train,y_train = X[:trainSplit],y[:trainSplit]
X_test,y_test = X[trainSplit:],y[trainSplit:]

In [46]:
fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [47]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [48]:
MSE(y_pred, y_test)

np.float64(1.4101955558506474)

In [49]:
### Added minutes

In [50]:
def normalize(field):
    array = []
    for d in dataTrain:
        array.append(d[field])
    array = (np.min(array) - array) / (np.min(array) - np.max(array))
    return array

In [51]:
minutes = normalize('minutes')

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 1))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]

y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:trainSplit],y[:trainSplit]
X_test,y_test = X[trainSplit:],y[trainSplit:]

In [52]:
fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [53]:
MSE(y_pred, y_test)

np.float64(1.4232689990699412)

In [54]:
### Add more numeric columns

In [55]:
steps = normalize('n_steps')
ingredients = normalize('n_ingredients')

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 3))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]
    X[i,nUsers + nItems + 1] = steps[i]
    X[i,nUsers + nItems + 2] = ingredients[i]
y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:trainSplit],y[:trainSplit]
X_test,y_test = X[trainSplit:],y[trainSplit:]

In [56]:
fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [57]:
MSE(y_pred, y_test)

np.float64(1.4176539039914906)

In [58]:
### Add more numeric columns

In [59]:
diff = normalize('days_difference')

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 2))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]
    X[i,nUsers + nItems + 1] = diff[i]
y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:trainSplit],y[:trainSplit]
X_test,y_test = X[trainSplit:],y[trainSplit:]

In [60]:
fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [61]:
MSE(y_pred, y_test)

np.float64(1.4174419772616815)

In [62]:
### Add binary columns

In [63]:
X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 4))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]
    X[i,nUsers + nItems + 1] = diff[i]
    X[i,nUsers + nItems + 2] = 1 if dataTrain[i]['top_500_reviews'] else 0
    X[i,nUsers + nItems + 3] = 1 if dataTrain[i]['top_50_recipes'] else 0
    
y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:trainSplit],y[:trainSplit]
X_test,y_test = X[trainSplit:],y[trainSplit:]

In [64]:
fm = als.FMRegression(n_iter=n_iter, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [65]:
MSE(y_pred, y_test)

np.float64(1.4213771146999603)

In [None]:
y_pred = [5] * len(y_test)
MSE(y_pred, y_test)

np.float64(1.0900408496147544)

In [74]:
unique_values, counts = np.unique(y_test, return_counts=True)
print(unique_values)
print(counts)


[0. 1. 2. 3. 4. 5.]
[  3342    653   1375   5155  25374 103882]
