In [8]:
import pandas as pd 
import scipy
import numpy as np
from fastFM import als
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [103]:
ratingsTrainDf = pd.read_csv('data/interactions_train.csv')
recipes_RAW = pd.read_csv('data/RAW_recipes.csv').rename(columns={'id': 'recipe_id'})
recipes_RAW.loc[144074, 'minutes'] = 25
datasetDf = pd.merge(ratingsTrainDf, recipes_RAW, how='left', on='recipe_id')
datasetDf = datasetDf[['user_id', 'recipe_id', 'rating', 'minutes', 'contributor_id', 'n_steps', 'n_ingredients']]

In [6]:
# Baseline Factorization Machine

In [None]:
dataTrain = [row for _, row in datasetDf.iterrows()]

In [35]:
userIDs,itemIDs = {},{}

for d in dataTrain:
    u,i = d['user_id'],d['recipe_id']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)

nUsers, nItems = len(userIDs),len(itemIDs)

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item

y = np.array([float(d['rating']) for d in dataTrain])


In [36]:
X_train,y_train = X[:400000],y[:400000]
X_test,y_test = X[400000:],y[400000:]

In [37]:
fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [13]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [38]:
MSE(y_pred, y_test)

np.float64(1.427173447955908)

In [None]:
### Added minutes

In [104]:
def normalize(field):
    array = []
    for d in dataTrain:
        array.append(d[field])
    array = (np.min(array) - array) / (np.min(array) - np.max(array))
    return array

In [105]:
minutes = normalize('minutes')

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 1))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]

y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:400000],y[:400000]
X_test,y_test = X[400000:],y[400000:]

In [106]:
fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [107]:
MSE(y_pred, y_test)

np.float64(1.4062152193704807)

In [None]:
### Contributor ID Added

In [39]:
contributorIDs = {}

for d in dataTrain:
    c = d['contributor_id']
    if not c in contributorIDs: contributorIDs[c] = len(contributorIDs)

nContributors = len(contributorIDs)
X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + nContributors))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    contributor = contributorIDs[dataTrain[i]['contributor_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems + contributor] = 1

y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:400000],y[:400000]
X_test,y_test = X[400000:],y[400000:]


In [40]:
fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [41]:
MSE(y_pred, y_test)

np.float64(1.4606811899541692)

In [None]:
### Add more numeric columns

In [109]:
steps = normalize('n_steps')
ingredients = normalize('n_ingredients')

X = scipy.sparse.lil_matrix((len(dataTrain), nUsers + nItems + 3))

for i in range(len(dataTrain)):
    user = userIDs[dataTrain[i]['user_id']]
    item = itemIDs[dataTrain[i]['recipe_id']]
    X[i,user] = 1 # One-hot encoding of user
    X[i,nUsers + item] = 1 # One-hot encoding of item
    X[i,nUsers + nItems] = minutes[i]
    X[i,nUsers + nItems] = steps[i]
    X[i,nUsers + nItems] = ingredients[i]
y = np.array([float(d['rating']) for d in dataTrain])

X_train,y_train = X[:400000],y[:400000]
X_test,y_test = X[400000:],y[400000:]

In [110]:
fm = als.FMRegression(n_iter=100, init_stdev=0.1, rank=5, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)
y_pred = fm.predict(X_test)

In [111]:
MSE(y_pred, y_test)

np.float64(1.4210398793240218)