## Rating

In [27]:
import gzip
import random
import scipy
from collections import defaultdict
import tensorflow as tf
import csv
from implicit import bpr

ModuleNotFoundError: No module named 'implicit'

In [4]:
def parseData(fname):
    """Yield one evaluated record per line of a gzip-compressed file.

    The file is opened in binary mode and each raw line is passed to
    ``eval``. The handle is closed as soon as the generator is exhausted
    or closed, instead of being left for the garbage collector.

    SECURITY: ``eval`` executes arbitrary expressions found in the file,
    so this must only be used on trusted data.
    """
    with gzip.open(fname) as f:
        for l in f:
            d = eval(l)
            yield d

In [7]:
def readGz(path):
    """Yield one evaluated record per text line of a gzip-compressed file.

    The file is opened in text mode (``'rt'``) and closed when the
    generator is exhausted or closed.

    SECURITY: every line goes through ``eval``; only read trusted files.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            yield eval(l)

def readCSV(path):
    """Yield ``(user_id, recipe_id, row)`` for each data row of a gzipped CSV.

    The first row is used as the header; ``row`` is a dict mapping each
    header name to the (string) value in that row. A completely empty file
    yields nothing. The file handle is closed when the generator is
    exhausted or closed.

    Raises KeyError if the header has no ``user_id`` / ``recipe_id`` column.
    """
    with gzip.open(path, 'rt') as f:
        c = csv.reader(f)
        # A bare next(c) on an empty file would raise StopIteration inside
        # the generator, which Python turns into a RuntimeError.
        header = next(c, None)
        if header is None:
            return
        for l in c:
            d = dict(zip(header,l))
            yield d['user_id'],d['recipe_id'],d

## Bayesian Personalized Ranking

In [11]:
# Collect every row dict from the training CSV into `data`.
data = []
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    u, i = d['user_id'], d['recipe_id']
    # int() raises if a rating is not an integer string, so a malformed
    # row stops the load before it is stored.
    r = int(d['rating'])
    data.append(d)


In [18]:
# Keep the first half of the loaded rows, then shuffle that half in place.
# NOTE(review): `data` is overwritten, so re-running this cell halves it again.
nTrain = len(data) // 2
data = data[:nTrain]
random.shuffle(data)

In [22]:
# Show the first (shuffled) row to check the row schema.
data[0]

{'user_id': '87356042',
 'recipe_id': '53584929',
 'date': '2003-02-01',
 'rating': '5'}

In [13]:
# Assign each distinct user / recipe a dense integer index in order of
# first appearance in `data`.
userIDs, itemIDs = {}, {}

for d in data:
    # setdefault only stores the index when the key is new; the index is
    # the current dictionary size.
    userIDs.setdefault(d['user_id'], len(userIDs))
    itemIDs.setdefault(d['recipe_id'], len(itemIDs))

nUsers = len(userIDs)
nItems = len(itemIDs)

In [20]:
# Display the number of distinct users and recipes that were indexed.
nUsers,nItems

(13533, 151462)

In [23]:
# Binary item-by-user interaction matrix: entry (item, user) is 1 when the
# pair occurs in `data`. LIL format is used for the incremental fill.
Xiu = scipy.sparse.lil_matrix((nItems, nUsers))
for d in data:
    item_row = itemIDs[d['recipe_id']]
    user_col = userIDs[d['user_id']]
    Xiu[item_row, user_col] = 1

# User-by-item view of the same interactions, stored as CSR.
Xui = scipy.sparse.csr_matrix(Xiu.T)

In [25]:
# Fit a Bayesian Personalized Ranking model with 5 factors on the
# item-by-user matrix.
# NOTE(review): `bpr` comes from the `implicit` package, whose import failed
# in the recorded run, so this cell cannot execute until it is installed.
# NOTE(review): confirm which matrix orientation the installed `implicit`
# version expects for fit().
model = bpr.BayesianPersonalizedRanking(factors = 5)
model.fit(Xiu)

NameError: name 'bpr' is not defined

In [4]:
# Re-read the whole training file: rebuild the user / recipe index maps
# from every row and keep the interactions as (user, recipe, rating) tuples.
userIDs, itemIDs, interactions = {}, {}, []

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    u, i, r = d['user_id'], d['recipe_id'], int(d['rating'])
    # Indices are handed out in order of first appearance in the file.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u,i,r))
    

In [32]:
# Train on the first half of the interactions.
nTrain = int(len(interactions) * 0.5)
interactionsTrain = interactions[:nTrain]

# Hold out the final 10% for evaluation. The start index is derived from
# the data length (450000 for a 500,000-row file) instead of being
# hard-coded, so the split still works if the file size changes.
testStart = len(interactions) * 9 // 10
interactionsTest = interactions[testStart:]

# nTest describes the evaluation set that is actually used.
nTest = len(interactionsTest)

In [33]:
# Display the size of the held-out evaluation set.
len(interactionsTest)

50000

In [34]:
# Index the training interactions both ways: the recipes each user
# interacted with and the users who interacted with each recipe.
# Values are lists, so repeated pairs are kept as repeated entries.
itemsPerUser, usersPerItem = defaultdict(list), defaultdict(list)
users_train = []
for u,i,r in interactionsTrain:
    usersPerItem[i].append(u)
    itemsPerUser[u].append(i)

### Similarity

In [35]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two collections.

    Accepts any iterables of hashable elements (sets, lists, tuples, ...);
    duplicates are ignored because both inputs are converted to sets first.
    Returns 0 when both inputs are empty.
    """
    s1, s2 = set(s1), set(s2)
    denom = len(s1 | s2)
    if denom == 0:
        return 0
    return len(s1 & s2) / denom

In [36]:
# Fallback prediction: the mean rating over the training interactions.
# The sum and the count now come from the same (training) set; previously
# held-out ratings were summed and divided by the training-set size.
ratingMean = sum(r for _,_,r in interactionsTrain) / len(interactionsTrain)

In [None]:
def predictRating(user,item):
    """Predict `user`'s rating of `item` from the user's other rated items.

    Each other item contributes its mean-centred rating
    (``ratingDict[(user, i2)] - itemAverages[i2]``) weighted by the Jaccard
    similarity between the user lists of `item` and of that other item.
    The weighted average is added to ``itemAverages[item]``. When the
    similarities do not sum to a positive value, ``ratingMean`` is returned.

    NOTE(review): `ratingDict` and `itemAverages` are read from notebook
    scope but are not defined in any visible cell -- confirm where they
    are meant to come from before running this.
    """
    others = [i2 for i2 in itemsPerUser[user] if i2 != item]
    deviations = [ratingDict[(user,i2)] - itemAverages[i2] for i2 in others]
    sims = [Jaccard(usersPerItem[item],usersPerItem[i2]) for i2 in others]

    totalSim = sum(sims)
    if not totalSim > 0:
        # User hasn't rated any similar items
        return ratingMean

    weighted = [dev * sim for dev, sim in zip(deviations, sims)]
    return itemAverages[item] + sum(weighted) / totalSim

## Latent Factor Model

In [37]:
# Global mean rating of the training interactions; used to initialise the
# model offset.
mu = sum(int(r) for _,_,r in interactionsTrain) / len(interactionsTrain)

# Notebook-level Adam optimizer (learning rate 0.1) read by trainingStep.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

In [38]:
class LatentFactorModel(tf.keras.Model):
    """Rating model: alpha + betaU[u] + betaI[i] + gammaU[u] . gammaI[i].

    The per-user and per-item parameter tensors are sized from the
    notebook-level `userIDs` and `itemIDs` dictionaries when the model is
    constructed, and are addressed by the integer indices stored there.
    """

    def __init__(self, mu, K, lamb):
        """Create the parameters.

        mu   -- initial value of the global offset `alpha`
        K    -- number of latent dimensions per user / item
        lamb -- multiplier applied to the penalty returned by `reg`
        """
        super().__init__()
        nUserRows, nItemRows = len(userIDs), len(itemIDs)

        def smallRandom(shape):
            # Parameters start as tiny Gaussian noise (stddev 0.001).
            return tf.Variable(tf.random.normal(shape, stddev=0.001))

        # Offset starts at the supplied average.
        self.alpha = tf.Variable(mu)
        # Biases, then latent factors (creation order kept: U before I).
        self.betaU = smallRandom([nUserRows])
        self.betaI = smallRandom([nItemRows])
        self.gammaU = smallRandom([nUserRows, K])
        self.gammaI = smallRandom([nItemRows, K])
        self.lamb = lamb

    def predict(self, u, i):
        """Return the prediction for one (user index, item index) pair."""
        interaction = tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return self.alpha + self.betaU[u] + self.betaI[i] + interaction

    def reg(self):
        """Return `lamb` times the sum of squared biases and latent factors.

        `alpha` is not part of the penalty.
        """
        penalty = tf.reduce_sum(self.betaU**2)
        penalty = penalty + tf.reduce_sum(self.betaI**2)
        penalty = penalty + tf.reduce_sum(self.gammaU**2)
        penalty = penalty + tf.reduce_sum(self.gammaI**2)
        return self.lamb * penalty

    def predictSample(self, sampleU, sampleI):
        """Return predictions for parallel sequences of user / item indices."""
        userIdx = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        itemIdx = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        userBias = tf.nn.embedding_lookup(self.betaU, userIdx)
        itemBias = tf.nn.embedding_lookup(self.betaI, itemIdx)
        userFactors = tf.nn.embedding_lookup(self.gammaU, userIdx)
        itemFactors = tf.nn.embedding_lookup(self.gammaI, itemIdx)
        # Row-wise dot product of the looked-up factor vectors.
        dots = tf.reduce_sum(userFactors * itemFactors, 1)
        return self.alpha + userBias + itemBias + dots

    def call(self, sampleU, sampleI, sampleR):
        """Return the loss for a sample: l2_loss(pred - r) / len(sampleR).

        `tf.nn.l2_loss` already halves the sum of squares, so this is half
        the mean squared error of the sample.
        """
        residual = self.predictSample(sampleU, sampleI) - \
            tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(residual) / len(sampleR)

In [10]:
# Baseline model: offset initialised to mu, 5 latent dimensions,
# regularisation multiplier 1.
modelLFM = LatentFactorModel(mu, 5, 1)

In [39]:
def trainingStep(model, interactions, Nsamples=40000):
    """Run one optimisation step on a random sample and return the loss.

    model        -- object callable as ``model(users, items, ratings)`` that
                    also exposes ``reg()`` and ``trainable_variables``
    interactions -- sequence of (user, item, rating) tuples; users and items
                    are mapped to indices through the notebook-level
                    `userIDs` / `itemIDs` dictionaries
    Nsamples     -- number of interactions drawn (with replacement) for this
                    step; defaults to the previously hard-coded 40000

    The update is applied with the notebook-level `optimizer`. Returns the
    sample loss plus the regulariser as a NumPy value.
    """
    # Sampling is plain Python and needs no gradients, so it happens
    # before the tape starts recording.
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u,i,r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()

    gradients = tape.gradient(loss, model.trainable_variables)
    # Variables that did not take part in the loss get a None gradient;
    # they are skipped rather than passed to the optimizer.
    updates = [(grad, var)
               for grad, var in zip(gradients, model.trainable_variables)
               if grad is not None]
    optimizer.apply_gradients(updates)
    return loss.numpy()

In [13]:
# Run 10 optimisation steps on the baseline model; `obj` keeps the loss
# returned by the last step.
for i in range(10):
    obj = trainingStep(modelLFM, interactionsTrain)
    

### Q9

In [11]:
# Predict a rating for every held-out interaction with the trained model.
predictions = []
for u, i, r in interactionsTest:
    predictions.append(modelLFM.predict(userIDs[u], itemIDs[i]).numpy())

NameError: name 'modelLFM' is not defined

In [12]:
# True ratings of the held-out interactions, in the same order as the
# predictions.
labels = [d[2] for d in interactionsTest]

In [45]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Both arguments may be any iterables of numbers; they are paired up
    positionally.

    Raises ValueError when the two inputs differ in length (pairing them
    with zip would otherwise silently ignore the surplus values), and
    ZeroDivisionError when both are empty.
    """
    predictions, labels = list(predictions), list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "MSE needs equally long inputs, got %d predictions and %d labels"
            % (len(predictions), len(labels)))
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [93]:
# Display the mean squared error of the current predictions.
MSE(predictions, labels)

0.9108149116230921

### Q10

In [35]:
# Index maps restricted to the first 90% of rows in the training file.
# Indices are assigned in order of first appearance.
nTrain = int(len(interactions) * 0.9)
userIDs_train, recipeIDs_train = {}, {}

cnt = 0
for cnt, (user, recipe, d) in enumerate(readCSV("trainInteractions.csv.gz"), start=1):
    # Every row's rating is still parsed, so a malformed rating anywhere in
    # the file raises.
    r = int(d['rating'])

    if cnt > nTrain:
        continue
    userIDs_train.setdefault(user, len(userIDs_train))
    recipeIDs_train.setdefault(recipe, len(recipeIDs_train))

In [40]:
# Find the users / recipes whose bias terms are largest and smallest.
#
# The model's betaU / betaI rows are addressed through the `userIDs` and
# `itemIDs` dictionaries (that is what trainingStep and the prediction
# cells index with), so those maps are inverted for the reverse lookup.
# A direct lookup replaces the linear scan that compared every index with
# a tensor, and it cannot leave any of the four results unassigned.
userByIndex = {idx: uid for uid, idx in userIDs.items()}
recipeByIndex = {idx: rid for rid, idx in itemIDs.items()}

# int() turns the scalar argmax / argmin tensors into plain dictionary keys.
temp_max_id = int(tf.argmax(modelLFM.betaU))
temp_min_id = int(tf.argmin(modelLFM.betaU))
max_user_id = userByIndex[temp_max_id]
min_user_id = userByIndex[temp_min_id]

temp_max_id = int(tf.argmax(modelLFM.betaI))
temp_min_id = int(tf.argmin(modelLFM.betaI))
max_recipe_id = recipeByIndex[temp_max_id]
min_recipe_id = recipeByIndex[temp_min_id]



In [39]:
# Report the user and recipe IDs found for the largest and smallest biases.
print("The user ID and recipe ID with the largest value of beta is",max_user_id, "and",max_recipe_id)
print("The user ID and recipe ID with the smallest value of beta is",min_user_id, "and",min_recipe_id)

The user ID and recipe ID with the largest value of beta is 82042865 and 20543947
The user ID and recipe ID with the smallest value of beta is 60620344 and 85105824


### Q11

In [40]:
# Rating averages used as fallbacks: overall, per user and per recipe.
# NOTE(review): these are computed over every row of the file, which
# includes the rows later used for evaluation.
allRatings = []
userRatings, itemRatings = defaultdict(list), defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)
    itemRatings[recipe].append(r)

globalAverage = sum(allRatings) / len(allRatings)

userAverage = {u: sum(rs) / len(rs) for u, rs in userRatings.items()}
itemAverage = {i: sum(rs) / len(rs) for i, rs in itemRatings.items()}

In [41]:
import numpy as np
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Both arguments may be any iterables of numbers; they are paired up
    positionally.

    Raises ValueError when the two inputs differ in length (pairing them
    with zip would otherwise silently ignore the surplus values), and
    ZeroDivisionError when both are empty.
    """
    predictions, labels = list(predictions), list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "MSE needs equally long inputs, got %d predictions and %d labels"
            % (len(predictions), len(labels)))
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [26]:
# Train a 5-factor model with a small regulariser and score it on the
# held-out interactions.
lmbd = 0.0001

# Start from a fresh optimizer: trainingStep reads the notebook-level
# `optimizer`, and Adam keeps per-variable state from whatever model it
# updated before.
# NOTE(review): recent Keras optimizers also refuse variables they were not
# built with, so reusing one across models can fail outright.
optimizer = tf.keras.optimizers.Adam(0.1)
modelLFM = LatentFactorModel(mu, 5, lmbd)
for i in range(50):
    obj = trainingStep(modelLFM, interactionsTrain)

predictions = []
labels = []
for u, i, r in interactionsTest:
    labels.append(r)

    # Use the model when both IDs are indexed; otherwise fall back to the
    # user's average, then the recipe's average, then the global average.
    if u in userIDs and i in itemIDs:
        pred = modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
    elif u in userIDs:
        pred = userAverage[u]
    elif i in itemIDs:
        pred = itemAverage[i]
    else:
        pred = globalAverage

    predictions.append(pred)

mse = MSE(predictions, labels)

In [None]:
# Grid search over the regularisation strength and the number of latent
# factors. Every (lambda, factors, mse) triple is appended to `results`.
# When the search ends, `lmbd`, `factor`, `mse` and `modelLFM` are rebound
# to the configuration with the lowest MSE, so that the cells that report
# the chosen lambda and write the predictions use the selected model
# rather than whichever one happened to be trained last.
results = []
bestResult, bestModel = None, None

for lmbd in np.arange(0.00001,0.0001,0.00001):
    for factor in np.arange(1,10,1):
        # Fresh optimizer per model: trainingStep reads the notebook-level
        # `optimizer`, and Adam state must not carry over between models.
        optimizer = tf.keras.optimizers.Adam(0.1)
        # Plain Python numbers are handed to the model instead of NumPy
        # scalars.
        modelLFM = LatentFactorModel(mu, int(factor), float(lmbd))
        for i in range(10):
            obj = trainingStep(modelLFM, interactionsTrain)

        predictions = []
        labels = []
        for u, i, r in interactionsTest:
            labels.append(r)

            # Model prediction when both IDs are indexed; otherwise user
            # average, recipe average, then global average.
            if u in userIDs and i in itemIDs:
                pred = modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
            elif u in userIDs:
                pred = userAverage[u]
            elif i in itemIDs:
                pred = itemAverage[i]
            else:
                pred = globalAverage

            predictions.append(pred)

        mse = MSE(predictions, labels)
        res = (lmbd,factor,mse)
        results.append(res)

        if bestResult is None or mse < bestResult[2]:
            bestResult, bestModel = res, modelLFM

# Expose the winning configuration under the names later cells read.
if bestResult is not None:
    lmbd, factor, mse = bestResult
    modelLFM = bestModel
        
        

In [28]:
# Report the values currently held by the notebook-level `lmbd` and `mse`.
print("The lambda I choose is: "+ str(lmbd))
print("The  mse of the model is: "+str(mse))

The lambda I choose is: 1e-05
The  mse of the model is: 0.8781421428756998


In [29]:
# Write one predicted rating per "user-recipe" pair listed in the stub file.
# Both files are managed by `with`, so they are closed even if a row fails
# to parse or a prediction raises. The stub is opened first, so a missing
# stub no longer leaves behind a truncated predictions file.
with open("stub_Rated.txt") as stub, open("predictions_Rated.txt", 'w') as predictions:
    for l in stub:
        if l.startswith("user_id"):
            # Header line is copied through unchanged.
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        # Model prediction when both IDs are indexed; otherwise user
        # average, recipe average, then global average.
        if u in userIDs and i in itemIDs:
            pred = modelLFM.predict(userIDs[u], itemIDs[i]).numpy()
        elif u in userIDs:
            pred = userAverage[u]
        elif i in itemIDs:
            pred = itemAverage[i]
        else:
            pred = globalAverage
        predictions.write(u + '-' + i + ',' + str(pred) + '\n')

In [19]:
# Print the Kaggle user name associated with this notebook.
print("My Kaggle user name is: Ashley9988")

My Kaggle user name is: Ashley9988
