# CSE 258, Fall 2021: Homework 3

## Tasks (Cook/Make prediction)

In [32]:
import gzip
import csv
import random
import numpy
import math
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import linear_model

In [33]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [34]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [35]:
def getHeader(path):
    f = gzip.open(path, 'rt', encoding = "utf8")
    header = f.readline()
    header = header.strip().split('\t')
    header = header[0].split(',')
    return header

In [36]:
data = []
train = []
valid = []

Load in data and split the training data (‘trainInteractions.csv.gz’) as follows:\
(1) Reviews 1-400,000 for training\
(2) Reviews 400,000-500,000 for validation

In [37]:
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
  data.append(d)

In [38]:
header = getHeader("trainInteractions.csv.gz")

In [39]:
train = data[:400000]
valid = data[400000:]

Useful precalculation

In [40]:
recipeCount = defaultdict(int)
totalCooked = 0
userSet = set()
recipeSet = set()
userPerRecipe = defaultdict(set)
recipePerUser = defaultdict(set)
ratingDict = {}

In [41]:
for d in train:
    user,recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1
    userSet.add(user)
    recipeSet.add(recipe)
    userPerRecipe[recipe].add(user)
    recipePerUser[user].add(recipe)
    ratingDict[(user,recipe)] = int(d['rating'])

### Q3
An alternate baseline than the one provided might make use of the Jaccard similarity (or another similarity metric). Given a pair $(u, g)$ in the validation set, consider all training items $g_0$ that user $u$ has cooked. For each, compute the Jaccard similarity between $g$ and $g_0$, i.e., users (in the training set) who have made $g$ and users who have made $g_0$. Predict as ‘made’ if the maximum of these Jaccard similarities exceeds a threshold (you may choose the threshold that works best). Report the performance on your validation set.

In [42]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

Pearson Similarity

In [43]:
userAverages = {}
recipeAverages = {}

for u in recipePerUser:
    rs = [ratingDict[(u,i)] for i in recipePerUser[u]]
    if len(rs) == 0:
        userAverages[u] = 0
        continue
    userAverages[u] = sum(rs) / len(rs)
    
for i in userPerRecipe:
    rs = [ratingDict[(u,i)] for u in userPerRecipe[i]]
    if len(rs) == 0:
        recipeAverages[i] = 0
        continue
    recipeAverages[i] = sum(rs) / len(rs)

In [44]:
def Pearson(i1, i2):
    # Between two items
    iBar1 = recipeAverages[i1]
    iBar2 = recipeAverages[i2]
    inter = userPerRecipe[i1].intersection(userPerRecipe[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += (ratingDict[(u,i1)] - iBar1)*(ratingDict[(u,i2)] - iBar2)
    for u in inter: #usersPerItem[i1]:
        denom1 += (ratingDict[(u,i1)] - iBar1)**2
    #for u in usersPerItem[i2]:
        denom2 += (ratingDict[(u,i2)] - iBar2)**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [45]:
def Q3():
    thresholds = numpy.arange(0, 1, 0.1)
    res = []
    for threshold in thresholds:
        correct = 0
        for d in valid:
            similarities = [0]
            userCooked = recipePerUser[d['user_id']]
            for recipe in userCooked:
                similarities.append(Pearson(d['recipe_id'], recipe))
                
            if max(similarities) > threshold:
                correct += (d['rating'] != '-1')
            else:
                correct += (d['rating'] == '-1')

        res.append(correct/len(valid))

    plt.plot(thresholds, res, 'b-')
    plt.xlabel('Threshold')
    plt.ylabel('Accuracy for different thresholds')
    plt.show()
    return res

In [30]:
res = Q3()

KeyError: '57597698'

In [31]:
print("Accuracy on Validation set with similarity threshold ", 0.001 + res.index(max(res)) * 0.001, " is")
print(max(res))

NameError: name 'res' is not defined

### Q4
Improve the above predictor by incorporating both a Jaccard-based threshold and a popularity based threshold. Report the performance on your validation set.

In [None]:
def Q4():
    
    pop_threshold = 1.69
    sim_threshold = 0.012
    
    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked / pop_threshold:
            break

    correct = 0
    for d in valid:
        similarities = [0]
        for recipe in recipePerUser[d['user_id']]:
            if d['recipe_id'] not in userPerRecipe:
                similarities.append(0)
            else:
                similarities.append(Jaccard(userPerRecipe[d['recipe_id']], userPerRecipe[recipe]))

        if max(similarities) > sim_threshold and d['recipe_id'] in return1:
                correct += (d['rating'] != '-1')
        else:
                correct += (d['rating'] == '-1')
    
    print("Accuracy on Validation set with \npopularity threshold ", pop_threshold, 
          " (totalCooked / threshold) \nand similarity threshold ", sim_threshold, " is")
    print(correct/len(valid))

In [None]:
Q4()

### Q5
To run our model on the test set, we’ll have to use the files ‘stub Made.txt’ to find the user id/recipe id pairs about which we have to make predictions. Using that data, run the above model and upload your solution to Kaggle. Tell us your Kaggle user name. If you’ve already uploaded a better solution to Kaggle, that’s fine too!

Kaggle user name: ZiangX\
Score: 0.67190

In [None]:
def Q5():
    
    pop_threshold = 1.69
    sim_threshold = 0.012
    
    mostPopular = [(recipeCount[x], x) for x in recipeCount]
    mostPopular.sort()
    mostPopular.reverse()

    return1 = set()
    count = 0
    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > totalCooked / pop_threshold:
            break               
                
    predictions = open("predictions_Made.txt", 'w')
    for l in open("stub_Made.txt"):
        if l.startswith("user_id"):
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        similarities = [0]
        for recipe in recipePerUser[u]:
            if i not in userPerRecipe:
                similarities.append(0)
            else:
                similarities.append(Jaccard(userPerRecipe[i], userPerRecipe[recipe]))  
        
        if max(similarities) > sim_threshold and i in return1:
            predictions.write(u + '-' + i + ",1\n")
        else:
            predictions.write(u + '-' + i + ",0\n")

    predictions.close()

In [None]:
Q5()

## Tasks (Rating prediction)
Let’s start by building our training/validation sets much as we did for the first task. This time building a validation set is more straightforward: you can simply use part of the data for validation, and do not need to randomly sample non-cooked users/recipes.

In [46]:
train = data[:400000]
valid = data[400000:]

In [47]:
import scipy
import tensorflow as tf

### Q9
Fit a predictor of the form\
$$rating(user, item) \simeq \alpha + \beta_user + \beta_item$$\
by fitting the mean and the two bias terms as described in the lecture notes. Use a regularization
parameter of $\lambda = 1$. Report the MSE on the validation set.

In [48]:
userIDs = {}
itemIDs = {}
interactions = []

for d in data:
    u = d['user_id']
    i = d['recipe_id']
    r = int(d['rating'])
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

In [49]:
interactionsTrain = interactions[:400000]
interactionsValid = interactions[400000:]

In [50]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

In [51]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [52]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

In [53]:
modelLFM = LatentFactorModel(mu, 5, 0.00001)

2021-11-07 12:28:10.334415: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-07 12:28:10.334766: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [54]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [55]:
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.40737453
iteration 20, objective = 0.38777682
iteration 30, objective = 0.39532566
iteration 40, objective = 0.40330067
iteration 50, objective = 0.3978235
iteration 60, objective = 0.408463
iteration 70, objective = 0.39260688
iteration 80, objective = 0.39281672
iteration 90, objective = 0.4075775
iteration 100, objective = 0.4036573


In [56]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [58]:
Predictions =\
    [modelLFM.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsTrain]

KeyboardInterrupt: 

In [None]:
labels = [r for _,_,r in interactionsTrain]

In [None]:
print('MSE on test set is')
MSE(Predictions, labels)

In [None]:
Predictions_valid =\
    [modelLFM.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsValid]
labels_valid = [r for _,_,r in interactionsValid]

In [None]:
print('MSE on validation set is')
MSE(Predictions_valid, labels_valid)

### Q11
Find a better value of $\lambda$ using your validation set. Report the value you chose, its MSE, and upload your
solution to Kaggle by running it on the test data.

Kaggle user name: ZiangX\
Score: 0.83678

In [None]:
allRatings = []
userRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
  r = int(d['rating'])
  allRatings.append(r)
  userRatings[user].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
  userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

predictions = open("predictions_Rated.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u in userIDs and i in itemIDs:
        predictions.write(u + '-' + i + ',' + str(modelBiasOnly.predict(userIDs[u],itemIDs[i]).numpy()) + '\n')
    elif u in userAverage:
        predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
    else:
        predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()