# CSE 258, Fall 2021: Homework 3

In [1]:
import gzip
import csv
import random
import numpy
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import linear_model

In [2]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    The file is opened in text mode and closed automatically once the
    generator is exhausted, closed, or garbage-collected.
    """
    with gzip.open(path, 'rt') as f:
        for l in f:
            # NOTE(review): eval() executes arbitrary code contained in the
            # file. Only use this on trusted data; consider json.loads or
            # ast.literal_eval if the file format allows it.
            yield eval(l)

In [3]:
def readCSV(path):
    """Yield (user_id, recipe_id, row) for each data row of a gzipped CSV file.

    The first line is taken as the header; each following row is turned into
    a dict keyed by the header's column names (all values are strings).
    An empty file yields nothing.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with gzip.open(path, 'rt', newline='') as f:
        c = csv.reader(f)
        header = next(c, None)
        if header is None:
            # Empty file: a bare next(c) would raise StopIteration, which
            # turns into RuntimeError inside a generator.
            return
        for l in c:
            d = dict(zip(header, l))
            yield d['user_id'], d['recipe_id'], d

In [4]:
def getHeader(path):
    """Return the column names from the first line of a gzipped CSV file.

    Only the first line is read. It is stripped, cut at the first tab (if
    any), and the part before the tab is split on commas.
    """
    # Context manager so the handle is closed as soon as the line is read.
    with gzip.open(path, 'rt', encoding="utf8") as f:
        first_line = f.readline()
    first_field = first_line.strip().split('\t')[0]
    return first_field.split(',')

In [5]:
# Containers for the raw interaction rows and the train / validation splits.
data, train, valid = [], [], []

In [6]:
# Keep only the row dict of every interaction; the user / recipe ids are
# still available inside each row.
data.extend(row for _, _, row in readCSV("trainInteractions.csv.gz"))

In [7]:
# Column names taken from the first line of the interactions file.
header = getHeader("trainInteractions.csv.gz")

Useful precalculation

In [8]:
# Total number of interaction rows seen.
totalCooked = 0
# Number of interactions per recipe.
recipeCount = defaultdict(int)
# Distinct users / recipes that appear in the data.
userSet, recipeSet = set(), set()
# recipe -> users who interacted with it, and user -> recipes they interacted with.
recipePerUser = defaultdict(set)
userPerRecipe = defaultdict(set)

In [9]:
# Single pass over the interactions to fill the lookup structures.
for d in data:
    user = d['user_id']
    recipe = d['recipe_id']
    # Membership sets and the two adjacency maps.
    userSet.add(user)
    recipeSet.add(recipe)
    recipePerUser[user].add(recipe)
    userPerRecipe[recipe].add(user)
    # Popularity counter for the recipe.
    recipeCount[recipe] += 1
# One interaction per row, so the total grows by the number of rows.
totalCooked += len(data)

## Tasks (Rating prediction)
Let’s start by building our training/validation sets much as we did for the first task. This time building a validation set is more straightforward: you can simply use part of the data for validation, and do not need to randomly sample non-cooked users/recipes.

In [10]:
import scipy
import tensorflow as tf

### Q9
Fit a predictor of the form\
$$\text{rating}(\text{user}, \text{item}) \simeq \alpha + \beta_{\text{user}} + \beta_{\text{item}}$$\
by fitting the mean and the two bias terms as described in the lecture notes. Use a regularization
parameter of $\lambda = 1$. Report the MSE on the validation set.

In [11]:
# Dense 0-based integer index for every user / recipe id string, assigned in
# order of first appearance, plus the (user, recipe, rating) triples.
userIDs = {}
itemIDs = {}
interactions = []

for d in data:
    u, i = d['user_id'], d['recipe_id']
    r = int(d['rating'])
    # setdefault only inserts when the id is new, so the index equals the
    # number of ids seen before it.
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u, i, r))

In [12]:
# Train on every interaction: this is the same list object as `interactions`,
# so no rows are held out.
# NOTE(review): the Q9 text asks for the MSE on a validation set, but no
# validation split is created here -- confirm whether that is intended.
interactionsTrain = interactions

In [13]:
# Mean rating over the training interactions (used to initialise alpha).
mu = sum(rating for _, _, rating in interactionsTrain) / len(interactionsTrain)

In [14]:
# Module-level Adam optimizer (constructed with 0.1); trainingStepBiasOnly
# reads this global when applying gradients.
optimizer = tf.keras.optimizers.Adam(0.1)

In [15]:
class LatentFactorModelBiasOnly(tf.keras.Model):
    """Bias-only rating model: prediction = alpha + betaU[u] + betaI[i].

    The bias vectors are sized from the module-level `userIDs` and `itemIDs`
    dictionaries, so those must be populated before constructing the model.
    """

    def __init__(self, mu, lamb):
        """Create the model with offset `mu` and regularisation weight `lamb`."""
        super().__init__()
        # Global offset starts at the supplied value (the mean rating).
        self.alpha = tf.Variable(mu)
        # One bias per user and per item, started near zero.
        n_users = len(userIDs)
        n_items = len(itemIDs)
        self.betaU = tf.Variable(tf.random.normal([n_users], stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([n_items], stddev=0.001))
        self.lamb = lamb

    def predict(self, u, i):
        """Return the prediction for one (user index, item index) pair."""
        return self.alpha + self.betaU[u] + self.betaI[i]

    def reg(self):
        """Return lamb times the sum of squared user and item biases."""
        user_penalty = tf.reduce_sum(self.betaU ** 2)
        item_penalty = tf.reduce_sum(self.betaI ** 2)
        return self.lamb * (user_penalty + item_penalty)

    def predictSample(self, sampleU, sampleI):
        """Return predictions for parallel sequences of user / item indices."""
        user_idx = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        item_idx = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        user_bias = tf.nn.embedding_lookup(self.betaU, user_idx)
        item_bias = tf.nn.embedding_lookup(self.betaI, item_idx)
        return self.alpha + user_bias + item_bias

    def call(self, sampleU, sampleI, sampleR):
        """Return tf.nn.l2_loss of the prediction errors divided by the sample size."""
        predicted = self.predictSample(sampleU, sampleI)
        target = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(predicted - target) / len(sampleR)

In [16]:
def trainingStepBiasOnly(model, interactions, Nsamples=400000):
    """Run one optimisation step on a random sample and return the objective.

    Args:
        model: model whose call(sampleU, sampleI, sampleR) returns the data
            loss and whose reg() returns the regularisation term.
        interactions: sequence of (user, item, rating) triples; user and item
            are looked up in the module-level `userIDs` / `itemIDs` maps.
        Nsamples: number of interactions drawn (with replacement) for this
            step. Defaults to the previously hard-coded 400000.

    Returns:
        The loss plus regulariser for the sampled batch, as a NumPy value.
    """
    # Sampling is plain Python and needs no gradient tracking, so it is done
    # before the tape is opened.
    sampleU, sampleI, sampleR = [], [], []
    for _ in range(Nsamples):
        u, i, r = random.choice(interactions)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleR.append(r)

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Skip variables that received no gradient in this step.
    optimizer.apply_gradients(
        (grad, var)
        for grad, var in zip(gradients, model.trainable_variables)
        if grad is not None)
    return loss.numpy()

In [17]:
# Bias-only model initialised at the mean rating, with regularisation weight 1e-5.
# NOTE(review): the Q9 text asks for lambda = 1; confirm that 0.00001 is the
# intended value for this model's regulariser.
modelBiasOnly = LatentFactorModelBiasOnly(mu, 0.00001)

2021-11-14 17:48:50.932103: I tensorflow/core/platform/cpu_feature_guard.cc:145] This TensorFlow binary is optimized with Intel(R) MKL-DNN to use the following CPU instructions in performance critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in non-MKL-DNN operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-14 17:48:50.932496: I tensorflow/core/common_runtime/process_util.cc:115] Creating new thread pool with default inter op setting: 8. Tune using inter_op_parallelism_threads for best performance.


In [18]:
# Run 100 training steps on the training interactions; after the loop `obj`
# holds the objective value returned by the last step.
for i in range(100):
    obj = trainingStepBiasOnly(modelBiasOnly, interactionsTrain)

In [19]:
# Collect every rating overall, per user and per recipe so that mean ratings
# can serve as fallbacks when the model cannot score a pair.
allRatings = []
userRatings = defaultdict(list)
itemRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)
    itemRatings[recipe].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {u: sum(rs) / len(rs) for u, rs in userRatings.items()}
itemAverage = {i: sum(rs) / len(rs) for i, rs in itemRatings.items()}

# Both files are managed by one `with` so they are closed (and the output
# flushed) even if a line of the stub file fails to parse.
with open("predictions_Rated_Y.txt", 'w') as predictions, \
        open("stub_Rated.txt") as stub:
    for l in stub:
        if l.startswith("user_id"):
            # Header line is copied through unchanged.
            predictions.write(l)
            continue
        u,i = l.strip().split('-')
        # Fallback chain: model prediction when both ids were seen in
        # training, then the user's mean, then the recipe's mean, then the
        # global mean.
        if u in userIDs and i in itemIDs:
            pred = modelBiasOnly.predict(userIDs[u],itemIDs[i]).numpy()
        elif u in userAverage:
            pred = userAverage[u]
        elif i in itemAverage:
            pred = itemAverage[i]
        else:
            pred = globalAverage
        # str() is used deliberately so the numeric text matches str(value).
        predictions.write(u + '-' + i + ',' + str(pred) + '\n')