# CSE 258, Fall 2021: Homework 3

## Tasks (Cook/Make prediction)

In [1]:
import gzip
import csv
import random
import numpy
import math
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import linear_model
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split

In [2]:
def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

In [3]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    c = csv.reader(f)
    header = next(c)
    for l in c:
        d = dict(zip(header,l))
        yield d['user_id'],d['recipe_id'],d

In [4]:
def getHeader(path):
    f = gzip.open(path, 'rt', encoding = "utf8")
    header = f.readline()
    header = header.strip().split('\t')
    header = header[0].split(',')
    return header

In [5]:
data = []
train = []
valid = []

Load in data and split the training data (‘trainInteractions.csv.gz’) as follows:\
(1) Reviews 1-400,000 for training\
(2) Reviews 400,000-500,000 for validation

In [6]:
for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    data.append(d)

In [7]:
header = getHeader("trainInteractions.csv.gz")

In [8]:
train = data[:400000]
valid = data[400000:]

Useful precalculation

In [10]:
recipeCount = defaultdict(int)
totalCooked = 0
userSet = set()
recipeSet = set()
userPerRecipe = defaultdict(set)
recipePerUser = defaultdict(set)
ratingDict = {}

In [11]:
for d in data:
    user,recipe = d['user_id'], d['recipe_id']
    recipeCount[recipe] += 1
    totalCooked += 1
    userSet.add(user)
    recipeSet.add(recipe)
    userPerRecipe[recipe].add(user)
    recipePerUser[user].add(recipe)
    ratingDict[(user,recipe)] = int(d['rating'])

In [12]:
userIDs = {}
itemIDs = {}
interactions = []

for d in data:
    u = d['user_id']
    i = d['recipe_id']
    r = int(d['rating'])
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))

In [13]:
interactionsTrain = interactions

In [14]:
len(interactionsTrain)

500000

In [15]:
mu = sum([r for _,_,r in interactionsTrain]) / len(interactionsTrain)

In [16]:
mu

4.580794

In [17]:
optimizer = tf.keras.optimizers.Adam(0.1)

NameError: name 'tf' is not defined

In [18]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001))
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    # Prediction for a single instance (useful for evaluation)
    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] +\
            tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))
    
    # Prediction for a sample of instances
    def predictSample(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        pred = self.alpha + beta_u + beta_i +\
               tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return pred
    
    # Loss
    def call(self, sampleU, sampleI, sampleR):
        pred = self.predictSample(sampleU, sampleI)
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r) / len(sampleR)

NameError: name 'tf' is not defined

In [None]:
def trainingStep(model, interactions):
    Nsamples = 400000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU,sampleI,sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [None]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [None]:
def exp():
    lmds = numpy.arange(0, 1, 0.1)
    
    for lmd in lmds:
        modelLFM = LatentFactorModel(mu, 1, lmd)
        for i in range(50):
            obj = trainingStep(modelLFM, interactionsTrain)
#             if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
        
        Predictions_valid =\
            [modelLFM.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsValid]
        labels_valid = [r for _,_,r in interactionsValid]
        
        print('MSE on validation set is', lmd)
        print(MSE(Predictions_valid, labels_valid))
        print('-----------------------------------------')

In [None]:
exp()

In [None]:
# Predictions = []
# for u,i,_ in interactionsTrain:
#     Predictions.append(modelLFM.predict(userIDs[u],itemIDs[i]).numpy())

In [None]:
# Predictions =\
#     [modelLFM.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsTrain]

In [None]:
# labels = [r for _,_,r in interactionsTrain]

In [None]:
# print('MSE on train set is')
# MSE(Predictions, labels)

In [None]:
model = LatentFactorModel(mu, 1, 0.00001)

In [None]:
for i in range(50):
    obj = trainingStep(model, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

In [None]:
Predictions_valid =\
    [model.predict(userIDs[u],itemIDs[i]).numpy() for u,i,_ in interactionsValid]
labels_valid = [r for _,_,r in interactionsValid]

In [None]:
print('MSE on validation set is')
MSE(Predictions_valid, labels_valid)

hw3 0.9097578798565606

### Q11
Find a better value of $\lambda$ using your validation set. Report the value you chose, its MSE, and upload your
solution to Kaggle by running it on the test data.

Kaggle user name: ZiangX\
Score: 0.83678

In [None]:
allRatings = []
userRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

predictions = open("predictions_Rated3.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u in userIDs and i in itemIDs:
        predictions.write(u + '-' + i + ',' + str(model.predict(userIDs[u],itemIDs[i]).numpy()) + '\n')
    elif u in userAverage:
        predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
    else:
        predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()

In [None]:
model = SVD()

In [None]:
model.fit(data)

In [None]:
allRatings = []
userRatings = defaultdict(list)
itemRatings = defaultdict(list)

for user,recipe,d in readCSV("trainInteractions.csv.gz"):
    r = int(d['rating'])
    allRatings.append(r)
    userRatings[user].append(r)
    itemRatings[recipe].append(r)

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
itemAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])
for i in itemRatings:
    itemAverage[i] = sum(itemRatings[i]) / len(itemRatings[i])

predictions = open("predictions_Rated3.txt", 'w')
for l in open("stub_Rated.txt"):
    if l.startswith("user_id"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u in userIDs and i in itemIDs:
        predictions.write(u + '-' + i + ',' + str(model.predict(userIDs[u],itemIDs[i]).numpy()) + '\n')
    elif i in itemAverage:
        predictions.write(u + '-' + i + ',' + str(itemAverage[i]) + '\n')
    elif u in userAverage:
        predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
    else:
        predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()