# CSE 258 Assignment 1 Rating
**Ming Ki Toby Cheng**

In [1]:
import gzip
from collections import defaultdict
import scipy
import scipy.optimize
from sklearn import linear_model
import numpy
import random
import matplotlib.pyplot as plt

In [2]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    path -- location of the gzip file, opened in text mode.

    The file is closed when the generator is exhausted or closed.
    """
    with gzip.open(path, "rt") as f:
        for line in f:
            # SECURITY: eval() executes whatever the line contains, so this must
            # only be used on trusted data files.
            yield eval(line)


def readCSV(path):
    """Yield the comma-separated fields of each data row in a gzip-compressed CSV.

    path -- location of the gzip file, opened in text mode.

    The first line is treated as a header and skipped. Each remaining line is
    stripped of surrounding whitespace and split on "," into a list of strings.
    The file is closed when the generator is exhausted or closed.
    """
    with gzip.open(path, "rt") as f:
        f.readline()  # discard the header row
        for line in f:
            yield line.strip().split(",")

In [3]:
# Defining MSE function
def MSE(predictions, labels):
    """Return the mean of the squared differences between predictions and labels.

    The two sequences are paired with zip(), so the mean is taken over the
    shorter of the two.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [4]:
# Load every (user, book, rating) interaction into three parallel lists.
users = []
books = []
ratings = []

for user, book, rating in readCSV("train_Interactions.csv.gz"):
    users.append(user)
    books.append(book)
    # readCSV yields strings; store the rating as an int so that later
    # arithmetic on it (bias updates, MSE against labels) works.
    ratings.append(int(rating))

In [5]:
# Re-initializing data
# The first 190000 interactions form the training split; the rest are held out
# for validation.
users_train, users_valid = users[:190000], users[190000:]
books_train, books_valid = books[:190000], books[190000:]
ratings_train, ratings_valid = ratings[:190000], ratings[190000:]

In [6]:
# Bundle each split into a list of (user, book, rating) triples.
training_rating = [*zip(users_train, books_train, ratings_train)]
validation_rating = [*zip(users_valid, books_valid, ratings_valid)]

In [7]:
# Integer training ratings: all of them, grouped by user, and grouped by book.
train_rating = []
userRatings = defaultdict(list)
bookRatings = defaultdict(list)

for user, book, r in training_rating:
    score = int(r)
    train_rating.append(score)
    userRatings[user].append(score)
    bookRatings[book].append(score)


In [8]:
# Mean training rating; used as the initial value of the offset alpha.
trainAverage = sum(train_rating) / len(train_rating)
alpha = trainAverage

# Users and items seen in training, in dictionary (insertion) order.
users = list(userRatings)
items = list(bookRatings)
nUsers = len(userRatings)
nItems = len(bookRatings)

# Model parameters keyed by user id / item id.
userBiases, itemBiases = defaultdict(float), defaultdict(float)
userGamma, itemGamma = defaultdict(float), defaultdict(float)


In [9]:
# Number of latent dimensions per user / item.
K = 1

# Give every training user, then every training book, K small random factors
# drawn from [-0.05, 0.05).
userGamma.update(
    (u, [random.random() * 0.1 - 0.05 for _ in range(K)]) for u in userRatings
)
itemGamma.update(
    (i, [random.random() * 0.1 - 0.05 for _ in range(K)]) for i in bookRatings
)

In [10]:
def unpack(theta):
    """Spread the flat parameter vector theta over the module-level model state.

    Layout of theta: alpha, then nUsers user biases, then nItems item biases,
    then K gamma values for each user, then K gamma values for each item.
    alpha, userBiases and itemBiases are rebound; userGamma and itemGamma are
    updated entry by entry.
    """
    global alpha, userBiases, itemBiases, userGamma, itemGamma
    alpha = theta[0]

    user_bias_start = 1
    item_bias_start = user_bias_start + nUsers
    gamma_start = item_bias_start + nItems
    userBiases = dict(zip(users, theta[user_bias_start:item_bias_start]))
    itemBiases = dict(zip(items, theta[item_bias_start:gamma_start]))

    # The gamma vectors are stored back to back, K entries at a time.
    offset = gamma_start
    for u in users:
        userGamma[u] = theta[offset:offset + K]
        offset += K
    for i in items:
        itemGamma[i] = theta[offset:offset + K]
        offset += K

In [11]:
def inner(x, y):
    """Return the sum of element-wise products of x and y (paired with zip)."""
    return sum(left * right for left, right in zip(x, y))

# def prediction(user, item):
#      return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])

# Defining predictor function
def prediction(user, item):
    """Predict a rating for (user, item) from the module-level model state.

    A user or item missing from userBiases / itemBiases contributes nothing:
    the result falls back to alpha plus whichever bias is known. The latent
    factor term is only added when both the user and the item are known.
    """
    known_user = user in userBiases
    known_item = item in itemBiases
    if known_user and known_item:
        return alpha + userBiases[user] + itemBiases[item] + inner(userGamma[user], itemGamma[item])
    if known_item:
        return alpha + itemBiases[item]
    if known_user:
        return alpha + userBiases[user]
    return alpha

In [12]:
def cost(theta, labels, lamb):
    """Return the regularized training objective for the parameter vector theta.

    theta  -- flat parameter vector; unpacked into the module-level model state
    labels -- target ratings, in the same order as training_rating
    lamb   -- regularization strength

    The objective is the MSE of the predictions on training_rating plus lamb
    times the squared user/item biases and gamma entries. The unregularized MSE
    is printed on every call.
    """
    unpack(theta)
    predicted = [prediction(user, book) for user, book, _ in training_rating]
    objective = MSE(predicted, labels)
    print("MSE = " + str(objective))

    # Penalty terms: users first, then items, bias before gammas.
    for u in users:
        objective += lamb * userBiases[u] ** 2
        for k in range(K):
            objective += lamb * userGamma[u][k] ** 2
    for i in items:
        objective += lamb * itemBiases[i] ** 2
        for k in range(K):
            objective += lamb * itemGamma[i][k] ** 2
    return objective

In [13]:
def derivative(theta, labels, lamb):
    """Return the gradient of the regularized objective as a numpy array.

    theta  -- flat parameter vector; unpacked into the module-level model state
    labels -- not read here; targets are taken from training_rating
    lamb   -- regularization strength

    The gradient entries are ordered like theta: alpha, user biases, item
    biases, user gamma vectors, item gamma vectors.
    """
    unpack(theta)
    N = len(training_rating)
    dalpha = 0
    dUserBiases = defaultdict(float)
    dItemBiases = defaultdict(float)
    dUserGamma = {u: [0.0] * K for u in userRatings}
    dItemGamma = {i: [0.0] * K for i in bookRatings}

    # Contribution of the squared-error term, one training triple at a time.
    for u, i, r in training_rating:
        diff = prediction(u, i) - int(r)
        dalpha += 2 / N * diff
        dUserBiases[u] += 2 / N * diff
        dItemBiases[i] += 2 / N * diff
        for k in range(K):
            dUserGamma[u][k] += 2 / N * itemGamma[i][k] * diff
            dItemGamma[i][k] += 2 / N * userGamma[u][k] * diff

    # Contribution of the regularizer (alpha is not regularized).
    for u, bias in userBiases.items():
        dUserBiases[u] += 2 * lamb * bias
        for k in range(K):
            dUserGamma[u][k] += 2 * lamb * userGamma[u][k]
    for i, bias in itemBiases.items():
        dItemBiases[i] += 2 * lamb * bias
        for k in range(K):
            dItemGamma[i][k] += 2 * lamb * itemGamma[i][k]

    # Flatten in the same order that unpack() reads theta.
    dtheta = [dalpha]
    dtheta.extend(dUserBiases[u] for u in users)
    dtheta.extend(dItemBiases[i] for i in items)
    for u in users:
        dtheta.extend(dUserGamma[u])
    for i in items:
        dtheta.extend(dItemGamma[i])
    return numpy.array(dtheta)

In [14]:
# Integer target ratings, in the same order as training_rating.
labels = [int(r) for _, _, r in training_rating]

In [15]:
# Fit the model with L-BFGS-B, using cost() as the objective and derivative()
# as its gradient, with regularization strength 0.00007.
scipy.optimize.fmin_l_bfgs_b(
    func=cost,
    x0=(
        [alpha]  # Initialize alpha
        + [0.0] * (nUsers + nItems)  # Initialize beta
        + [random.random() * 0.1 - 0.05 for _ in range(K * (nUsers + nItems))]  # Gamma
    ),
    fprime=derivative,
    args=(labels, 0.00007),
)

MSE = 1.4735497584932453
MSE = 1.4561022672576671
MSE = 1.392315273385649
MSE = 8.073102620738412
MSE = 1.3750058885332772
MSE = 1.2119086141150166
MSE = 1.2103906463933831
MSE = 1.2044224527934595
MSE = 1.1822085137951317
MSE = 1.07588008150663
MSE = 1.0457248331346796
MSE = 1.0191965181595632
MSE = 1.0150351632159835
MSE = 1.0165950844639755
MSE = 1.0164807340050686
MSE = 1.0154383720483349
MSE = 1.0141403208116053
MSE = 1.0139030271925975
MSE = 1.013997140381256
MSE = 1.019516169145231
MSE = 1.0139937532459375
MSE = 1.0138315591497997
MSE = 1.0137498537704501
MSE = 1.0136440751741382
MSE = 1.0136546270664304
MSE = 1.0137898348256824
MSE = 1.0139059612337105
MSE = 1.01391201526589
MSE = 1.013912649690732
MSE = 1.0137326725621756
MSE = 1.0084504125083744
MSE = 1.0131576783447587
MSE = 1.012084938925759
MSE = 1.0114747642721849
MSE = 1.0114740875218557
MSE = 1.0094119327732838
MSE = 1.0104216723689758
MSE = 1.0100236959101812
MSE = 1.0099198610512967
MSE = 1.0097993025494842
MSE = 1.00

KeyboardInterrupt: 

In [None]:
# Model predictions and integer ground-truth ratings for the validation split.
val_pred = [prediction(user, book) for user, book, _ in validation_rating]
real_rate = [int(rating) for _, _, rating in validation_rating]

In [None]:
# Mean squared error of val_pred against real_rate.
MSE(val_pred,real_rate)

MSE obtained for each regularization strength (lambda) tried:

- lambda = 0.0005: MSE = 1.340872431
- lambda = 0.0001: MSE = 1.177193

In [None]:
# Initializing user and book biases from the ratings held in userRatings and
# bookRatings.
# The original read `allRatings`, which is not defined anywhere in this
# notebook; `train_rating` is the list of ratings collected alongside
# userRatings/bookRatings.
globalAverage = sum(train_rating) / len(train_rating)
userAverage = {}
userBias = {}
user_total = 0
bookAverage = {}
bookBias = {}
book_total = 0


# Mean rating given by each user.
for user in userRatings:
    userAverage[user] = sum(userRatings[user]) / len(userRatings[user])

for user in userAverage:
    user_total += float(userAverage[user])

# A user's starting bias is their mean minus the mean of all user means.
for user in userAverage:
    userBias[user] = userAverage[user] - (user_total/len(userAverage))

# Mean rating received by each book.
for book in bookRatings:
    bookAverage[book] = sum(bookRatings[book]) / len(bookRatings[book])

for book in bookAverage:
    book_total += float(bookAverage[book])

# A book's starting bias is its mean minus the mean of all book means.
for book in bookAverage:
    bookBias[book] = bookAverage[book] - (book_total/len(bookAverage))

# Sweep the regularization strength lamb over [2.75, 3.5) in steps of 0.01.
# For each value, refit alpha, userBias and bookBias by alternating updates and
# record the validation MSE.
# NOTE(review): booksPerUser, usersPerBook and ratingPerCombo are only built in
# a later cell, and prediction() reads userBiases/itemBiases rather than the
# userBias/bookBias updated here -- confirm that is intended.
Reg = []
MSE_val = []

# MSE() subtracts labels from predictions, so the labels must be numeric.
train_labels = [int(r) for r in ratings_train]
valid_labels = [int(r) for r in ratings_valid]

for i in numpy.arange(2.75, 3.5, 0.01):
    lamb = i
    alpha_sum = 0
    alpha = globalAverage
    MSE_diff = 5
    MSE_0 = 0
    trial = 0

    # Stop once the training MSE changes by no more than 0.00005, or after 1000
    # iterations. The previous `or trial > 1000` never capped the loop and made
    # it unable to terminate once trial exceeded 1000.
    while MSE_diff > 0.00005 and trial < 1000:
        alpha_sum = 0
        for user, book, r in training_rating:
            alpha_sum += (int(r)-(userBias[user] + bookBias[book]))
        alpha = alpha_sum/len(training_rating)

        # Inner loop variables are named rated_book / rater so they do not
        # overwrite the module-level `books` and `users` lists.
        for user in booksPerUser:
            beta_U_Sum = 0
            for rated_book in booksPerUser[user]:
                beta_U_Sum += ratingPerCombo[user, rated_book] - (alpha + bookBias[rated_book])
            userBias[user] = beta_U_Sum/(lamb + len(booksPerUser[user]))

        for book in usersPerBook:
            beta_I_Sum = 0
            for rater in usersPerBook[book]:
                beta_I_Sum += ratingPerCombo[rater, book] - (alpha + userBias[rater])
            bookBias[book] = beta_I_Sum/(lamb + len(usersPerBook[book]))

        model_predictions = [prediction(user, book) for user, book, r in training_rating]

        MSE_1 = MSE(model_predictions, train_labels)
        MSE_diff = abs(MSE_1 - MSE_0)
        MSE_0 = MSE_1
        trial += 1

    # Predictions and MSE on validation data for this value of lamb.
    model_predictions = [prediction(user, book) for user, book, r in validation_rating]

    MSE_valid = MSE(model_predictions, valid_labels)
    Reg.append(i)
    MSE_val.append(MSE_valid)

In [None]:
# Predictions and MSE on validation data
model_predictions = []
valid_labels = []
for user, book, r in validation_rating:
    model_predictions.append(prediction(user, book))
    # MSE() needs numeric labels; convert the rating as the earlier validation
    # cell does.
    valid_labels.append(int(r))

MSE_valid = MSE(model_predictions, valid_labels)
print(MSE_valid)

In [None]:
# Validation MSE against regularization strength, drawn as a red dashed line.
plt.plot(Reg, MSE_val, 'r--')
plt.xlabel('Regularization Values')
plt.ylabel('MSE')
plt.show()

In [None]:
# Report the position, regularization value and MSE of the lowest recorded MSE
# (first occurrence when tied).
best_mse = min(MSE_val)
best_index = MSE_val.index(best_mse)
print(best_index, Reg[best_index], best_mse)

In [None]:
# Defining alpha, bookBias, userBias by convergence from Training data with
# regularization strength lamb = 3.15.
# NOTE(review): prediction() reads userBiases/itemBiases rather than the
# userBias/bookBias updated here -- confirm that is intended.
lamb = 3.15
alpha_sum = 0
alpha = 0
MSE_diff = 5
MSE_0 = 0
trial = 0

# MSE() subtracts labels from predictions, so the labels must be numeric.
train_labels = [int(r) for r in ratings_train]

# Stop once the training MSE changes by no more than 0.00005, or after 1000
# iterations. The previous `or trial > 1000` never capped the loop and made it
# unable to terminate once trial exceeded 1000.
while MSE_diff > 0.00005 and trial < 1000:
    alpha_sum = 0

    for user, book, r in training_rating:
        # int() because the ratings in training_rating may still be strings.
        alpha_sum += (int(r)-(userBias[user] + bookBias[book]))
    alpha = alpha_sum/len(training_rating)

    # Inner loop variables are named rated_book / rater so they do not
    # overwrite the module-level `books` and `users` lists.
    for user in booksPerUser:
        beta_U_Sum = 0
        for rated_book in booksPerUser[user]:
            beta_U_Sum += ratingPerCombo[user, rated_book] - (alpha + bookBias[rated_book])
        userBias[user] = beta_U_Sum/(lamb + len(booksPerUser[user]))

    for book in usersPerBook:
        beta_I_Sum = 0
        for rater in usersPerBook[book]:
            beta_I_Sum += ratingPerCombo[rater, book] - (alpha + userBias[rater])
        bookBias[book] = beta_I_Sum/(lamb + len(usersPerBook[book]))

    model_predictions = [prediction(user, book) for user, book, r in training_rating]

    MSE_1 = MSE(model_predictions, train_labels)
    MSE_diff = abs(MSE_1 - MSE_0)

    print('MSE_new:', MSE_1)

    MSE_0 = MSE_1
    trial += 1

In [None]:
# Predictions and MSE on validation data
model_predictions = []
valid_labels = []
for user, book, r in validation_rating:
    model_predictions.append(prediction(user, book))
    # MSE() needs numeric labels; convert the rating as the earlier validation
    # cell does.
    valid_labels.append(int(r))

MSE_valid = MSE(model_predictions, valid_labels)
print(MSE_valid)

#### Using the lambda selected above, now train on all the available data

In [None]:
# Re-read every interaction, this time storing the rating as an integer.
users, books, ratings = [], [], []
interactions = readCSV("train_Interactions.csv.gz")
for user, book, _ in interactions:
    users.append(user)
    books.append(book)
    ratings.append(int(_))

In [None]:
# Every interaction as a (user, book, rating) triple.
all_ratings = [*zip(users, books, ratings)]

In [None]:
# Lookup tables over all interactions: the rating of each (user, book) pair,
# the set of users per book, and the set of books per user.
ratingPerCombo = {}
usersPerBook = defaultdict(set)
booksPerUser = defaultdict(set)

for user, book, r in all_ratings:
    ratingPerCombo[user, book] = int(r)
    booksPerUser[user].add(book)
    usersPerBook[book].add(user)

In [None]:
# Integer ratings over all interactions: the full list, grouped by user, and
# grouped by book.
totalRatings = []
userRatings = defaultdict(list)
bookRatings = defaultdict(list)

for user, book, r in all_ratings:
    r = int(r)
    userRatings[user].append(r)
    bookRatings[book].append(r)
    totalRatings.append(r)

# Mean of every rating.
globalAverage = sum(totalRatings) / len(totalRatings)

# Initializing user biases: each user's mean rating minus the mean of all
# user means.
userAverage = {user: sum(given) / len(given) for user, given in userRatings.items()}
user_total = 0
for average in userAverage.values():
    user_total += float(average)
userBias = {
    user: average - (user_total / len(userAverage))
    for user, average in userAverage.items()
}

# Initializing book biases: each book's mean rating minus the mean of all
# book means.
bookAverage = {book: sum(received) / len(received) for book, received in bookRatings.items()}
book_total = 0
for average in bookAverage.values():
    book_total += float(average)
bookBias = {
    book: average - (book_total / len(bookAverage))
    for book, average in bookAverage.items()
}

In [None]:
# Defining alpha, bookBias, userBias by convergence from all the data with
# regularization strength lamb = 3.15.
# NOTE(review): prediction() reads userBiases/itemBiases rather than the
# userBias/bookBias updated here -- confirm that is intended.
lamb = 3.15
alpha_sum = 0
alpha = 0
MSE_diff = 5
MSE_0 = 0
trial = 0

# The targets do not change between iterations, so collect them once.
actual_rating = [r for _user, _book, r in all_ratings]

# Stop once the MSE changes by no more than 0.00005, or after 1000 iterations.
# The previous `or trial > 1000` never capped the loop and made it unable to
# terminate once trial exceeded 1000.
while MSE_diff > 0.00005 and trial < 1000:
    alpha_sum = 0

    for user, book, r in all_ratings:
        alpha_sum += (r-(userBias[user] + bookBias[book]))
    alpha = alpha_sum/len(all_ratings)

    # Inner loop variables are named rated_book / rater so they do not
    # overwrite the module-level `books` and `users` lists.
    for user in booksPerUser:
        beta_U_Sum = 0
        for rated_book in booksPerUser[user]:
            beta_U_Sum += ratingPerCombo[user, rated_book] - (alpha + bookBias[rated_book])
        userBias[user] = beta_U_Sum/(lamb + len(booksPerUser[user]))

    for book in usersPerBook:
        beta_I_Sum = 0
        for rater in usersPerBook[book]:
            beta_I_Sum += ratingPerCombo[rater, book] - (alpha + userBias[rater])
        bookBias[book] = beta_I_Sum/(lamb + len(usersPerBook[book]))

    # Predictions are compared untruncated, as in the training-only fit;
    # casting them to int first distorted the convergence check.
    model_predictions = [prediction(user, book) for user, book, r in all_ratings]

    MSE_1 = MSE(model_predictions, actual_rating)
    MSE_diff = abs(MSE_1 - MSE_0)

    print('MSE_new:', MSE_1)

    MSE_0 = MSE_1
    trial += 1

In [None]:
# Predictions and MSE on validation data
# NOTE(review): the preceding fit iterates over all_ratings, which is read from
# the same file the validation split was sliced from -- this figure is
# presumably not a held-out estimate; confirm.
model_predictions = []
valid_labels = []
for user, book, r in validation_rating:
    model_predictions.append(prediction(user, book))
    # MSE() needs numeric labels; convert the rating as the earlier validation
    # cell does.
    valid_labels.append(int(r))

MSE_valid = MSE(model_predictions, valid_labels)
print('MSE on validation set:',MSE_valid)

In [None]:
# Writing predictions of test set to file.
# Both files are managed by `with`, so they are closed even if a line fails to
# parse or prediction() raises.
with open("pairs_Rating.txt") as pairs, \
        open("predictions_Rating_Assignment1.txt", "w") as predictions:
    for line in pairs:
        if line.startswith("userID"):
            # header: copied through unchanged
            predictions.write(line)
            continue
        # Each remaining line is split on "-" into a user id and a book id.
        u, b = line.strip().split("-")
        predictions.write(u + "-" + b + "," + str(prediction(u, b)) + "\n")

**Kaggle Username: tobycheng or Toby Cheng**

**Kaggle Rating MSE: 1.13707**