In [None]:
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from __future__ import print_function
from collections import defaultdict

# Directory prefix that is prepended to data file names (e.g. "all_data.pickle").
# NOTE(review): this is a machine-specific absolute path; change it to wherever
# the data lives before running the notebook on another machine.
data_root = '/home/linuxthink/data/CSE255/'

In [None]:
# Load the full pickled dataset and report how long the load took (seconds).
start_time = time.time()
# Use a context manager so the file handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only load trusted files.
with open(data_root + "all_data.pickle", "rb") as pickle_file:
    all_data = pickle.load(pickle_file)
print(time.time() - start_time)

In [None]:
# Fixed train / validation split sizes; together they must cover every record.
num_all = len(all_data)
num_train, num_valid = 900000, 100000
assert num_all == num_train + num_valid

In [None]:
# The first num_train records are used for training; the remainder is held out
# for validation.
train_data, valid_data = all_data[:num_train], all_data[num_train:]

In [None]:
# Print the first training record to inspect its fields
# (later cells read its 'reviewerID', 'itemID' and 'rating' keys).
datum = train_data[0]
print(datum)

In [None]:
# 2.1 (version 0) average predictor: using raw data
# rating(user, item) = alpha

# alpha is the mean of all training ratings
train_ratings = np.array([record['rating'] for record in train_data]).astype(float)
alpha = train_ratings.mean()
print(train_ratings.shape)
print('alpha', alpha)

# mean squared error of the constant prediction on the validation ratings
valid_ratings = np.array([record['rating'] for record in valid_data]).astype(float)
print(valid_ratings.shape)
squared_errors = (valid_ratings - alpha) ** 2.0
valid_mse = (1. / num_valid) * np.sum(squared_errors)
print('valid_mse', valid_mse)

In [None]:
# String-id variant: no id -> index mapping, the raw reviewerID / itemID
# strings are kept as they are.

# every distinct item id and user id in the data, sorted
item_ids = sorted({d['itemID'] for d in all_data})
user_ids = sorted({d['reviewerID'] for d in all_data})

# training rows of [reviewerID, itemID, rating]
train_rating_list = [[d['reviewerID'], d['itemID'], d['rating']]
                     for d in train_data]

# validation rows of [reviewerID, itemID, rating]
valid_rating_list = [[d['reviewerID'], d['itemID'], d['rating']]
                     for d in valid_data]

In [None]:
# 2.1 (version 1) average predictor: using the string-id rating lists
# alpha is the mean of the rating column (position 2) of the training rows
ratings = np.array([row[2] for row in train_rating_list])
alpha = np.mean(ratings)
print('alpha', alpha)

# mean squared error of the constant prediction on the validation rows
valid_ratings = np.array([row[2] for row in valid_rating_list])
squared_errors = (valid_ratings - alpha) ** 2.0
valid_mse = (1. / num_valid) * np.sum(squared_errors)
print('valid_mse', valid_mse)

In [None]:
# Two-way lookup tables: user_id <-> user_index and item_id <-> item_index.

# every distinct item id and user id in the data, sorted; the position of an
# id in its sorted list is its index
item_ids = sorted({d['itemID'] for d in all_data})
user_ids = sorted({d['reviewerID'] for d in all_data})

# user and item numbers
num_items = len(item_ids)
num_users = len(user_ids)

# id -> index and index -> id for items
item_id_map_index = {item_id: index for index, item_id in enumerate(item_ids)}
item_index_map_id = dict(enumerate(item_ids))

# id -> index and index -> id for users
user_id_map_index = {user_id: index for index, user_id in enumerate(user_ids)}
user_index_map_id = dict(enumerate(user_ids))

# training rows of [user_index, item_index, rating], cast to int
train_rating_array = np.array(
    [[user_id_map_index[d['reviewerID']],
      item_id_map_index[d['itemID']],
      d['rating']] for d in train_data]).astype(int)

# validation rows of [user_index, item_index, rating], cast to int
valid_rating_array = np.array(
    [[user_id_map_index[d['reviewerID']],
      item_id_map_index[d['itemID']],
      d['rating']] for d in valid_data]).astype(int)

In [None]:
# 2.1 (version 2) average predictor (using the index-based rating arrays)
# alpha is the mean of the rating column of the training array
alpha = train_rating_array[:, 2].mean()
print('alpha', alpha)

# mean squared error of the constant prediction on the validation array
valid_ratings = valid_rating_array[:, 2]
squared_errors = (valid_ratings - alpha) ** 2.0
valid_mse = (1. / num_valid) * np.sum(squared_errors)
print('valid_mse', valid_mse)

In [None]:
# Nested rating lookups built from the training array:
#   Ruis[user_index][item_index] = rating
#   Rius[item_index][user_index] = rating
# Iu = Ruis[user_index].keys()  # items reviewed by user u
# Ui = Rius[item_index].keys()  # users who reviewed item i
Ruis = defaultdict(dict)
Rius = defaultdict(dict)

for user_index, item_index, rating in train_rating_array:
    Ruis[user_index][item_index] = rating
    Rius[item_index][user_index] = rating

In [None]:
# Starting point for the bias model: zero offset, one normally distributed
# (mean 0, std 0.5) bias per user and per item, and regulariser lam = 1.
alpha = 0.0
beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))
lam = 1.0

In [None]:
# 2.2 fit baseline model
# rating(u, i) = alpha + beta_u + beta_i
def get_valid_mse(lam, alpha, beta_us, beta_is, train_rating_array, valid_rating_array):
    """Mean squared error of the bias model on ``valid_rating_array``.

    The prediction for a row ``[user_index, item_index, rating]`` is
    ``alpha + beta_us[user_index] + beta_is[item_index]``.

    Args:
        lam: unused; kept so all model helpers share the same leading arguments.
        alpha: global offset.
        beta_us: 1-D array of user biases, indexed by user_index.
        beta_is: 1-D array of item biases, indexed by item_index.
        train_rating_array: unused.
        valid_rating_array: 2-D integer array with columns
            ``[user_index, item_index, rating]``.

    Returns:
        The average squared prediction error over the rows of
        ``valid_rating_array``.
    """
    predicts = alpha + beta_us[valid_rating_array[:, 0]] + beta_is[valid_rating_array[:, 1]]
    ratings = valid_rating_array[:, 2].astype(float)
    # Normalise by the number of rows actually scored rather than a notebook
    # global, so the result is correct for any array passed in.
    return np.sum((predicts - ratings) ** 2.0) / float(len(valid_rating_array))

def get_cost(lam, alpha, beta_us, beta_is, train_rating_array):
    """Regularised training objective of the bias model.

    Returns the sum of squared errors of
    ``alpha + beta_us[user_index] + beta_is[item_index]`` over the rows
    ``[user_index, item_index, rating]`` of ``train_rating_array``, plus
    ``lam`` times the sum of squares of all user and item biases.
    """
    user_idx = train_rating_array[:, 0]
    item_idx = train_rating_array[:, 1]
    targets = train_rating_array[:, 2].astype(float)

    residuals = (alpha + beta_us[user_idx] + beta_is[item_idx]) - targets
    squared_error = np.sum(residuals ** 2.)
    penalty = lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))
    return squared_error + penalty
    
def alpha_update(lam, alpha, beta_us, beta_is, train_rating_array):
    """Closed-form update of the global offset ``alpha``.

    The new value is the average over all training rows of
    ``rating - beta_us[user_index] - beta_is[item_index]``.

    Args:
        lam: unused; kept for a uniform helper signature.
        alpha: unused (the current offset does not enter its own update).
        beta_us: 1-D array of user biases, indexed by user_index.
        beta_is: 1-D array of item biases, indexed by item_index.
        train_rating_array: 2-D integer array with columns
            ``[user_index, item_index, rating]``.

    Returns:
        The updated offset as a float.
    """
    sum_Rui = np.sum(train_rating_array[:, 2])
    sum_beta_u = np.sum(beta_us[train_rating_array[:, 0]])  # fancy indexing
    sum_beta_i = np.sum(beta_is[train_rating_array[:, 1]])  # fancy indexing
    # Divide by the number of rows that were summed instead of a notebook
    # global, so the average is right for whatever array is passed in.
    return (sum_Rui - sum_beta_u - sum_beta_i) / float(len(train_rating_array))

def beta_us_update(lam, alpha, beta_us, beta_is, train_rating_array):
    """Closed-form update of every user bias.

    For each user u the new bias is::

        sum over u's training rows of (rating - alpha - beta_is[item_index])
        ------------------------------------------------------------------
                         lam + (number of u's training rows)

    The sums are taken directly over the rows of ``train_rating_array`` (the
    same rows ``get_cost`` sums over), so the function does not depend on any
    notebook-level lookup tables. A repeated (user, item) row is counted once
    per occurrence.

    Args:
        lam: regularisation strength.
        alpha: global offset.
        beta_us: current user biases; only its length (number of users) is used.
        beta_is: 1-D array of item biases, indexed by item_index.
        train_rating_array: 2-D array with columns
            ``[user_index, item_index, rating]``.

    Returns:
        A new float array with one bias per user. Users without any training
        row get ``0.0`` (also when ``lam`` is 0, where the ratio is undefined).
    """
    user_idx = train_rating_array[:, 0].astype(int)
    item_idx = train_rating_array[:, 1].astype(int)
    n_users = len(beta_us)

    # per-row residual once the offset and the item bias are removed
    residuals = train_rating_array[:, 2] - alpha - beta_is[item_idx]

    # group the residuals and the row counts by user in one pass each
    residual_sums = np.bincount(user_idx, weights=residuals, minlength=n_users)
    rating_counts = np.bincount(user_idx, minlength=n_users)

    denominators = lam + rating_counts
    new_beta_us = np.zeros(n_users, dtype=float)
    # skip entries whose denominator is 0 (lam == 0 and no ratings): keep 0.0
    np.divide(residual_sums, denominators, out=new_beta_us, where=denominators != 0)
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, train_rating_array):
    """Closed-form update of every item bias.

    For each item i the new bias is::

        sum over i's training rows of (rating - alpha - beta_us[user_index])
        ------------------------------------------------------------------
                         lam + (number of i's training rows)

    The sums are taken directly over the rows of ``train_rating_array`` (the
    same rows ``get_cost`` sums over), so the function does not depend on any
    notebook-level lookup tables. A repeated (user, item) row is counted once
    per occurrence.

    Args:
        lam: regularisation strength.
        alpha: global offset.
        beta_us: 1-D array of user biases, indexed by user_index.
        beta_is: current item biases; only its length (number of items) is used.
        train_rating_array: 2-D array with columns
            ``[user_index, item_index, rating]``.

    Returns:
        A new float array with one bias per item. Items without any training
        row get ``0.0`` (also when ``lam`` is 0, where the ratio is undefined).
    """
    user_idx = train_rating_array[:, 0].astype(int)
    item_idx = train_rating_array[:, 1].astype(int)
    n_items = len(beta_is)

    # per-row residual once the offset and the user bias are removed
    residuals = train_rating_array[:, 2] - alpha - beta_us[user_idx]

    # group the residuals and the row counts by item in one pass each
    residual_sums = np.bincount(item_idx, weights=residuals, minlength=n_items)
    rating_counts = np.bincount(item_idx, minlength=n_items)

    denominators = lam + rating_counts
    new_beta_is = np.zeros(n_items, dtype=float)
    # skip entries whose denominator is 0 (lam == 0 and no ratings): keep 0.0
    np.divide(residual_sums, denominators, out=new_beta_is, where=denominators != 0)
    return new_beta_is

In [None]:
# Fit the bias model with lam = 1 by running the three updates in turn
# (alpha, then user biases, then item biases) for a fixed number of rounds.
lam, max_iter = 1.0, 30

# random starting point
alpha = 0.0
beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))

for i in xrange(max_iter):
    alpha = alpha_update(lam, alpha, beta_us, beta_is, train_rating_array)
    beta_us = beta_us_update(lam, alpha, beta_us, beta_is, train_rating_array)
    beta_is = beta_is_update(lam, alpha, beta_us, beta_is, train_rating_array)

# training objective, then validation MSE
train_cost = get_cost(lam, alpha, beta_us, beta_is, train_rating_array)
print(train_cost)
print(get_valid_mse(lam, alpha, beta_us, beta_is, train_rating_array, valid_rating_array))

In [None]:
# 2.3 report the user id and item id with the largest and smallest bias
extreme_reports = [
    ('user, largest', user_index_map_id, np.argmax(beta_us)),
    ('user, smallest', user_index_map_id, np.argmin(beta_us)),
    ('item, largest', item_index_map_id, np.argmax(beta_is)),
    ('item, smallest', item_index_map_id, np.argmin(beta_is)),
]
for label, index_map_id, extreme_index in extreme_reports:
    # translate the array position back to the original string id
    print(label, index_map_id[extreme_index])

In [None]:
# 2.4 search for the best lam
def train_and_eval(lam, max_iter):
    """Fit the bias model from a random start and score it.

    Starts from ``alpha = 0`` and normally distributed (mean 0, std 0.5) user
    and item biases, then runs ``max_iter`` rounds of the alpha / user-bias /
    item-bias updates on the notebook's ``train_rating_array``.

    Returns:
        A tuple ``(cost, mse, alpha, beta_us, beta_is)`` where ``cost`` is the
        training objective from ``get_cost`` and ``mse`` is the validation
        error from ``get_valid_mse``.
    """
    alpha = 0.0
    beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
    beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))

    rounds_done = 0
    while rounds_done < max_iter:
        alpha = alpha_update(lam, alpha, beta_us, beta_is, train_rating_array)
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, train_rating_array)
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, train_rating_array)
        rounds_done += 1

    return (
        get_cost(lam, alpha, beta_us, beta_is, train_rating_array),
        get_valid_mse(lam, alpha, beta_us, beta_is, train_rating_array, valid_rating_array),
        alpha,
        beta_us,
        beta_is,
    )

In [None]:
# Try each candidate lam with a short fit and record
# [lam, training cost, validation mse] for comparison.
lams = [0.001, 0.01, 0.1, 1.0, 10, 100]
max_iter = 5

results = []
for lam in lams:
    fit = train_and_eval(lam, max_iter)
    cost, mse = fit[0], fit[1]
    print(lam, cost, mse)
    results.append([lam, cost, mse])

In [None]:
# 2.4 refit with lam = 1 for 30 rounds; the resulting alpha / beta_us / beta_is
# are the parameters used for the test-set predictions
lam, max_iter = 1.0, 30
cost, mse, alpha, beta_us, beta_is = train_and_eval(lam, max_iter)

In [None]:
# Read the header line and the "<user_id>-<item_id>" pairs to predict.
with open('pairs_Rating.txt') as f:
    # read and strip lines
    lines = [l.strip() for l in f]

# strip out the header
header_str = lines.pop(0)

# list of [user_id, item_id]; blank lines (e.g. a trailing newline at the end
# of the file) are skipped so they cannot break the two-way unpacking below
user_item_ids = [l.split('-') for l in lines if l]

# Write one "<user_id>-<item_id>,<rating>" line per pair. The context manager
# makes sure the output is flushed and closed even if a prediction fails.
with open('predictions_Rating.txt', 'w') as f:
    print(header_str, file=f)

    for user_id, item_id in user_item_ids:
        # an id missing from the lookup tables has no learned bias; it
        # contributes nothing, so the prediction falls back towards alpha
        user_index = user_id_map_index.get(user_id)
        item_index = item_id_map_index.get(item_id)

        rating = alpha
        if user_index is not None:
            rating = rating + beta_us[user_index]
        if item_index is not None:
            rating = rating + beta_is[item_index]

        # clamp the prediction to the range [0, 5]
        rating = min(5.0, rating)
        rating = max(0.0, rating)
        print('%s-%s,%s' % (user_id, item_id, rating), file=f)