In [1]:
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from __future__ import print_function
from collections import defaultdict

# Directory prefix for input data; file names such as "all_data.pickle" are
# appended to it with `+`, so the trailing slash is required.
# NOTE(review): hard-coded absolute path -- adjust before running elsewhere.
data_root = '/home/linuxthink/data/CSE255/'

In [2]:
# load raw data
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file
# that comes from a trusted source.
start_time = time.time()
# `with` guarantees the file handle is closed even if unpickling fails
with open(data_root + "all_data.pickle", "rb") as pickle_file:
    all_data = pickle.load(pickle_file)
# elapsed load time in seconds
print(time.time() - start_time)

17.6534459591


In [3]:
# get train and test set
num_train, num_valid = 900000, 100000
num_all = len(all_data)
# the two split sizes must account for every record
assert num_all == num_train + num_valid

# the first num_train records are used for fitting, the remainder for validation
train_data, valid_data = all_data[:num_train], all_data[num_train:]

In [4]:
# pre-process 0: build id <-> index infrastructure

# distinct item / user ids, in sorted order so that indices are deterministic
item_ids = sorted({d['itemID'] for d in all_data})
user_ids = sorted({d['reviewerID'] for d in all_data})

# user and item numbers
num_items, num_users = len(item_ids), len(user_ids)

# lookup tables in both directions: id -> position in the sorted list, and back
item_id_map_index = dict(zip(item_ids, range(num_items)))
item_index_map_id = dict(enumerate(item_ids))

user_id_map_index = dict(zip(user_ids, range(num_users)))
user_index_map_id = dict(enumerate(user_ids))

In [5]:
# pre-process 1: build train_rating_array, valid_rating_array, all_rating_array

def _build_rating_array(data):
    """Turn review records into an integer array of [user_index, item_index, rating] rows.

    Each record is a mapping with the keys 'reviewerID', 'itemID' and 'rating';
    the two ids are translated through user_id_map_index / item_id_map_index.
    The result is cast with astype(int), so a fractional rating would be
    truncated. The reshape keeps the (n, 3) layout even when `data` is empty.
    """
    rows = [[user_id_map_index[d['reviewerID']],
             item_id_map_index[d['itemID']],
             d['rating']]
            for d in data]
    return np.array(rows).astype(int).reshape(-1, 3)

# one array per split, all built by the same helper instead of copy-pasted loops
train_rating_array = _build_rating_array(train_data)
valid_rating_array = _build_rating_array(valid_data)
all_rating_array = _build_rating_array(all_data)

In [6]:
# pre-process 2: utility and update functions
def get_valid_mse(lam, alpha, beta_us, beta_is, rating_array, valid_rating_array):
    """Mean squared error of the bias model on valid_rating_array.

    Every row of valid_rating_array is read as [user_index, item_index, rating]
    and predicted as alpha + beta_us[user_index] + beta_is[item_index].
    lam and rating_array are accepted but not used.
    """
    user_idx = valid_rating_array[:, 0]
    item_idx = valid_rating_array[:, 1]
    targets = valid_rating_array[:, 2].astype(float)
    estimates = alpha + beta_us[user_idx] + beta_is[item_idx]
    num_rows = valid_rating_array.shape[0]
    return (1. / num_rows) * np.sum((estimates - targets) ** 2.0)

def get_cost(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Regularised training objective of the bias model.

    Returns the sum of squared prediction errors over the rows of rating_array
    ([user_index, item_index, rating]) plus lam times the squared L2 norm of
    both bias vectors. Ruis and Rius are accepted but not used.
    """
    targets = rating_array[:, 2].astype(float)
    estimates = alpha + beta_us[rating_array[:, 0]] + beta_is[rating_array[:, 1]]
    squared_error = np.sum((estimates - targets) ** 2.)
    penalty = lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))
    return squared_error + penalty
    
def alpha_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Closed-form update of the global offset alpha with both bias vectors fixed.

    Returns the average, over all rows of rating_array, of
    rating - beta_us[user_index] - beta_is[item_index].
    lam, the incoming alpha, Ruis and Rius are accepted but not used.
    """
    num_ratings = rating_array.shape[0]
    rating_total = np.sum(rating_array[:, 2])
    # one bias term per rating row, gathered by integer-array indexing
    user_bias_total = np.sum(beta_us[rating_array[:, 0]])
    item_bias_total = np.sum(beta_is[rating_array[:, 1]])
    return (rating_total - user_bias_total - item_bias_total) / num_ratings

def beta_us_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Closed-form update of every user bias with alpha and the item biases fixed.

    For a user u who rated the item set I_u:

        beta_u = sum_{i in I_u} (R_ui - alpha - beta_i) / (lam + |I_u|)

    Parameters
    ----------
    lam : regularisation strength added to the denominator.
    alpha : global offset.
    beta_us : current user biases; only its shape and dtype are used, and it
        must hold one entry per user index.
    beta_is : item biases, indexed by item index.
    rating_array : accepted for a uniform signature, not used.
    Ruis : mapping user_index -> {item_index: rating}.
    Rius : accepted for a uniform signature, not used.

    Returns
    -------
    A new array shaped like beta_us. Users without any rating in Ruis keep a
    bias of 0 (which also avoids a 0 / 0 division when lam is 0).
    """
    new_beta_us = np.zeros_like(beta_us)
    no_ratings = {}
    # the number of users comes from the argument, not from a notebook global
    for user_index in range(new_beta_us.shape[0]):
        # .get() avoids inserting empty entries when Ruis is a defaultdict
        ratings_by_item = Ruis.get(user_index, no_ratings)
        Iu_size = len(ratings_by_item)
        if Iu_size == 0:
            continue
        # [the set of items] reviewed by user u, as an integer index array
        Iu = np.array(list(ratings_by_item.keys()), dtype=np.intp)
        # sums
        sum_Rui = np.sum(list(ratings_by_item.values()))
        sum_alpha = Iu_size * alpha
        sum_beta_i = np.sum(beta_is[Iu])
        # write result
        new_beta_us[user_index] = float(sum_Rui - sum_alpha - sum_beta_i) / (lam + Iu_size)
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Closed-form update of every item bias with alpha and the user biases fixed.

    For an item i rated by the user set U_i:

        beta_i = sum_{u in U_i} (R_ui - alpha - beta_u) / (lam + |U_i|)

    Parameters
    ----------
    lam : regularisation strength added to the denominator.
    alpha : global offset.
    beta_us : user biases, indexed by user index.
    beta_is : current item biases; only its shape and dtype are used, and it
        must hold one entry per item index.
    rating_array : accepted for a uniform signature, not used.
    Ruis : accepted for a uniform signature, not used.
    Rius : mapping item_index -> {user_index: rating}.

    Returns
    -------
    A new array shaped like beta_is. Items without any rating in Rius keep a
    bias of 0 (which also avoids a 0 / 0 division when lam is 0).
    """
    new_beta_is = np.zeros_like(beta_is)
    no_ratings = {}
    # the number of items comes from the argument, not from a notebook global
    for item_index in range(new_beta_is.shape[0]):
        # .get() avoids inserting empty entries when Rius is a defaultdict
        ratings_by_user = Rius.get(item_index, no_ratings)
        Ui_size = len(ratings_by_user)
        if Ui_size == 0:
            continue
        # [the set of users] who reviewed item i, as an integer index array
        Ui = np.array(list(ratings_by_user.keys()), dtype=np.intp)
        # sums
        sum_Rui = np.sum(list(ratings_by_user.values()))
        sum_alpha = Ui_size * alpha
        sum_beta_u = np.sum(beta_us[Ui])
        # write result
        new_beta_is[item_index] = float(sum_Rui - sum_alpha - sum_beta_u) / (lam + Ui_size)
    return new_beta_is

def train_and_eval(max_iter, 
                   lam, alpha, beta_us, beta_is, 
                   rating_array, valid_rating_array,
                   print_step = False):
    """Fit the bias model by alternating closed-form updates and evaluate it.

    Each of the max_iter iterations updates alpha, then the user biases, then
    the item biases, each with the other parameters held fixed.

    Parameters
    ----------
    max_iter : number of update rounds.
    lam : regularisation strength passed to the update and cost functions.
    alpha, beta_us, beta_is : initial parameters; the names are rebound to the
        values returned by the update functions.
    rating_array : training rows of [user_index, item_index, rating].
    valid_rating_array : rows of the same layout used for the reported MSE.
    print_step : when true, print (iteration, cost, valid_mse) after every round.

    Returns
    -------
    (cost, valid_mse, alpha, beta_us, beta_is) for the final parameters.
    """
    # build the rating lookups in both directions
    #   Ruis[user_index] -> {item_index: rating}   (items reviewed by user u)
    #   Rius[item_index] -> {user_index: rating}   (users who reviewed item i)
    Ruis = defaultdict(dict)
    Rius = defaultdict(dict)
    for row in rating_array:
        user_index, item_index, rating = row[0], row[1], row[2]
        Ruis[user_index][item_index] = rating
        Rius[item_index][user_index] = rating

    # train on this dataset; range() works on both Python 2 and Python 3
    for i in range(max_iter):
        alpha = alpha_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        if print_step:
            cost = get_cost(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
            valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is,
                                      rating_array, valid_rating_array)
            print(i, cost, valid_mse)

    # final metrics (also covers max_iter == 0, where the loop never runs)
    cost = get_cost(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
    valid_mse = get_valid_mse(lam, alpha, beta_us, beta_is, rating_array, valid_rating_array)

    return (cost, valid_mse, alpha, beta_us, beta_is)

In [7]:
# 3.5 average predictor (using index based sorted list)
# the constant prediction is the mean rating of the training rows
alpha = train_rating_array[:, 2].mean()
print('alpha', alpha)

# mean squared error of that constant on the validation rows
valid_ratings = valid_rating_array[:, 2]
valid_mse = np.sum((valid_ratings - alpha) ** 2.0) * (1. / valid_rating_array.shape[0])
print('valid_mse', valid_mse)

alpha 4.21898777778
valid_mse 0.969062751573


In [8]:
# 3.6 fit baseline model: rating(u, i) = alpha + beta_u + beta_i

# set training
max_iter = 30

# parameters
lam = 1.0
alpha = 0.0
# seeded local generator so the random initialisation, and therefore the
# printed cost / MSE, is repeatable from run to run
init_rng = np.random.RandomState(0)
beta_us = init_rng.normal(0, 0.5, (num_users,))
beta_is = init_rng.normal(0, 0.5, (num_items,))

cost, valid_mse, alpha, beta_us, beta_is = train_and_eval(max_iter, 
                                                          lam, alpha, beta_us, beta_is, 
                                                          train_rating_array, valid_rating_array)
# final training cost and validation MSE
print(cost, valid_mse)

541296.35229 0.696105601655


In [9]:
# 3.7 report the user and item id that have the largest and smallest values of beta

# (label, index -> id table, position of the extreme bias), in print order
extreme_reports = [
    ('user, largest', user_index_map_id, np.argmax(beta_us)),
    ('user, smallest', user_index_map_id, np.argmin(beta_us)),
    ('item, largest', item_index_map_id, np.argmax(beta_is)),
    ('item, smallest', item_index_map_id, np.argmin(beta_is)),
]
for label, index_to_id, extreme_index in extreme_reports:
    print(label, index_to_id[extreme_index])

user, largest U516357151
user, smallest U512598315
item, largest I245219975
item, smallest I502194676


In [14]:
# 3.8 search for the best lam
lams = [0.001, 0.01, 0.1, 1.0, 10, 100]
max_iter = 5

# init variables
alpha = 0.0
# seeded local generator so the search is repeatable from run to run
init_rng = np.random.RandomState(0)
beta_us = np.asarray(init_rng.normal(0, 0.5, (num_users,)))
beta_is = np.asarray(init_rng.normal(0, 0.5, (num_items,)))

# every lam starts from the same alpha / beta_us / beta_is: the fitted
# parameters returned by train_and_eval are discarded here
results = []
for lam in lams:
    cost, mse, _, _, _ = train_and_eval(max_iter, 
                                        lam, alpha, beta_us, beta_is, 
                                        train_rating_array, valid_rating_array,
                                        print_step=True)
    print(lam, cost, mse)
    results.append([lam, cost, mse])

# pick the lam with the lowest validation MSE instead of reading it off the log
best_lam, best_cost, best_mse = min(results, key=lambda r: r[2])
print('best lam', best_lam, 'valid_mse', best_mse)

0 543896.54433 0.723961695045
1 526572.731338 0.70393245252
2 525668.626963 0.702914565381
3 525438.612197 0.702628598207
4 525333.533065 0.702493376457
0.001 525333.533065 0.702493376457
0 544029.732673 0.723867236496
1 526724.287813 0.703853891159
2 525824.308863 0.702835321306
3 525596.466341 0.702549130787
4 525492.898336 0.702413849679
0.01 525492.898336 0.702413849679
0 545351.452044 0.722947810314
1 528226.56025 0.703093833002
2 527366.068988 0.702069722033
3 527158.548716 0.701782183375
4 527068.879616 0.701647061074
0.1 527068.879616 0.701647061074
0 557644.914298 0.715816342586
1 542059.664452 0.697566930949
2 541460.043697 0.696577855493
3 541358.998663 0.696331851468
4 541330.796481 0.69623980295
1.0 541330.796481 0.69623980295
0 631988.894214 0.712959217448
1 622444.409306 0.705048976336
2 622082.110368 0.704993200664
3 621988.45544 0.705102238726
4 621954.555876 0.705171230736
10 621954.555876 0.705171230736
0 773136.843633 0.83047997195
1 770869.120736 0.831323486804
2 7

In [11]:
# now train on all data
# NOTE: all_rating_array is built from all_data, which includes the validation
# rows, so the MSE printed below is measured on data the model was fitted on.
# It is not comparable to the held-out MSE of the earlier cells.
max_iter = 30

# init variables
lam = 1.0
alpha = 0.0
# seeded local generator so the final fit is repeatable from run to run
init_rng = np.random.RandomState(0)
beta_us = init_rng.normal(0, 0.5, (num_users,))
beta_is = init_rng.normal(0, 0.5, (num_items,))

cost, mse, alpha, beta_us, beta_is = train_and_eval(max_iter, 
                                                    lam, alpha, beta_us, beta_is, 
                                                    all_rating_array, valid_rating_array,
                                                    print_step=True)
print(cost, mse)

0 622816.089258 0.613274152121
1 606236.848759 0.59678518209
2 605576.314146 0.595889007411
3 605459.396885 0.595666116224
4 605422.610479 0.595570767595
5 605406.385868 0.595516986107
6 605397.36772 0.595481607055
7 605391.546075 0.595456252701
8 605387.424419 0.595437189332
9 605384.32894 0.595422454129
10 605381.906159 0.595410872406
11 605379.949232 0.595401670881
12 605378.328211 0.595394306019
13 605376.957651 0.595388378804
14 605375.779558 0.595383587824
15 605374.753551 0.59537970101
16 605373.850839 0.595376537341
17 605373.050392 0.5953739543
18 605372.336456 0.595371838929
19 605371.69692 0.595370101236
20 605371.122221 0.595368669248
21 605370.604625 0.595367485215
22 605370.137722 0.595366502686
23 605369.716101 0.595365684216
24 605369.335106 0.595364999567
25 605368.990684 0.595364424281
26 605368.679261 0.59536393855
27 605368.397663 0.595363526308
28 605368.143051 0.595363174515
29 605367.912873 0.595362872568
605367.912873 0.595362872568


In [13]:
# get header_str and user_item_ids to predict
with open('pairs_Rating.txt') as f:
    # read and strip lines
    lines = [l.strip() for l in f.readlines()]
# strip out the header
header_str = lines.pop(0)
# get a list of [user_id, item_id] pairs; blank lines (e.g. a trailing newline
# at the end of the file) are skipped so that unpacking below cannot fail on them
user_item_ids = [l.split('-') for l in lines if l]

# write to output file; `with` closes it even if a prediction raises
with open('predictions_Rating.txt', 'w') as f:
    print(header_str, file=f)
    for user_id, item_id in user_item_ids:
        # an id that is missing from the index maps has no fitted bias, so it
        # contributes nothing and the prediction falls back towards alpha
        rating = alpha
        if user_id in user_id_map_index:
            rating = rating + beta_us[user_id_map_index[user_id]]
        if item_id in item_id_map_index:
            rating = rating + beta_is[item_id_map_index[item_id]]
        # clip the prediction to the range [0.0, 5.0]
        rating = min(5.0, rating)
        rating = max(0.0, rating)
        print('%s-%s,%s' % (user_id, item_id, rating), file=f)