In [1]:
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from __future__ import print_function
from collections import defaultdict

# Directory that holds the data files used by this notebook.
# The trailing slash is required: the load cell builds the file path by plain
# string concatenation (data_root + "all_data.pickle").
# NOTE(review): absolute, machine-specific path - adjust for your environment.
data_root = '/home/linuxthink/data/CSE255/'

In [2]:
# load raw data and print how many seconds the load took
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
start_time = time.time()
# the with-block closes the file handle even if unpickling raises
with open(data_root + "all_data.pickle", "rb") as pickle_file:
    all_data = pickle.load(pickle_file)
print(time.time() - start_time)

17.4010329247


In [3]:
# split all_data: the first num_train records are used for training,
# everything after them for validation
num_train, num_valid = 900000, 100000
num_all = len(all_data)
# the two sizes must account for every loaded record
assert num_all == num_train + num_valid

valid_data = all_data[num_train:]
train_data = all_data[:num_train]

In [4]:
# pre-process 0: build id <-> index infrastructure

# every distinct item id / user id found in all_data, in sorted order;
# the position of an id in its sorted list is that id's index
item_ids = sorted(set(d['itemID'] for d in all_data))
user_ids = sorted(set(d['reviewerID'] for d in all_data))

# number of distinct items and users
num_items, num_users = len(item_ids), len(user_ids)

# item lookups in both directions (id -> index and index -> id)
item_id_map_index = {item_id: index for index, item_id in enumerate(item_ids)}
item_index_map_id = dict(enumerate(item_ids))

# user lookups in both directions (id -> index and index -> id)
user_id_map_index = {user_id: index for index, user_id in enumerate(user_ids)}
user_index_map_id = dict(enumerate(user_ids))

In [5]:
# pre-process 1: build train_rating_array, valid_rating_array, all_rating_array

def _build_rating_array(records):
    """Convert review records into an integer array of rating triples.

    Each record is a dict read through its 'reviewerID', 'itemID' and 'rating'
    keys. The ids are translated to indices with user_id_map_index and
    item_id_map_index.

    Returns an int array of shape (len(records), 3) whose rows are
    [user_index, item_index, rating]. The cast to int truncates any
    fractional part of a rating.
    """
    rows = [
        [user_id_map_index[d['reviewerID']], item_id_map_index[d['itemID']], d['rating']]
        for d in records
    ]
    # reshape keeps the result two-dimensional even when records is empty,
    # so column slicing such as array[:, 2] keeps working
    return np.array(rows).astype(int).reshape(-1, 3)


# one array per data split, all built by the same helper
train_rating_array = _build_rating_array(train_data)
valid_rating_array = _build_rating_array(valid_data)
all_rating_array = _build_rating_array(all_data)

In [6]:
# pre-process 2: utility and update functions
def get_valid_mse(lam, alpha, beta_us, beta_is, rating_array, valid_rating_array):
    """Mean squared error of the bias model on the validation ratings.

    Every row of valid_rating_array is read as [user_index, item_index, rating]
    and predicted as alpha + beta_us[user_index] + beta_is[item_index].
    lam and rating_array are accepted but not used.
    """
    user_col = valid_rating_array[:, 0]
    item_col = valid_rating_array[:, 1]
    targets = valid_rating_array[:, 2].astype(float)
    errors = (alpha + beta_us[user_col] + beta_is[item_col]) - targets
    num_rows = valid_rating_array.shape[0]
    return (1. / num_rows) * np.sum(errors ** 2.0)

def get_cost(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Regularised training objective of the bias model.

    Returns the sum of squared prediction errors over the rows of rating_array
    (each read as [user_index, item_index, rating]) plus lam times the sum of
    the squared user and item biases. Ruis and Rius are accepted but not used.
    """
    user_col = rating_array[:, 0]
    item_col = rating_array[:, 1]
    targets = rating_array[:, 2].astype(float)
    errors = (alpha + beta_us[user_col] + beta_is[item_col]) - targets
    squared_error = np.sum(errors ** 2.)
    penalty = lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))
    return squared_error + penalty
    
def alpha_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Return the updated global offset alpha.

    The new value is the average, over all rows of rating_array, of
    rating - beta_u - beta_i for the row's user and item.
    lam, the current alpha, Ruis and Rius are accepted but not used.
    """
    num_ratings = rating_array.shape[0]
    # one bias value per rating row, gathered by integer-array indexing
    user_bias_total = np.sum(beta_us[rating_array[:, 0]])
    item_bias_total = np.sum(beta_is[rating_array[:, 1]])
    rating_total = np.sum(rating_array[:, 2])
    residual_total = rating_total - user_bias_total - item_bias_total
    return residual_total / num_ratings

def beta_us_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Return a new array of user biases, one per entry of beta_us.

    For each user u with rated items I_u (the keys of Ruis[u]) the update is

        beta_u = sum_{i in I_u} (R_ui - alpha - beta_i) / (lam + |I_u|)

    Parameters
    ----------
    lam : regularisation strength added to the denominator.
    alpha : current global offset.
    beta_us : current user biases; only its length and dtype are used.
    beta_is : current item biases, indexed by item index.
    rating_array, Rius : accepted for a uniform signature, not used.
    Ruis : mapping user_index -> {item_index: rating}.

    Users that have no entry (or an empty entry) in Ruis get a bias of 0.
    Ruis is never modified, and the inputs are not changed in place.
    """
    new_beta_us = np.zeros_like(beta_us)
    # Size the loop from the argument itself rather than from a notebook
    # global, so the function works for any input (range runs on Python 2 and 3).
    for user_index in range(beta_us.shape[0]):
        # .get() does not insert a new empty entry when Ruis is a defaultdict
        item_ratings = Ruis.get(user_index)
        if not item_ratings:
            # No ratings: the numerator is 0, so the bias stays 0. Skipping also
            # avoids a division by zero when lam == 0.
            continue
        # [the set of items] reviewed by user u, as an integer index array
        Iu = np.array(list(item_ratings.keys()), dtype=int)
        Iu_size = len(Iu)
        # sums
        sum_Rui = np.sum(list(item_ratings.values()))
        sum_alpha = Iu_size * alpha
        sum_beta_i = np.sum(beta_is[Iu])
        # write result
        new_beta_us[user_index] = float(sum_Rui - sum_alpha - sum_beta_i) / (lam + Iu_size)
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Return a new array of item biases, one per entry of beta_is.

    For each item i with rating users U_i (the keys of Rius[i]) the update is

        beta_i = sum_{u in U_i} (R_ui - alpha - beta_u) / (lam + |U_i|)

    Parameters
    ----------
    lam : regularisation strength added to the denominator.
    alpha : current global offset.
    beta_us : current user biases, indexed by user index.
    beta_is : current item biases; only its length and dtype are used.
    rating_array, Ruis : accepted for a uniform signature, not used.
    Rius : mapping item_index -> {user_index: rating}.

    Items that have no entry (or an empty entry) in Rius get a bias of 0.
    Rius is never modified, and the inputs are not changed in place.
    """
    new_beta_is = np.zeros_like(beta_is)
    # Size the loop from the argument itself rather than from a notebook
    # global, so the function works for any input (range runs on Python 2 and 3).
    for item_index in range(beta_is.shape[0]):
        # .get() does not insert a new empty entry when Rius is a defaultdict
        user_ratings = Rius.get(item_index)
        if not user_ratings:
            # No ratings: the numerator is 0, so the bias stays 0. Skipping also
            # avoids a division by zero when lam == 0.
            continue
        # [the set of users] who reviewed item i, as an integer index array
        Ui = np.array(list(user_ratings.keys()), dtype=int)
        Ui_size = len(Ui)
        # sums
        sum_Rui = np.sum(list(user_ratings.values()))
        sum_alpha = Ui_size * alpha
        sum_beta_u = np.sum(beta_us[Ui])
        # write result
        new_beta_is[item_index] = float(sum_Rui - sum_alpha - sum_beta_u) / (lam + Ui_size)
    return new_beta_is

def train_and_eval(max_iter,
                   lam, alpha, beta_us, beta_is,
                   rating_array, valid_rating_array,
                   print_step = False):
    """Fit the bias model by repeated update sweeps and score the result.

    Each of the max_iter sweeps updates alpha, then the user biases, then the
    item biases, every step using the values produced by the previous one.
    With print_step set, the sweep number, training cost and validation MSE
    are printed after every sweep.

    Returns (cost, valid_mse, alpha, beta_us, beta_is) for the final parameters.
    """
    # Nested rating lookups built from the rows of rating_array:
    #   Ruis[user_index] -> {item_index: rating}   (items reviewed by a user)
    #   Rius[item_index] -> {user_index: rating}   (users who reviewed an item)
    Ruis = defaultdict(dict)
    Rius = defaultdict(dict)
    for row in rating_array:
        user_index, item_index, rating = row[0], row[1], row[2]
        Ruis[user_index][item_index] = rating
        Rius[item_index][user_index] = rating

    def _score(cur_alpha, cur_beta_us, cur_beta_is):
        # training cost first, then validation MSE, for the given parameters
        return (get_cost(lam, cur_alpha, cur_beta_us, cur_beta_is, rating_array, Ruis, Rius),
                get_valid_mse(lam, cur_alpha, cur_beta_us, cur_beta_is,
                              rating_array, valid_rating_array))

    # train on this dataset
    for i in xrange(max_iter):
        alpha = alpha_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius)
        if print_step:
            cost, valid_mse = _score(alpha, beta_us, beta_is)
            print(i, cost, valid_mse)

    # final scores for the parameters that are returned
    cost, valid_mse = _score(alpha, beta_us, beta_is)
    return (cost, valid_mse, alpha, beta_us, beta_is)

In [7]:
# 3.5 average predictor: predict the mean training rating for every pair
alpha = train_rating_array[:, 2].mean()
print('alpha', alpha)

# mean squared error of that constant prediction on the validation ratings
valid_ratings = valid_rating_array[:, 2]
squared_errors = (valid_ratings - alpha) ** 2.0
valid_mse = (1. / valid_rating_array.shape[0]) * np.sum(squared_errors)
print('valid_mse', valid_mse)

alpha 4.21898777778
valid_mse 0.969062751573


In [8]:
# 3.6 fit baseline model: rating(u, i) = alpha + beta_u + beta_i

# training settings
max_iter = 30   # number of update sweeps
lam = 1.0       # regularisation strength

# starting point: zero offset and random biases drawn from N(0, 0.5)
alpha = 0.0
beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))

# fit on the training split, score on the validation split
fit = train_and_eval(max_iter,
                     lam, alpha, beta_us, beta_is,
                     train_rating_array, valid_rating_array)
cost, valid_mse, alpha, beta_us, beta_is = fit
print(cost, valid_mse)

541295.851161 0.696104838292


In [9]:
# 3.7 report the user and item id that have the largest and smallest values of beta

# positions of the extreme biases, translated back to ids when printed
largest_user_index = np.argmax(beta_us)
smallest_user_index = np.argmin(beta_us)
print('user, largest', user_index_map_id[largest_user_index])
print('user, smallest', user_index_map_id[smallest_user_index])

largest_item_index = np.argmax(beta_is)
smallest_item_index = np.argmin(beta_is)
print('item, largest', item_index_map_id[largest_item_index])
print('item, smallest', item_index_map_id[smallest_item_index])

user, largest U516357151
user, smallest U512598315
item, largest I245219975
item, smallest I502194676


In [10]:
# 3.8 search for the best lam
max_iter = 5
lams = [0.001, 0.01, 0.1, 1.0, 10, 100]

# Starting point shared by every run. train_and_eval rebinds its parameters to
# newly built arrays instead of editing them, so each lam starts from these
# same initial values.
alpha = 0.0
beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))

# one [lam, cost, mse] entry per candidate value
results = []
for lam in lams:
    outcome = train_and_eval(max_iter,
                             lam, alpha, beta_us, beta_is,
                             train_rating_array, valid_rating_array,
                             print_step=True)
    # only the two scores are kept; the fitted parameters are discarded
    cost, mse = outcome[0], outcome[1]
    print(lam, cost, mse)
    results.append([lam, cost, mse])

0 544098.523595 0.723812782094
1 526574.000267 0.703764970551
2 525679.849498 0.702909347648
3 525454.616149 0.702652242602
4 525348.801929 0.702517961378
0.001 525348.801929 0.702517961378
0 544230.811429 0.723719078081


KeyboardInterrupt: 

In [11]:
# now train on all data
# Note: all_rating_array contains the validation rows as well, so the mse
# printed by this cell is not a held-out estimate.

# training settings
max_iter = 30
lam = 1.0

# starting point: zero offset and random biases drawn from N(0, 0.5)
alpha = 0.0
beta_us = np.random.normal(loc=0, scale=0.5, size=(num_users,))
beta_is = np.random.normal(loc=0, scale=0.5, size=(num_items,))

fit = train_and_eval(max_iter,
                     lam, alpha, beta_us, beta_is,
                     all_rating_array, valid_rating_array,
                     print_step=True)
cost, mse, alpha, beta_us, beta_is = fit
print(cost, mse)

0 623274.998516 0.61471460324
1 606265.972085 0.596871920416
2 605575.176154 0.595873487936
3 605447.240888 0.595636939027
4 605408.59391 0.595542244584
5 605393.742056 0.595492402759
6 605386.823703 0.595461370772
7 605382.979792 0.595439914564
8 605380.504561 0.595424109402
9 605378.727325 0.595412020918
10 605377.352458 0.595402563455
11 605376.233816 0.59539505853
12 605375.29123 0.595389046393
13 605374.476847 0.595384197429
14 605373.760215 0.595380266238
15 605373.121008 0.595377065484
16 605372.545122 0.59537444973
17 605372.022439 0.595372304738
18 605371.545466 0.595370540022
19 605371.10848 0.59536908346
20 605370.706976 0.595367877288
21 605370.337304 0.595366875065
22 605369.996429 0.595366039332
23 605369.681769 0.595365339799
24 605369.391083 0.595364751913
25 605369.122397 0.595364255735
26 605368.87395 0.59536383504
27 605368.644153 0.595363476609
28 605368.431566 0.59536316965
29 605368.234875 0.595362905344
605368.234875 0.595362905344


In [12]:
# get header_str and user_item_ids to predict
with open('pairs_Rating.txt') as pairs_file:
    # read and strip lines
    lines = [line.strip() for line in pairs_file]
# strip out the header
header_str = lines.pop(0)
# get a list of [user_id, item_id] pairs; blank lines (for example an empty
# last line) are skipped because they cannot be unpacked into a pair
user_item_ids = [line.split('-') for line in lines if line]

# write to output file; the with-block closes (and flushes) the file even if
# a prediction fails part-way through
with open('predictions_Rating.txt', 'w') as f:
    print(header_str, file=f)
    for user_id, item_id in user_item_ids:
        # An id that is missing from the maps has no learned bias; use 0 for
        # it so the prediction falls back on the remaining terms instead of
        # raising KeyError in the middle of writing the file.
        user_index = user_id_map_index.get(user_id)
        item_index = item_id_map_index.get(item_id)
        beta_u = beta_us[user_index] if user_index is not None else 0.0
        beta_i = beta_is[item_index] if item_index is not None else 0.0
        rating = alpha + beta_u + beta_i
        # clamp the prediction to the interval [0.0, 5.0]
        rating = min(5.0, rating)
        rating = max(0.0, rating)
        print('%s-%s,%s' % (user_id, item_id, rating), file=f)