In [2]:
from collections import defaultdict
import numpy as np
import scipy as sp
import cPickle as pickle
import time
from __future__ import print_function
from collections import defaultdict

In [3]:
# load raw data
# NOTE(review): unpickling can execute arbitrary code -- only load an
# all_data.pickle that comes from a trusted source.
start_time = time.time()
# The context manager closes the file handle even if unpickling fails.
with open("all_data.pickle", "rb") as data_file:
    all_data = pickle.load(data_file)
# elapsed wall-clock seconds for the load
print(time.time() - start_time)

19.0522539616


In [4]:
# get train and validation set
num_all = len(all_data)
# Single split point shared by both slices so they can never drift apart:
# the first TRAIN_SIZE records are used for training, the rest for validation.
# (Use a smaller value, e.g. 100000, for a quick experiment.)
TRAIN_SIZE = 900000
train_data = all_data[:TRAIN_SIZE]
valid_data = all_data[TRAIN_SIZE:]

In [5]:
# pre-process 0: build id <-> index infrastructure

# Distinct item / reviewer ids in sorted order; the position of an id in
# its sorted list is the integer index used everywhere below.
item_ids = sorted({record['itemID'] for record in all_data})
user_ids = sorted({record['reviewerID'] for record in all_data})

# user and item numbers
num_items = len(item_ids)
num_users = len(user_ids)

# id -> index and index -> id lookups for items
item_id_map_index = {item_id: index for index, item_id in enumerate(item_ids)}
item_index_map_id = dict(enumerate(item_ids))

# id -> index and index -> id lookups for users
user_id_map_index = {user_id: index for index, user_id in enumerate(user_ids)}
user_index_map_id = dict(enumerate(user_ids))

In [6]:
# pre-process 1: build train_rating_array, valid_rating_array, all_rating_array

def build_rating_array(records):
    """Turn review records into an integer array of [user_index, item_index, rating] rows.

    Each record is looked up by its 'reviewerID', 'itemID' and 'rating' keys;
    the ids are translated through user_id_map_index / item_id_map_index.
    The result is cast to int, so a fractional rating would be truncated.

    Returns an array of shape (len(records), 3); an empty input yields
    shape (0, 3) so column slicing such as [:, 2] still works.
    """
    rows = [[user_id_map_index[record['reviewerID']],
             item_id_map_index[record['itemID']],
             record['rating']]
            for record in records]
    # reshape(-1, 3) only matters for the empty case, where np.array([])
    # would otherwise be one-dimensional.
    return np.array(rows).astype(int).reshape(-1, 3)

train_rating_array = build_rating_array(train_data)
valid_rating_array = build_rating_array(valid_data)
all_rating_array = build_rating_array(all_data)

In [7]:
# pre-process 2: utility and update functions
def get_valid_mse(lam, alpha, beta_us, beta_is, rating_array, valid_rating_array):
    """Mean squared error of the bias model on the validation rows.

    Each row of valid_rating_array is [user_index, item_index, rating] and is
    predicted as alpha + beta_us[user_index] + beta_is[item_index].
    `lam` and `rating_array` are accepted but not used by this function.
    """
    user_idx = valid_rating_array[:, 0]
    item_idx = valid_rating_array[:, 1]
    targets = valid_rating_array[:, 2].astype(float)
    residuals = (alpha + beta_us[user_idx] + beta_is[item_idx]) - targets
    inverse_count = 1. / valid_rating_array.shape[0]
    return inverse_count * np.sum(residuals ** 2.0)

def get_cost(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Regularised training objective of the bias model.

    Sum of squared errors over the rows of rating_array
    ([user_index, item_index, rating]) plus lam times the sum of squared
    user and item biases. `Ruis` and `Rius` are accepted but not used.
    """
    targets = rating_array[:, 2].astype(float)
    fitted = alpha + beta_us[rating_array[:, 0]] + beta_is[rating_array[:, 1]]
    squared_error = np.sum((fitted - targets) ** 2.)
    penalty = lam * (np.sum(beta_us ** 2.) + np.sum(beta_is ** 2.))
    return squared_error + penalty
    
def alpha_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Closed-form update of the global offset with the biases held fixed.

    Returns the mean over all rows of rating_array of
    (rating - beta_us[user_index] - beta_is[item_index]).
    `lam`, the incoming `alpha`, `Ruis` and `Rius` are not used.
    """
    n_ratings = rating_array.shape[0]
    rating_total = np.sum(rating_array[:, 2])
    # fancy indexing picks one bias per rating row
    user_bias_total = np.sum(beta_us[rating_array[:, 0]])
    item_bias_total = np.sum(beta_is[rating_array[:, 1]])
    residual_total = rating_total - user_bias_total - item_bias_total
    return residual_total / n_ratings

def beta_us_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Recompute every user bias with alpha and the item biases held fixed.

    For a user u whose rated items are I_u:

        beta_u = sum_{i in I_u} (R_ui - alpha - beta_i) / (lam + |I_u|)

    Parameters
    ----------
    lam : regularisation strength.
    alpha : global offset.
    beta_us : current user biases; only its shape is used (one output per entry).
    beta_is : item biases, indexed by item index.
    rating_array : unused; kept so all update functions share one signature.
    Ruis : mapping user_index -> {item_index: rating}.
    Rius : unused here.

    Returns
    -------
    A new float array shaped like beta_us. A user with no ratings gets 0.
    """
    new_beta_us = np.zeros_like(beta_us, dtype=float)
    # Size the loop from the argument rather than a notebook-level global.
    for user_index in range(len(new_beta_us)):
        # .get() avoids inserting empty entries when Ruis is a defaultdict
        user_ratings = Ruis.get(user_index, {})
        # [the set of items] reviewed by user u
        Iu = np.array(list(user_ratings.keys()), dtype=int)
        Iu_size = len(user_ratings)
        denominator = lam + Iu_size
        if denominator == 0:
            # no regularisation and no ratings: nothing constrains this
            # bias, so leave it at 0 instead of dividing by zero
            continue
        # sums
        sum_Rui = np.sum(list(user_ratings.values()))
        sum_alpha = Iu_size * alpha
        sum_beta_i = np.sum(beta_is[Iu])
        # write result
        new_beta_us[user_index] = float(sum_Rui - sum_alpha - sum_beta_i) / denominator
    return new_beta_us

def beta_is_update(lam, alpha, beta_us, beta_is, rating_array, Ruis, Rius):
    """Recompute every item bias with alpha and the user biases held fixed.

    For an item i whose reviewers are U_i:

        beta_i = sum_{u in U_i} (R_ui - alpha - beta_u) / (lam + |U_i|)

    Parameters
    ----------
    lam : regularisation strength.
    alpha : global offset.
    beta_us : user biases, indexed by user index.
    beta_is : current item biases; only its shape is used (one output per entry).
    rating_array : unused; kept so all update functions share one signature.
    Ruis : unused here.
    Rius : mapping item_index -> {user_index: rating}.

    Returns
    -------
    A new float array shaped like beta_is. An item with no ratings gets 0.
    """
    new_beta_is = np.zeros_like(beta_is, dtype=float)
    # Size the loop from the argument rather than a notebook-level global.
    for item_index in range(len(new_beta_is)):
        # .get() avoids inserting empty entries when Rius is a defaultdict
        item_ratings = Rius.get(item_index, {})
        # [the set of users] who reviewed item i
        Ui = np.array(list(item_ratings.keys()), dtype=int)
        Ui_size = len(item_ratings)
        denominator = lam + Ui_size
        if denominator == 0:
            # no regularisation and no ratings: nothing constrains this
            # bias, so leave it at 0 instead of dividing by zero
            continue
        # sums
        sum_Rui = np.sum(list(item_ratings.values()))
        sum_alpha = Ui_size * alpha
        sum_beta_u = np.sum(beta_us[Ui])
        # write result
        new_beta_is[item_index] = float(sum_Rui - sum_alpha - sum_beta_u) / denominator
    return new_beta_is

def train_and_eval(max_iter,
                   lam, alpha, beta_us, beta_is,
                   rating_array, valid_rating_array,
                   print_step = False):
    """Fit alpha, user biases and item biases by alternating closed-form updates.

    Runs max_iter rounds; each round updates alpha, then beta_us, then
    beta_is, every update seeing the values produced just before it.
    When print_step is true, each round prints
    (iteration, training cost, validation MSE).

    Returns the tuple (cost, valid_mse, alpha, beta_us, beta_is) evaluated
    with the final parameters.
    """
    # Nested lookups derived from the [user_index, item_index, rating] rows:
    #   Ruis[user_index] -> {item_index: rating}; keys = items reviewed by that user
    #   Rius[item_index] -> {user_index: rating}; keys = users who reviewed that item
    Ruis, Rius = defaultdict(dict), defaultdict(dict)
    for row in rating_array:
        user_index, item_index, rating = row[0], row[1], row[2]
        Ruis[user_index][item_index] = rating
        Rius[item_index][user_index] = rating

    # trailing arguments shared by every update / cost call
    train_args = (rating_array, Ruis, Rius)

    # train on this dataset
    for step in xrange(max_iter):
        alpha = alpha_update(lam, alpha, beta_us, beta_is, *train_args)
        beta_us = beta_us_update(lam, alpha, beta_us, beta_is, *train_args)
        beta_is = beta_is_update(lam, alpha, beta_us, beta_is, *train_args)
        if not print_step:
            continue
        step_cost = get_cost(lam, alpha, beta_us, beta_is, *train_args)
        step_mse = get_valid_mse(lam, alpha, beta_us, beta_is,
                                 rating_array, valid_rating_array)
        print(step, step_cost, step_mse)

    # final metrics with the fitted parameters
    final_cost = get_cost(lam, alpha, beta_us, beta_is, *train_args)
    final_mse = get_valid_mse(lam, alpha, beta_us, beta_is,
                              rating_array, valid_rating_array)

    return (final_cost, final_mse, alpha, beta_us, beta_is)

In [8]:
# 3.5 average predictor (using index based sorted list)
# Baseline: predict the mean training rating for every (user, item) pair.
alpha = np.mean(train_rating_array[:, 2])
print('alpha', alpha)

# Validation MSE of that constant prediction.
valid_ratings = valid_rating_array[:, 2]
valid_mse = np.sum((valid_ratings - alpha) ** 2.0) * (1. / valid_rating_array.shape[0])
print('valid_mse', valid_mse)

alpha 4.21898777778
valid_mse 0.969062751573


In [9]:
# # 3.6 fit baseline model: rating(u, i) = alpha + beta_u + beta_i

# # set training
# max_iter = 30

# # parameters
# lam = 1.0
# alpha = 0.0
# beta_us = np.random.normal(0, 0.5, (num_users,))
# beta_is = np.random.normal(0, 0.5, (num_items,))

# cost, valid_mse, alpha, beta_us, beta_is = train_and_eval(max_iter, 
#                                                           lam, alpha, beta_us, beta_is, 
#                                                           train_rating_array, valid_rating_array,
#                                                           print_step=True)
# print(cost, valid_mse)

In [10]:
# # 3.7 report the user and item id that have the largest and smallest values of beta
# print('user, largest', user_index_map_id[np.argmax(beta_us)])
# print('user, smallest', user_index_map_id[np.argmin(beta_us)])

# print('item, largest', item_index_map_id[np.argmax(beta_is)])
# print('item, smallest', item_index_map_id[np.argmin(beta_is)])

In [11]:
# # 3.8 search for the best lam
# lams = [0.001, 0.01, 0.1, 1.0, 10, 100]
# max_iter = 30

# # init variables
# alpha = 0.0
# beta_us = np.random.normal(0, 0.5, (num_users,))
# beta_is = np.random.normal(0, 0.5, (num_items,))

# results = []
# for lam in lams:
#     cost, mse, _, _, _ = train_and_eval(max_iter, 
#                                         lam, alpha, beta_us, beta_is, 
#                                         train_rating_array, valid_rating_array,
#                                         print_step=True)
#     print(lam, cost, mse)
#     results.append([lam, cost, mse])

In [12]:
# now train on all data
max_iter = 30

# init variables
lam = 1.0
alpha = 0.0
# Seeded private generator: the random initialisation (and therefore the
# whole run) is reproducible, and NumPy's global random state is untouched.
init_rng = np.random.RandomState(0)
beta_us = init_rng.normal(0, 0.5, (num_users,))
beta_is = init_rng.normal(0, 0.5, (num_items,))

# NOTE: all_rating_array is built from all_data, which contains the
# validation rows, so the mse reported here is in-sample, not held-out.
cost, mse, alpha, beta_us, beta_is = train_and_eval(max_iter,
                                                    lam, alpha, beta_us, beta_is,
                                                    all_rating_array, valid_rating_array,
                                                    print_step=True)
print(cost, mse)

0 622849.73205 0.612895306398
1 606249.192095 0.596753368409
2 605568.764076 0.595845565023
3 605443.983258 0.595622084013
4 605407.1365 0.595532575653
5 605393.102107 0.595485618904
6 605386.472208 0.595456392733
7 605382.672363 0.595436155627
8 605380.1385 0.595421215662
9 605378.265278 0.595409760391
10 605376.786488 0.595400775068
11 605375.569054 0.595393626291
12 605374.538378 0.595387884822
13 605373.648657 0.595383242554
14 605372.869788 0.595379469655
15 605372.180956 0.595376390259
16 605371.567168 0.595373867498
17 605371.01724 0.595371793607
18 605370.522558 0.595370083033
19 605370.076293 0.595368667445
20 605369.672886 0.595367492012
21 605369.307702 0.595366512574
22 605368.976801 0.595365693458
23 605368.676777 0.595365005773
24 605368.404646 0.59536442607
25 605368.157769 0.595363935279
26 605367.933788 0.595363517863
27 605367.73059 0.595363161145
28 605367.546267 0.595362854761
29 605367.379097 0.595362590231
605367.379097 0.595362590231


In [13]:
# Persist the fitted (alpha, beta_us, beta_is); the context manager flushes
# and closes the file even if pickling raises.
with open("alpha_beta_init.feature", "wb") as feature_file:
    pickle.dump((alpha, beta_us, beta_is), feature_file)

In [14]:
# get header_str and user_item_ids to predict
with open('pairs_Rating.txt') as pairs_file:
    # read and strip lines
    lines = [line.strip() for line in pairs_file]
# strip out the header
header_str = lines.pop(0)
# get a list of [user_id, item_id] pairs; blank lines (e.g. a trailing
# newline at end of file) are skipped so the unpacking below cannot fail on them
user_item_ids = [line.split('-') for line in lines if line]

# write to output file; `with` closes it even if a row raises
with open('predictions_Rating.txt', 'w') as predictions_file:
    print(header_str, file=predictions_file)
    for user_id, item_id in user_item_ids:
        # An id that is missing from the index maps has no learned bias;
        # fall back to 0 so the prediction degrades towards alpha instead
        # of raising KeyError.
        if user_id in user_id_map_index:
            user_bias = beta_us[user_id_map_index[user_id]]
        else:
            user_bias = 0.0
        if item_id in item_id_map_index:
            item_bias = beta_is[item_id_map_index[item_id]]
        else:
            item_bias = 0.0
        rating = alpha + user_bias + item_bias
        # clamp the prediction to [0.0, 5.0]
        rating = max(0.0, min(5.0, rating))
        print('%s-%s,%s' % (user_id, item_id, rating), file=predictions_file)