In [4]:
# %load item-item.py
import pandas as pd
import numpy as np
from scipy import spatial

number_of_data_sets = 1
data_sets_dir = 'data/sets/'
k = 5

################################################################################
# self defined helper functions
################################################################################
# compute centered cosine similarity
# between two pandas.Series
def calc_sim(s1, s2):
    """Return the centered cosine similarity of two pandas.Series.

    Each series has its own mean subtracted, and entries that are still
    missing afterwards are replaced by 0. When the dot product of the two
    centered series is exactly 0 the integer 0 is returned and no cosine is
    computed; otherwise the result is ``1 - scipy.spatial.distance.cosine``
    of the centered series.
    """
    centered_a, centered_b = [(s - s.mean()).fillna(0) for s in (s1, s2)]

    # A zero dot product also covers the case where either centered vector
    # is all zeros, for which the cosine distance would divide by zero.
    if centered_a.dot(centered_b) == 0:
        return 0
    return 1 - spatial.distance.cosine(centered_a, centered_b)

# get one user's ratings for the products listed in knn
def get_user_ratings(user_df, knn):
    """Return one user's review scores for the products listed in ``knn``.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Review rows; the ``product_productid`` and ``review_score`` columns
        are read. The caller is expected to pass rows of a single user.
    knn : pandas.Series
        Series whose index holds the product ids to look up.

    Returns
    -------
    pandas.Series
        One score per product id, in the order of ``knn.index``.
    """
    # Average repeated reviews of the same product so that every product id
    # maps to exactly one score. Without this, a label lookup on a
    # duplicated index returns several rows per id and the result no longer
    # lines up one-to-one with knn.
    scores = user_df.groupby('product_productid')['review_score'].mean()
    return scores.loc[knn.index]
################################################################################
# end of functions
################################################################################

# index of the data set to process (only this one set is handled here)
i = 0

test_file = data_sets_dir + str(i) + '/test.csv'
train_file = data_sets_dir + str(i) + '/train.csv'
# load test and train data
test_df = pd.read_csv(test_file)
train_df = pd.read_csv(train_file)
test_productid_array = test_df.product_productid.unique()
# squared prediction errors are appended here by later cells
square_errors_array = []
# only the first product of the test set is evaluated
target_productid = test_productid_array[0]
# check whether this product id exists in the training data
if target_productid not in train_df.product_productid.unique():
    # single-argument print() gives the same output on Python 2 and 3
    print('Cannot find product in data with id: ' + target_productid)
    exit()
# get the products that share at least one reviewer with the target
# product, instead of looping through all other products,
# to improve performance
common_reviewers = train_df[train_df.product_productid == target_productid].review_userid
productid_array = train_df[train_df.review_userid.isin(common_reviewers)].product_productid.unique()
# %-formatting keeps the printed line identical on Python 2 and 3
print('Number of compared products %d' % len(productid_array))

Number of compared products 126


In [6]:
# Similarity between the target product and every product that shares at
# least one reviewer with it, collected in productid_array order.
similarities = []
for productid in productid_array:
    df = train_df[train_df.product_productid.isin([target_productid, productid])]
    # convert to pivot table to simplify calculation:
    # one row per reviewer, one column per product (scores are averaged
    # by pivot_table when a reviewer rated a product more than once)
    table = pd.pivot_table(
        df,
        values='review_score',
        index=['review_userid'],
        columns=['product_productid']
    )

    # calculate the similarity and store the result
    similarities.append(calc_sim(table[target_productid], table[productid]))

# Build the Series once instead of growing it with Series.set_value, which
# was deprecated in pandas 0.21 and removed in pandas 1.0.
result_sim = pd.Series(similarities, index=productid_array, dtype=float)

# drop the target product
result_sim = result_sim.drop(target_productid)

In [20]:
# picks the second distinct reviewer of the target product in the test set
userid = test_df[test_df.product_productid == target_productid].review_userid.unique()[1]
# find knn for this user
user_df = train_df[train_df.review_userid == userid]
rated_items = user_df.product_productid.unique()
# reindex gives NaN for rated products that have no similarity entry
# (including the target product, which was dropped from result_sim)
candidate_sim = result_sim.reindex(rated_items)
# drop those NaN entries so they cannot become neighbours and turn the
# weighted sum below into NaN
knn = candidate_sim.dropna().sort_values(ascending=False)[:k]

# predict the rating
if len(knn) > 0 and knn.sum() != 0:
    sim_weights = knn / knn.sum()
    user_ratings = get_user_ratings(user_df, knn)
    predict_rating = sim_weights.dot(user_ratings)
elif len(user_df) > 0:
    # no usable neighbours: fall back to the user's mean training score
    predict_rating = user_df.review_score.mean()
else:
    # user has no training reviews at all: fall back to the global mean
    predict_rating = train_df.review_score.mean()

# compute error
actual_rating = test_df[(test_df.product_productid == target_productid) & (test_df.review_userid == userid)].review_score.values[0]
square_errors_array.append(np.square(predict_rating - actual_rating))

# display the top-k candidates, showing missing similarities as 0
candidate_sim.sort_values(ascending=False)[:k].fillna(0)

B00006HB30    0.972518
B00074DXEE    0.038429
B000FUTUYC    0.015663
B0007OP2CU    0.015045
B000M7FSSA    0.000000
dtype: float64