In [2]:
import pandas as pd
import numpy as np
from scipy import spatial

# --- configuration ---------------------------------------------------------
number_of_data_sets = 10          # not referenced by the cells visible here
data_sets_dir = '../data/sets/'   # each numbered sub-directory holds one train/test split
k = 5                             # how many top-ranked similar users are kept later on

# --- load one split --------------------------------------------------------
i = 0                             # number of the split to load
test_file = ''.join([data_sets_dir, str(i), '/test.csv'])
train_file = ''.join([data_sets_dir, str(i), '/train.csv'])
test_df = pd.read_csv(test_file)
train_df = pd.read_csv(train_file)

In [2]:
# User for whom neighbours (and later a rating prediction) are computed.
target_userid = 'ALX2RTJW3NF0O'

# Products the target user reviewed in the training split.
common_products = train_df.loc[
    train_df.review_userid == target_userid, 'product_productid'
]

# Every user who reviewed at least one of those products
# (this includes the target user itself).
userid_array = train_df.loc[
    train_df.product_productid.isin(common_products), 'review_userid'
].unique()
len(userid_array)

4609

In [3]:
def calc_sim(s1, s2):
    """Return the cosine similarity of two mean-centred rating Series.

    Each Series has its own mean subtracted and its missing entries replaced
    by 0 before the comparison.

    Parameters
    ----------
    s1, s2 : pandas.Series
        Rating vectors; ``Series.dot`` requires them to share the same index.

    Returns
    -------
    number
        ``1 - cosine distance`` of the centred vectors, or ``0`` when their
        dot product is exactly zero (this also avoids dividing by a zero norm
        when one vector is constant).
    """
    centred_1, centred_2 = [(s - s.mean()).fillna(0) for s in (s1, s2)]

    if centred_1.dot(centred_2) != 0:
        return 1 - spatial.distance.cosine(centred_1, centred_2)
    return 0

In [4]:
# Similarity between the target user and every candidate user.
#
# Only reviews written by candidate users can end up in a pivot table below,
# so restrict the frame once instead of scanning all of train_df per user.
candidate_reviews = train_df[train_df.review_userid.isin(userid_array)]

# Collect the values in a list and build the Series once at the end;
# Series.set_value is no longer available in current pandas and growing a
# Series element by element is slow.
similarities = []
for index, userid in enumerate(userid_array):
    df = candidate_reviews[candidate_reviews.review_userid.isin([target_userid, userid])]
    # convert to pivot table to simplify calculation:
    # rows = products, columns = the two users, cells = (mean) review score
    table = pd.pivot_table(
        df,
        values='review_score',
        index=['product_productid'],
        columns=['review_userid']
    )

    # calculate the similarity and store the result
    similarity = calc_sim(table[target_userid], table[userid])
    similarities.append(similarity)
    if index > 0 and index % 500 == 0:
        # parenthesised form prints the same text on Python 2 and Python 3
        print('Number of users processed: %d' % index)

# userid_array comes from .unique(), so it is a valid unique index.
result_sim = pd.Series(similarities, index=userid_array, dtype=float)
# The target user is trivially similar to itself; remove it. errors='ignore'
# keeps this from raising when the candidate set is empty.
result_sim = result_sim.drop(target_userid, errors='ignore')

Number of users processed: 500
Number of users processed: 1000
Number of users processed: 1500
Number of users processed: 2000
Number of users processed: 2500
Number of users processed: 3000
Number of users processed: 3500
Number of users processed: 4000
Number of users processed: 4500


In [5]:
# Products the target user reviewed in the test split (candidates for prediction).
test_df.loc[test_df.review_userid == target_userid, 'product_productid'].unique()

array(['B000063W1R', 'B000JGG6T4', 'B00004XPPG', 'B00061S0QE',
       '7883704591', 'B00004RRGB', 'B0007A2GTG', 'B00005R23Y'], dtype=object)

In [6]:
# Product whose rating is to be predicted for the target user.
productid = 'B00004XPPG'

# Users who rated that product in the training split.
rated_users = train_df[train_df.product_productid == productid].review_userid.unique()

# Look the raters up with reindex rather than .get: a rater with no computed
# similarity then becomes NaN instead of making the whole lookup fail
# (current pandas returns None from .get when a label is missing).
# sort_values places NaN last, so known similarities are ranked first; any
# NaN that still lands in the top k is then treated as similarity 0.
knn = (
    result_sim
    .reindex(rated_users)
    .sort_values(ascending=False)
    .iloc[:k]
    .fillna(0)
)
knn

A1JUUZAYZCFAI3    0.217597
A2PL4GB1S2Q5EK    0.106751
A258MSMKY2710V    0.099433
A1D2ZN57YG2NMH    0.053544
A1FNES0QEBJZD1    0.051649
dtype: float64

In [28]:
# Scale the neighbour similarities so that they sum to 1.
sim_weights = knn.div(knn.sum())
sim_weights

A1JUUZAYZCFAI3    0.411357
A2PL4GB1S2Q5EK    0.201807
A258MSMKY2710V    0.187973
A1D2ZN57YG2NMH    0.101222
A1FNES0QEBJZD1    0.097641
dtype: float64

In [7]:
def get_user_ratings(df, knn):
    """Return the review scores in ``df`` given by the users in ``knn.index``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``review_userid`` and ``review_score`` columns.
    knn : pandas.Series
        Only its index (user ids) is used.

    Returns
    -------
    pandas.Series
        Scores indexed by user id, in the order of ``knn.index``. A user with
        several rows in ``df`` contributes one entry per row.
    """
    scores = df.review_score.values
    reviewers = df.review_userid
    ratings_by_user = pd.Series(scores, index=reviewers)
    return ratings_by_user.loc[knn.index]

In [8]:
# All training reviews of the chosen product.
product_df = train_df.loc[train_df.product_productid == productid]

# Scores the selected neighbours gave to that product.
user_ratings = get_user_ratings(df=product_df, knn=knn)
user_ratings

review_userid
A1JUUZAYZCFAI3    5
A2PL4GB1S2Q5EK    3
A2PL4GB1S2Q5EK    5
A258MSMKY2710V    4
A1D2ZN57YG2NMH    5
A1FNES0QEBJZD1    5
dtype: float64

In [21]:
# Show every review of this product written by one particular user.
product_df.loc[product_df.review_userid == 'A2PL4GB1S2Q5EK']

Unnamed: 0,product_productid,review_userid,review_profilename,review_helpfulness,review_score
1692271,B00004XPPG,A2PL4GB1S2Q5EK,Deborah G. Hall,2/2,3
1692294,B00004XPPG,A2PL4GB1S2Q5EK,Deborah G. Hall,7/13,5


In [30]:
# One score per neighbour: average a user's reviews of this product so the
# result lines up one-to-one with the similarity weights.
# review_score is selected explicitly (as a one-column DataFrame) because
# current pandas raises when asked to average the frame's string columns
# instead of silently dropping them.
user_ratings = (
    product_df
    .groupby('review_userid')[['review_score']]
    .mean()
    .loc[knn.index]
)
user_ratings

Unnamed: 0,review_score
A1JUUZAYZCFAI3,5
A2PL4GB1S2Q5EK,4
A258MSMKY2710V,4
A1D2ZN57YG2NMH,5
A1FNES0QEBJZD1,5


In [31]:
# Dot product of the similarity weights with the neighbours' ratings,
# i.e. the sum of weight * rating over the users in the shared index.
sim_weights.dot(user_ratings)

review_score    4.61022
dtype: float64