In [1]:
import os
import time
import gc
import argparse
import pickle
import numpy as np

# data science imports
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# metrics imports
from metrics import *

In [2]:
def read_pickle(path):
    """
    Load and return the object pickled in the file at `path`.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def write_pickle(path, data):
    """
    Pickle `data` into the file at `path`, overwriting any existing file.
    """
    with open(path, 'wb') as sink:
        pickle.dump(data, sink)

In [3]:
# Resolve the dataset directory relative to the current user's home directory.
# expanduser('~') is used instead of os.getenv('HOME') because getenv returns
# None when HOME is unset, which would make os.path.join raise a TypeError.
homedir = os.path.expanduser('~')
datapath = os.path.realpath(os.path.join(homedir, 'datasets/yelp_dataset/rates'))
print(datapath)

/home1/lyt/datasets/yelp_dataset/rates


In [4]:
# Load the training ratings, the two id lookup tables and the test set
# (positives plus sampled negatives) from `datapath`.
# An earlier file layout used 'train_data.pickle', 'users-complete.pickle',
# 'businesses-complete.pickle' and 'test_with_neg_sample.pickle' instead.
train_data, users, items, test_data = [
    read_pickle(os.path.join(datapath, filename))
    for filename in ('rate_train', 'num_to_userid', 'num_to_businessid', 'test_with_neg')
]

In [5]:
# Quick look at the loaded data: one training record, the number of test
# entries, one test entry, and the sizes of the user / item lookup tables.
print(
    train_data[0],
    len(test_data),
    test_data[1],
    len(users),
    len(items),
    sep='\n',
)

{'business_id': 6024, 'rate': 1.0, 'user_id': 11133, 'timestamp': 1098125200.0}
6492
{'user_id': 13008, 'pos_business_id': [2217, 1498, 4272, 10321, 1220, 3508, 9361, 2318, 2829, 3073, 11482, 1068, 4811, 10076, 6116, 9515, 10201, 10150, 12376], 'neg_business_id': [11737, 12489, 13866, 7111, 9142, 4656, 11961, 11390, 7289, 6500, 9508, 11147, 8812, 9306, 12889, 11349, 5256, 4459, 3689, 2525, 3959, 7128, 4355, 5813, 5505, 11852, 6591, 3089, 11537, 4905, 6536, 4840, 13801, 3846, 1559, 8957, 13588, 10976, 65, 7695, 13721, 10277, 3236, 7161, 10122, 13057, 5948, 5962, 9833, 5065]}
13262
13902


In [6]:
# Distinct user ids appearing in the test set; the count equals
# len(test_data) when every test entry belongs to a different user.
test_users = {entry['user_id'] for entry in test_data}
print(len(test_users))

6492


In [7]:
def make_matrix(n_user, n_item, reviews):
    """
    Build a dense binary item-by-user interaction matrix.

    Rows correspond to items and columns to users. The cell
    [business_id, user_id] is set to 1 for every review in `reviews`;
    all other cells stay 0.

    Returns
    -------
    (user_ids, item_ids, train_mat)
        user_ids  : list of 0 .. n_user - 1
        item_ids  : list of 0 .. n_item - 1
        train_mat : float ndarray of shape (n_item, n_user)
    """
    train_mat = np.zeros((n_item, n_user))
    for review in reviews:
        # Repeated (item, user) pairs simply overwrite the same cell with 1.
        train_mat[review['business_id'], review['user_id']] = 1
    return list(range(n_user)), list(range(n_item)), train_mat

In [8]:
# Build the (n_items, n_users) interaction matrix from the training reviews.
user_ids, item_ids, train_mat = make_matrix(
    n_user=len(users),
    n_item=len(items),
    reviews=train_data,
)
print(train_mat.shape)

(13902, 13262)


In [17]:
# Item-based k-NN with cosine distance.
# n_neighbors is passed by keyword because recent scikit-learn releases make
# the NearestNeighbors constructor arguments keyword-only. The former `p=1`
# is dropped: `p` only parameterises the Minkowski metric and has no effect
# on cosine distance.
model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric='cosine')
# fit() expects (n_samples, n_features); the samples are items, so train_mat
# must be shaped (n_items, n_users).
model.fit(train_mat)
t0 = time.time()
# The query set is the training set itself, so one extra neighbour (11) is
# requested; the first column is discarded later as the item itself.
distance, indices = model.kneighbors(train_mat, n_neighbors=11)
t1 = time.time()
print("time cost:", t1 - t0)
print(len(distance))
print(len(indices))

time cost: 32.846928119659424
13902
13902


In [18]:
# Drop column 0 of the neighbour search results (the slot expected to hold the
# item itself) and pair every remaining distance with its neighbour index, so
# that predictions[i, j] == (distance, neighbour item index).
a = distance[:, 1:, np.newaxis]
b = indices[:, 1:, np.newaxis]
predictions = np.concatenate([a, b], axis=-1)
print(predictions.shape)

(13902, 10, 2)


In [9]:
def evaluation(test_data, train_mat, predictions, top_k=10):
    """
    Score item-based k-NN recommendations against held-out positives.

    For every test user, the neighbour lists of all items the user interacted
    with in training are pooled, sorted by ascending distance, de-duplicated
    and truncated to `top_k` items. That list is scored against the user's
    ground-truth items with getP / getHitRatio / getR / getNDCG.

    Parameters
    ----------
    test_data : iterable of dict
        Each entry provides 'user_id' and 'pos_business_id' (the list of
        held-out ground-truth item ids).
    train_mat : ndarray indexed as train_mat[item, user]
        Non-zero cells mark training interactions.
    predictions : ndarray reshapeable to (-1, 2)
        predictions[item] holds (distance, neighbour item id) pairs for
        that item.
    top_k : int, default 10
        Maximum number of recommended items per user.

    Returns
    -------
    (mean_prec, mean_hr, mean_recall, mean_ndcg)
        Per-user metrics averaged over `test_data` (NaN when it is empty).

    Notes
    -----
    A user whose pooled neighbour lists contain fewer than `top_k` distinct
    items is scored on the shorter list instead of raising IndexError. A user
    with no candidates at all contributes 0 to every metric.
    """
    precs = []
    hrs = []
    recalls = []
    ndcgs = []

    for entry in test_data:
        user = entry['user_id']
        gt_items = entry['pos_business_id']
        # Tuple holding one array: the row indices (items) this user touched.
        interactions = np.nonzero(train_mat[:, user])

        # Leakage check: a ground-truth item should never also appear in the
        # user's training interactions. Report it, but keep evaluating.
        # An explicit test is used because `assert` is removed under -O.
        train_items = set(interactions[0].tolist())
        for gt_item in gt_items:
            if gt_item in train_items:
                print("user id:", user)
                print(gt_item)

        # -----------------------
        # step 1: select preds
        # -----------------------
        # Pool the neighbour lists of every interacted item into (n, 2) rows.
        unsorted = predictions[interactions].reshape(-1, 2)

        # -----------------------
        # step 2: sort preds
        # -----------------------
        sorted_preds = unsorted[np.argsort(unsorted[:, 0])]

        # ---------------------------------------------
        # step 3: select top_k, keeping items unique
        # ---------------------------------------------
        # Iterating over the rows (rather than indexing until the list is
        # full) stops cleanly when the candidates run out.
        pred_items = []
        for raw_item in sorted_preds[:, 1]:
            if len(pred_items) >= top_k:
                break
            candidate = int(raw_item)
            if candidate not in pred_items:
                pred_items.append(candidate)

        # -----------------------------
        # step 4: calculate metrics
        # -----------------------------
        if pred_items:
            prec = getP(pred_items, gt_items)
            hr = getHitRatio(pred_items, gt_items)
            recall = getR(pred_items, gt_items)
            ndcg = getNDCG(pred_items, gt_items)
        else:
            # Nothing could be recommended, so nothing can be a hit.
            prec, hr, recall, ndcg = 0.0, 0, 0.0, 0.0

        precs.append(prec)
        hrs.append(hr)
        recalls.append(recall)
        ndcgs.append(ndcg)

    mean_prec = np.mean(precs)
    mean_hr = np.mean(hrs)
    mean_recall = np.mean(recalls)
    mean_ndcg = np.mean(ndcgs)

    return mean_prec, mean_hr, mean_recall, mean_ndcg
        

In [20]:
# Evaluate the cosine k-NN predictions and report all four metrics at cut-off 10.
prec, hr, recall, ndcg = evaluation(test_data, train_mat, predictions)
# '%.4f' (precision 4) is used for every metric; the former '%4f' only set a
# minimum field width, so recall and ndcg were printed with 6 decimals.
print("final: prec@10: %.4f, hr@10: %.4f, recall@10: %.4f, ndcg@10: %.4f" % (prec, hr, recall, ndcg))

final: prec@10: 0.0120, hr@10: 0.1198, recall@10: 0.016950, ndcg@10: 0.050127


# Test different metrics

In [None]:
# Distance metrics to compare for the item-based k-NN recommender.
# In the recorded run 'matching' produced exactly the same scores as 'hamming'.
metrics = ['cosine', 'hamming', 'jaccard', 'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'russellrao',
          'sokalmichener', 'sokalsneath']
for metric in metrics:
    # n_neighbors is passed by keyword because recent scikit-learn releases
    # make the constructor arguments keyword-only. The former `p=1` is dropped:
    # `p` only parameterises the Minkowski metric, which is not in the list.
    model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric=metric)
    # fit() expects (n_samples, n_features), i.e. (n_items, n_users).
    model.fit(train_mat)
    t0 = time.time()
    # One extra neighbour is requested; column 0 is discarded below as the
    # item itself.
    distance, indices = model.kneighbors(train_mat, n_neighbors=11)
    t1 = time.time()
    print("time cost:", t1 - t0)

    # predictions[i, j] == (distance, neighbour item index)
    a = np.expand_dims(distance[:,1:], -1)
    b = np.expand_dims(indices[:,1:], -1)
    predictions = np.concatenate((a, b), axis=2)

    prec, hr, recall, ndcg = evaluation(test_data, train_mat, predictions)
    # '%.4f' replaces the former '%4f', which set a field width, not a precision.
    print("%s final: prec@10: %.4f, hr@10: %.4f, recall@10: %.4f, ndcg@10: %.4f" % (metric, prec, hr, recall, ndcg))

time cost: 23.056436777114868
cosine final: prec@10: 0.0120, hr@10: 0.1198, recall@10: 0.016950, ndcg@10: 0.050127
time cost: 2518.6911492347717
hamming final: prec@10: 0.0003, hr@10: 0.0034, recall@10: 0.000369, ndcg@10: 0.002612




time cost: 3693.1809375286102
jaccard final: prec@10: 0.0121, hr@10: 0.1206, recall@10: 0.017226, ndcg@10: 0.050524




time cost: 3705.852528333664
matching final: prec@10: 0.0003, hr@10: 0.0034, recall@10: 0.000369, ndcg@10: 0.002612




time cost: 4836.7737782001495
dice final: prec@10: 0.0121, hr@10: 0.1206, recall@10: 0.017226, ndcg@10: 0.050524




In [10]:
from scipy.stats import pearsonr
def my_pearson(x, y):
    """
    Distance-like score based on Pearson correlation.

    Returns the negated correlation coefficient of `x` and `y`, so that more
    strongly correlated vectors get a smaller value. The p-value computed by
    pearsonr is discarded.
    """
    return -pearsonr(x, y)[0]

In [None]:
# Item-based k-NN using the custom negated-Pearson metric.
# n_neighbors is passed by keyword because recent scikit-learn releases make
# the NearestNeighbors constructor arguments keyword-only.
model = NearestNeighbors(n_neighbors=10, algorithm='brute', metric=my_pearson, n_jobs=16)
# fit() expects (n_samples, n_features), i.e. (n_items, n_users).
model.fit(train_mat)
t0 = time.time()
# One extra neighbour is requested; column 0 is discarded below as the item itself.
distance, indices = model.kneighbors(train_mat, n_neighbors=11)
t1 = time.time()
print("time cost:", t1 - t0)

# predictions[i, j] == (distance, neighbour item index)
a = np.expand_dims(distance[:,1:], -1)
b = np.expand_dims(indices[:,1:], -1)
predictions = np.concatenate((a, b), axis=2)

prec, hr, recall, ndcg = evaluation(test_data, train_mat, predictions)
# The label is written out explicitly: the former `metric` variable was a
# leftover from the loop in the previous cell, so it named the wrong metric
# (or raised NameError on a fresh kernel). '%.4f' replaces the former '%4f'.
print("%s final: prec@10: %.4f, hr@10: %.4f, recall@10: %.4f, ndcg@10: %.4f" % ('pearson', prec, hr, recall, ndcg))



# Function Experiments

In [11]:
# Two equivalent ways of appending a trailing length-1 axis to `indices`:
# an explicit reshape to (m, n, 1) and indexing with np.newaxis.
m, n = indices.shape
a = indices.reshape((m, n, 1))
print(a.shape)
b = indices[..., np.newaxis]
print(b.shape)

(13902, 11, 1)
(13902, 11, 1)


In [12]:
# Distances and neighbour indices stored for item 0.
print(predictions[0, :, 0], predictions[0, :, 1], sep='\n')
# Pool the neighbour lists of items 0 and 1 into (distance, index) rows and
# show them before and after sorting by distance.
data = predictions[:2].reshape(-1, 2)
print(data, end='\n\n')
print(data[np.argsort(data[:, 0])])

[36. 36. 36. 36. 36. 36. 36. 36. 36. 36.]
[234. 264. 245. 163. 263.  93. 393.  60. 160. 112.]
[[ 36. 234.]
 [ 36. 264.]
 [ 36. 245.]
 [ 36. 163.]
 [ 36. 263.]
 [ 36.  93.]
 [ 36. 393.]
 [ 36.  60.]
 [ 36. 160.]
 [ 36. 112.]
 [ 25. 393.]
 [ 25. 264.]
 [ 25. 263.]
 [ 25. 163.]
 [ 25. 245.]
 [ 25. 160.]
 [ 25. 234.]
 [ 25.  93.]
 [ 25. 112.]
 [ 25.  60.]]

[[ 25.  60.]
 [ 25.  93.]
 [ 25. 234.]
 [ 25. 160.]
 [ 25. 245.]
 [ 25. 163.]
 [ 25. 263.]
 [ 25. 264.]
 [ 25. 393.]
 [ 25. 112.]
 [ 36. 234.]
 [ 36.  60.]
 [ 36. 393.]
 [ 36.  93.]
 [ 36. 263.]
 [ 36. 163.]
 [ 36. 245.]
 [ 36. 264.]
 [ 36. 160.]
 [ 36. 112.]]


In [142]:
# Walk through the evaluation steps for one test user (entry 1302).
i = test_data[1302]
user = i['user_id']
gt_items = i['pos_business_id']
print(gt_items)
# Items this user interacted with in training (tuple holding one index array).
interactions = np.nonzero(train_mat[:,user])
# predictions[interactions] holds the neighbour lists of those items.
unsorted = predictions[interactions].reshape(-1, 2)
# print("unsorted:", unsorted)
sorted_preds = unsorted[np.argsort(unsorted[:, 0])]
# print("sorted:", sorted_preds)
pred_items = []
idx = 0
# Collect up to 10 distinct items. The bound on idx prevents an IndexError
# when the pooled neighbour lists contain fewer than 10 distinct items.
while len(pred_items) < 10 and idx < len(sorted_preds):
    item = int(sorted_preds[idx, 1])
    if item not in pred_items:
        pred_items.append(item)
    idx += 1
print("top10:", pred_items)
print(getP(pred_items, gt_items))
print(getHitRatio(pred_items, gt_items))
print(getR(pred_items, gt_items))
print(getNDCG(pred_items, gt_items))

[6966, 4858, 601, 11153, 4798, 3553, 3142, 7412, 3659, 7741, 4661, 11908]
top10: [2279, 5900, 5124, 5567, 3382, 1918, 3735, 1245, 5914, 1964]
0.0
0
0.0
0


In [159]:
# Inspect the interaction column of user 0.
# The slice is bound to `col`, the name every later line reads; it was
# previously assigned to `row`, so the cell only ran when `col` happened to
# exist already in the kernel.
col = train_mat[:,0]
print(col)
inter = np.nonzero(col)
print(type(inter[0]))
# `inter` is a tuple holding one index array, so this loop runs once and
# prints the values at all non-zero positions.
for i in inter:
    print(col[i])

[0. 0. 0. ... 0. 0. 0.]
<class 'numpy.ndarray'>
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
