## Load data

In [28]:
import pandas as pd
import pickle

data_name = 'ml-1m'
data_type = 'train'

with open(f'./datasets/{data_name}/{data_name}-dataset_{data_type}.pth', 'rb') as f:
    data = pickle.load(f)
data

[1;35mml-1m[0m
[1;34mThe number of users[0m: 6041
[1;34mAverage actions of users[0m: 116.68427152317881
[1;34mThe number of items[0m: 3417
[1;34mAverage actions of items[0m: 206.3756954612006
[1;34mThe number of inters[0m: 704773
[1;34mThe sparsity of the dataset[0m: 96.58574901571288%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'timestamp', 'movie_title', 'release_year', 'genre']

## Load scores tensor (pretrained)

In [29]:
import pandas as pd
model_name = 'SimpleX'
scores_tensor_df = pd.read_csv(f'./pretrained_scores/{data_name}/{model_name}_{data_name}_scores_tensor.csv', header=None)

## ItemRec

In [30]:
import torch
from collections import defaultdict
import pandas as pd
import numpy as np

limit_num = 50
topk = 10
item_limit_num = 5
# scores_tensor_df = scores_tensor_df.iloc[:,:]
display(scores_tensor_df)

scores_tensor = torch.tensor(scores_tensor_df.to_numpy())
scores_sorted, idx_sorted = torch.sort(scores_tensor, dim=-1, descending=True)

scores_tensor = scores_sorted[:,:limit_num].numpy()
scores_tensor_user = idx_sorted[:,:limit_num].numpy()

print(scores_tensor_user.shape)

scores_tensor_item = defaultdict(list)
scores_tensor_item_scores = defaultdict(list)
scores_tensor_item_rank = defaultdict(list)


for uid in range(scores_tensor.shape[0]):
    for idx in range(limit_num):
        scores_tensor_item[scores_tensor_user[uid,idx]].append(uid)
        scores_tensor_item_scores[scores_tensor_user[uid,idx]].append(scores_tensor[uid,idx])

scores_tensor_item_max = defaultdict(int)
scores_tensor_item_min = defaultdict(int)

for item_id in scores_tensor_item.keys():
    scores_tensor_item_max[item_id] = max(scores_tensor_item_scores[item_id])
    scores_tensor_item_min[item_id] = min(scores_tensor_item_scores[item_id])

for uid in range(scores_tensor.shape[0]):
    for idx in range(limit_num):
        item_id = scores_tensor_user[uid,idx]
        rank_ = idx-((scores_tensor[uid,idx] - scores_tensor_item_min[item_id]) / (scores_tensor_item_max[item_id] - scores_tensor_item_min[item_id]+1e-8))
        scores_tensor_item_rank[item_id].append(rank_)


for item_id in scores_tensor_item.keys():
    scores_tensor_item[item_id] = np.array(scores_tensor_item[item_id])
    scores_tensor_item_scores[item_id] = np.array(scores_tensor_item_scores[item_id])
    scores_tensor_item_rank[item_id] = np.array(scores_tensor_item_rank[item_id])
    scores_tensor_item[item_id] = scores_tensor_item[item_id][np.argsort(scores_tensor_item_rank[item_id])]
    scores_tensor_item_scores[item_id] = scores_tensor_item_scores[item_id][np.argsort(scores_tensor_item_rank[item_id])]


# user
topk_idx = scores_tensor_user[:,:0].tolist()
topk_idx2 = scores_tensor_user[:,:topk]



user_item_rank = defaultdict(dict)
user_item_scores = defaultdict(dict)

for item_id in scores_tensor_item.keys():
    uid_list = scores_tensor_item[item_id]
    uid_len = len(uid_list)
    for idx,uid in enumerate(uid_list):
        user_item_rank[uid][item_id] = idx + 1/uid_len
        user_item_scores[uid][item_id] = scores_tensor_item_scores[item_id][idx]

topk_idx_scores = scores_tensor_user[:,:0].tolist()
for uid in user_item_rank.keys():
    user_item_rank[uid] = dict(sorted(user_item_rank[uid].items(),key=lambda x:x[1]))
    item_list = list(user_item_rank[uid].keys())[:10]
    item_list = np.array(item_list)

    for item_id in list(user_item_rank[uid].keys())[:10]:
        if user_item_rank[uid][item_id] < item_limit_num:
            topk_idx[uid].append(item_id)
            topk_idx_scores[uid].append(user_item_scores[uid][item_id])

# fill left by user
for uid in range(scores_tensor_user.shape[0]):
    left = topk - len(topk_idx[uid])
    idx = 0
    while left > 0:
        if scores_tensor_user[uid,idx].item() not in topk_idx[uid]:
            topk_idx[uid].append(scores_tensor_user[uid,idx].item())
            topk_idx_scores[uid].append(user_item_scores[uid][scores_tensor_user[uid,idx]])
            left -= 1
        idx += 1



topk_idx = torch.tensor(topk_idx).cuda()
topk_idx_scores = torch.tensor(topk_idx_scores).cuda()
topk_idx = topk_idx.gather(-1, torch.argsort(-1*topk_idx_scores))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3407,3408,3409,3410,3411,3412,3413,3414,3415,3416
0,-100000000.0,-1.000000e+08,-1.000000e+08,-1.000000e+08,-1.000000e+08,6.756673e-01,-1.000000e+08,-1.000000e+08,-1.000000e+08,-1.000000e+08,...,0.252934,-0.031541,0.263252,0.304989,0.156592,0.201586,0.360647,0.363089,0.154377,0.338879
1,-100000000.0,-1.000000e+08,3.782592e-01,5.524825e-01,4.967356e-01,5.092890e-01,8.254577e-01,5.754841e-01,7.531686e-01,4.517520e-01,...,0.308424,0.168910,0.325407,0.242591,0.319118,0.406534,0.392990,0.431927,0.169362,0.386330
2,-100000000.0,6.944596e-01,4.735289e-01,4.524828e-01,4.603328e-01,6.280461e-01,-1.000000e+08,6.119211e-01,7.494447e-01,5.091426e-01,...,0.298325,0.283602,0.280164,0.289092,0.262707,0.317803,0.247504,0.246818,0.205698,0.392545
3,-100000000.0,7.799497e-01,4.562679e-01,4.928902e-01,4.834377e-01,5.594931e-01,8.871518e-01,7.042866e-01,7.028444e-01,5.205972e-01,...,0.220636,0.233892,0.232034,0.258890,0.302472,0.334779,0.403818,0.273032,0.143234,0.467245
4,-100000000.0,6.549782e-01,5.042842e-01,4.055977e-01,-1.000000e+08,-1.000000e+08,6.936802e-01,2.877638e-01,5.722759e-01,4.374422e-01,...,0.300434,0.299848,0.129788,0.346744,0.410617,0.334098,0.383545,0.423788,0.277952,0.317120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,-100000000.0,-1.000000e+08,5.421054e-01,-1.000000e+08,-1.000000e+08,-1.000000e+08,-1.000000e+08,4.635233e-01,6.806068e-01,-1.000000e+08,...,0.345968,0.298617,0.155096,0.269554,0.321760,0.321274,0.457663,0.454319,0.197495,0.468276
6036,-100000000.0,-1.000000e+08,3.980531e-01,5.719991e-01,4.973604e-01,5.023095e-01,7.841001e-01,5.633476e-01,-1.000000e+08,4.977273e-01,...,0.345260,0.186628,0.313266,0.202743,0.389201,0.378273,0.426764,0.442897,0.255339,0.433298
6037,-100000000.0,8.531635e-01,4.261494e-01,5.614003e-01,4.270906e-01,5.921570e-01,8.563724e-01,5.570086e-01,8.949473e-01,5.769461e-01,...,0.284753,0.092900,0.243209,0.178791,0.185866,0.215311,0.373633,0.337260,0.139494,0.295327
6038,-100000000.0,6.980346e-01,-1.000000e+08,-1.000000e+08,4.547546e-01,6.045275e-01,-1.000000e+08,5.605616e-01,-1.000000e+08,7.403403e-01,...,0.364727,0.092682,0.330342,0.206071,0.223225,0.215451,0.347115,0.373649,0.099718,0.376735


(6040, 50)


In [31]:
import torch
import pickle
from collections import defaultdict

path = f'./datasets/{data_name}/{data_name}-dataset_test.pth'
with open(path, "rb") as f:
    data = pickle.load(f)
item_df = data.item_feat
inter_df = data.inter_feat
display(inter_df)
inter_matrix = data.inter_matrix()
d = inter_matrix.todok()
test_user_list = defaultdict(list)
item_set = set()
train_df = []
item_info = []
for k,v in d.items():
    test_user_list[k[0]-1].append(k[1])
    item_set.add(k[1])
    train_df.append([k[0]-1, k[1], v])

The batch_size of interaction: 197526
    user_id, torch.Size([197526]), cpu, torch.int64
    item_id, torch.Size([197526]), cpu, torch.int64
    timestamp, torch.Size([197526]), cpu, torch.float32


## Valid Coverage

In [32]:
valid_coverage = set()
for uid in range(topk_idx.shape[0]):
    valid_coverage = valid_coverage | set(topk_idx[uid].tolist()) & set(test_user_list[uid])
print(len(valid_coverage))

item_num = item_df.length

valid_coverage = len(valid_coverage) / item_num
valid_coverage

912


0.266900790166813

## NDCG

In [33]:
pos_len = [0] * item_num
pos_index = np.zeros(topk_idx.shape)

for u in range(topk_idx.shape[0]):
    for j in range(topk):
        if topk_idx[u,j].item() in test_user_list[u]:
            pos_index[u,j] = 1


len_rank = np.full_like(pos_len, pos_index.shape[1])
idcg_len = np.where(pos_len > len_rank, len_rank, pos_len)
iranks = np.zeros_like(pos_index, dtype=np.float32)
iranks[:, :] = np.arange(1, pos_index.shape[1] + 1)
idcg = np.cumsum(1.0 / np.log2(iranks + 1), axis=1)
for row, idx in enumerate(idcg_len):
    idcg[row, idx:] = idcg[row, idx - 1]

ranks = np.zeros_like(pos_index, dtype=np.float32)
ranks[:, :] = np.arange(1, pos_index.shape[1] + 1)
dcg = 1.0 / np.log2(ranks + 1)
dcg = np.cumsum(np.where(pos_index, dcg, 0), axis=1)
ndcg = dcg / idcg
np.mean(ndcg,axis=0)[-1]

0.112563014

## Gini

In [34]:
cnt = [0 for i in range(item_num)]
row, col = topk_idx.shape
for i in range(row):
    for j in range(col):
        cnt[topk_idx[i,j].item()] += 1

cnt.sort()
giny = 0
height, area = 0, 0
for c in cnt:
    height += c
    area += height-c/2
fair_area = height*item_num/2
giny = (fair_area-area)/fair_area
giny

0.8758658014950569