# Precomputing $IFD_\times$

In [2]:
import numpy as np
from scipy.spatial.distance import pdist
import builtins
import pickle
import time

import warnings 
warnings.filterwarnings('ignore')

import os
import datetime

In [3]:
# Directory that receives the log files and the pickled precomputed results.
path = "precomputeIFD"
# Directory scanned for the pickled "best struct" file of each dataset/model pair.
best_struct_path = "../cluster/best_struct"

def print(*args, **kwargs):
    """Shadow the built-in ``print`` so that messages are appended to a log file.

    The target file is ``{path}/log_{dataset}.txt``; ``path`` and ``dataset``
    are looked up as globals at call time, so both must exist before the
    first call. All arguments are forwarded unchanged to ``builtins.print``.
    """
    log_file = f"{path}/log_{dataset}.txt"
    with open(log_file, 'a+') as log_handle:
        outcome = builtins.print(*args, file=log_handle, **kwargs)
    return outcome
    
def compute_IFD_mul(ranks, cut_off=10):
    """Return the mean squared pairwise difference of the position-weighted ``ranks``.

    Entry ``i`` (1-based) of ``ranks`` is multiplied by the DCG-style weight
    ``1 / log2(i + 1)``; weights from index ``cut_off`` onwards are set to
    zero. The result is the mean of ``(x_i - x_j) ** 2`` over all unordered
    pairs of the weighted entries.

    The mean is obtained in O(n) time and memory through the identity
    ``mean_{i<j} (x_i - x_j)^2 == 2 * var(x, ddof=1)`` instead of
    materialising all ``n * (n - 1) / 2`` pairwise distances.

    Parameters
    ----------
    ranks : sequence of numbers
        One value per item, in ranking order.
    cut_off : int or None, optional
        Number of leading positions that keep a non-zero weight. A falsy
        value (``None`` or ``0``) disables the truncation.

    Returns
    -------
    numpy.float64
        The mean squared pairwise difference; NaN when ``ranks`` has fewer
        than two entries (there is no pair to average over).
    """
    num_item = len(ranks)
    pos_index = np.arange(num_item) + 1
    dcg_weight = 1 / np.log2(pos_index + 1)
    if cut_off:
        # Positions beyond the cut-off contribute nothing.
        dcg_weight[cut_off:] = 0

    weighted = np.asarray(ranks) * dcg_weight
    # Twice the unbiased sample variance equals the mean over all pairs of the
    # squared difference, so no pairwise distance matrix is needed.
    return 2.0 * np.var(weighted, ddof=1)

def precompute_max_IFD_mul(cut_off, num_items, num_rel_u, prev_rel_u, prev_strategy):
    """Pick the better-scoring of two candidate placements of relevant items.

    Two rankings of length ``num_items`` are scored with ``compute_IFD_mul``:

    * "all top": the ``num_rel_u`` relevant items (1s) first, then the
      non-relevant items (0s);
    * "x-y": ``top`` relevant items first, the non-relevant items in the
      middle and ``bottom`` relevant items last.

    ``bottom`` is derived from the second entry of ``prev_strategy``: when
    that entry is non-zero it grows by ``num_rel_u - prev_rel_u``, otherwise
    it is set to 1. ``top`` is whatever remains of ``num_rel_u``.

    Returns a pair ``([n_top, n_bottom], score)`` for the winning placement;
    a tie is resolved in favour of "all top", reported as ``[num_rel_u, 0]``.
    """
    n_irrelevant = num_items - num_rel_u

    # x-y strategy: carry the bottom count over from the previous strategy
    _prev_top, prev_bottom = prev_strategy
    if prev_bottom == 0:
        n_bottom = 1
    else:
        n_bottom = prev_bottom + num_rel_u - prev_rel_u
    n_top = num_rel_u - n_bottom
    assert n_top + n_bottom == num_rel_u

    all_top_ranking = (1,) * num_rel_u + (0,) * n_irrelevant
    split_ranking = (1,) * n_top + (0,) * n_irrelevant + (1,) * n_bottom

    score_all_top = compute_IFD_mul(all_top_ranking, cut_off)
    score_split = compute_IFD_mul(split_ranking, cut_off)

    if score_all_top >= score_split:
        return [num_rel_u, 0], score_all_top
    return [n_top, n_bottom], score_split
# Datasets to process and the model whose "best struct" file is looked up.
list_dataset = ["Amazon-lb", "Lastfm", "QK-video", "ML-10M"]
model_name = "NCL"

# Cut-offs of interest; the largest one is passed to the precomputation.
list_k = [10]
max_k = max(list_k)


In [None]:

# For every dataset: reuse the cached result if it can be loaded; otherwise
# precompute, for each distinct number of relevant items per user, the
# best-scoring placement strategy and its score, and cache them as a pickle.
for dataset in list_dataset:
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(now)

    print(f"Doing {dataset} - {model_name}")

    try:
        with open(f"{path}/precomputeIFD_{dataset}.pickle","rb") as f:
            found = pickle.load(f)
            print("found existing precomputed result ")
            print(found)
    except Exception:
        # Any ordinary failure (missing or unreadable cache) means "recompute".
        # Exception is caught instead of using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the run.
        print(f"Cannot find existing result for {dataset}, proceed with precomputing")

        # Exactly one struct file is expected per dataset/model pair.
        list_filename = [f for f in os.listdir(best_struct_path) if dataset in f and model_name in f]

        assert len(list_filename) == 1

        with open(f"{best_struct_path}/{list_filename[0]}","rb") as f:
            struct = pickle.load(f)
            # NOTE(review): the -1 presumably discounts a placeholder/padding item - confirm.
            num_items = struct.get("data.num_items") - 1
            pos_items = struct.get("data.pos_items")
            # Distinct per-user counts of relevant items, ascending.
            unique_rel_item = sorted({pos_items_u.size for pos_items_u in pos_items})
        print("List of number of unique relevant items per user", unique_rel_item)

        result = {}
        start_time = time.time()
        prev_rel_u = 0
        prev_strategy = [1,0] #at least 1 relevant item
        for num_rel_u in unique_rel_item:
            # Each step is seeded with the strategy chosen for the previous count.
            strategy, score = precompute_max_IFD_mul(max_k, num_items, num_rel_u, prev_rel_u, prev_strategy)
            #save strategy
            result[num_rel_u] = {"strategy": strategy, "score": score}
            prev_strategy = strategy
            prev_rel_u = num_rel_u
            # Per-step logging is skipped for the "ML" datasets.
            if "ML" not in dataset:
                print(num_rel_u, strategy, score)

        print("total time taken: ", time.time() - start_time)
        print(result)

        #save results
        with open(f"{path}/precomputeIFD_{dataset}.pickle","wb") as f:
            pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

## Precomputing $IFD_\times$ at cut-offs $k = 1, 3, 5$ (for non-real and non-localisation)

In [11]:
def print(*args, **kwargs):
    """Shadow the built-in ``print``; append messages to a per-dataset, per-k log.

    The target file is ``{path}/log_{dataset}_{k}.txt``. ``path``, ``dataset``
    and ``k`` are read as globals at call time, so all three must be defined
    before the first call. Arguments are forwarded to ``builtins.print``.
    """
    log_file = f"{path}/log_{dataset}_{k}.txt"
    with open(log_file, 'a+') as log_handle:
        outcome = builtins.print(*args, file=log_handle, **kwargs)
    return outcome

list_dataset = [
                "Amazon-lb", 
                "Lastfm", 
                "QK-video",
                "ML-10M"
]

# For every dataset and every cut-off k in {1, 3, 5}: reuse the cached result
# if it can be loaded, otherwise precompute and cache it.
for dataset in list_dataset:
    # Taken once per dataset; the same timestamp is logged for every k.
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Exactly one struct file is expected per dataset/model pair.
    list_filename = [f for f in os.listdir(best_struct_path) if dataset in f and model_name in f]

    assert len(list_filename) == 1

    with open(f"{best_struct_path}/{list_filename[0]}","rb") as f:
        struct = pickle.load(f)
        # NOTE(review): the -1 presumably discounts a placeholder/padding item - confirm.
        num_items = struct.get("data.num_items") - 1
        pos_items = struct.get("data.pos_items")
        # Distinct per-user counts of relevant items, ascending.
        unique_rel_item = sorted({pos_items_u.size for pos_items_u in pos_items})

    for k in [1, 3, 5]:
        # Fresh state per cut-off, so every pickle is built from its own dict.
        result = {}
        prev_rel_u = 0
        prev_strategy = [1,0] #at least 1 relevant item

        # Logging happens inside the k loop because the log file name includes k.
        print(now)
        print(f"Doing {dataset} - {k}")

        try: 
            with open(f"{path}/precomputeIFD_{dataset}_{k}.pickle","rb") as f:
                found = pickle.load(f)
                print("found existing precomputed result ")
                print(found)
        except Exception:
            # Any ordinary failure (missing or unreadable cache) means "recompute".
            # Exception is caught instead of using a bare except so that
            # KeyboardInterrupt / SystemExit still stop the run.
            print(f"Cannot find existing result for {dataset}, k={k}, proceed with precomputing")
            start_time = time.time()
            for num_rel_u in unique_rel_item:
                # Each step is seeded with the strategy chosen for the previous count.
                strategy, score = precompute_max_IFD_mul(k, num_items, num_rel_u, prev_rel_u, prev_strategy)
                #save strategy
                result[num_rel_u] = {"strategy": strategy, "score": score}
                prev_strategy = strategy
                prev_rel_u = num_rel_u

            print("total time taken: ", time.time() - start_time)
            print(result)

            #save results
            with open(f"{path}/precomputeIFD_{dataset}_{k}.pickle","wb") as f:
                pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

In [5]:
# Synthetic setting: 10000 items and a single relevant-item count (10) to precompute.
dataset = "artificial"
num_items = 10000
unique_rel_item = [10]  # relevant-item counts iterated over below
now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # timestamp written to the log below

def print(*args, **kwargs):
    """Shadow the built-in ``print``; append messages to ``{path}/log_{dataset}.txt``.

    ``path`` and ``dataset`` are read as globals when the function is called.
    Every argument is passed through to ``builtins.print`` untouched.
    """
    target = f"{path}/log_{dataset}.txt"
    with open(target, 'a+') as sink:
        outcome = builtins.print(*args, file=sink, **kwargs)
    return outcome

# Reuse the cached result for the artificial setting if it can be loaded;
# otherwise precompute it and cache it as a pickle.
print(now)

print(f"Doing {dataset}")

try:
    with open(f"{path}/precomputeIFD_{dataset}.pickle","rb") as f:
        found = pickle.load(f)
        print("found existing precomputed result ")
        print(found)
except Exception:
    # Any ordinary failure (missing or unreadable cache) means "recompute".
    # Exception is caught instead of using a bare except so that
    # KeyboardInterrupt / SystemExit still stop the run.
    print(f"Cannot find existing result for {dataset}, proceed with precomputing")


    print("List of number of unique relevant items per user", unique_rel_item)

    result = {}
    start_time = time.time()
    prev_rel_u = 0
    prev_strategy = [1,0] #at least 1 relevant item
    for num_rel_u in unique_rel_item:
        # Each step is seeded with the strategy chosen for the previous count.
        strategy, score = precompute_max_IFD_mul(max_k, num_items, num_rel_u, prev_rel_u, prev_strategy)
        #save strategy
        result[num_rel_u] = {"strategy": strategy, "score": score}
        prev_strategy = strategy
        prev_rel_u = num_rel_u
        # Per-step logging is skipped for dataset names containing "ML".
        if "ML" not in dataset:
            print(num_rel_u, strategy, score)

    print("total time taken: ", time.time() - start_time)
    print(result)

    #save results
    with open(f"{path}/precomputeIFD_{dataset}.pickle","wb") as f:
        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)

# Check how many users have only 1 relevant item in the test set

In [1]:
import builtins
import os
import pickle
from collections import Counter

# Directory scanned for the pickled "best struct" file of each dataset/model pair.
best_struct_path = "../cluster/best_struct"
list_dataset = [
                "Amazon-lb", 
                "Lastfm", 
                "QK-video",
                "ML-10M", 
                ]
model_name = "NCL"

list_k = [10]
max_k = max(list_k)

# Other cells of this notebook rebind the name ``print`` to a file logger.
# ``builtins.print`` is called explicitly so the counts always appear in the
# cell output, whatever was executed before this cell.
for dataset in list_dataset:
    builtins.print(f"Doing {dataset} - {model_name}")

    # Exactly one struct file is expected per dataset/model pair.
    list_filename = [f for f in os.listdir(best_struct_path) if dataset in f and model_name in f]

    assert len(list_filename) == 1

    with open(f"{best_struct_path}/{list_filename[0]}","rb") as f:
        struct = pickle.load(f)
        pos_items = struct.get("data.pos_items")
        # Histogram: number of relevant items -> number of users with that many.
        unique_rel_item = Counter(pos_items_u.size for pos_items_u in pos_items)
    builtins.print(unique_rel_item[1]) #num users with 1 rel item

Doing Amazon-lb - NCL
104
Doing Lastfm - NCL
21
Doing QK-video - NCL
1320
Doing ML-10M - NCL
61
