In [None]:
# Switch the working directory to the parent folder; the relative paths used later
# ("train_val_test/", "struct/", "experiments/", "RecBole/") are resolved from there.
# NOTE: the move is relative to the current directory, so re-running this cell
# climbs one more level each time -- run it exactly once per kernel session.
%cd ../

In [3]:
import pickle
import pandas as pd
import numpy as np
import os
import torch
import copy, builtins, time

from collections import OrderedDict
from recbole.config import Config
from recbole.evaluator.evaluator import Evaluator
    

import warnings
warnings.simplefilter("ignore")

def print(*args, **kwargs):
    """Drop-in replacement for the built-in ``print`` that logs to a file.

    The message is appended to ``{path}/log_{dataset}.txt`` instead of being
    written to stdout. ``path`` and ``dataset`` are looked up as globals at call
    time, so both must be defined before the first call.

    Returns whatever ``builtins.print`` returns.
    """
    log_location = f"{path}/log_{dataset}.txt"
    with open(log_location, 'a+') as log_handle:
        outcome = builtins.print(*args, file=log_handle, **kwargs)
    return outcome

def _read_split(dataset, split):
    """Load one pickled interaction split of ``dataset`` as a DataFrame.

    ``split`` is the file-name suffix: ``"train"``, ``"valid"`` or ``"test"``.
    """
    # NOTE: pickle.load can execute arbitrary code -- only read split files you produced yourself.
    with open(f"train_val_test/{dataset}_{split}.pickle","rb") as f:
        return pd.DataFrame(pickle.load(f))


def _items_per_user(interactions, item_id):
    """Collapse an interaction table into one set of item ids per ``user_id``."""
    # Select the item column *before* aggregating so the other columns are not
    # needlessly turned into lists as well.
    return interactions.groupby("user_id")[item_id].agg(list).apply(set)


def _nan_to_empty_set(frame):
    """Return a copy of ``frame`` in which every missing cell (float NaN) is an empty set.

    Implemented column by column with ``Series.map`` because ``DataFrame.applymap``
    is deprecated in recent pandas while ``DataFrame.map`` is missing in older ones.
    """
    filled = frame.copy()
    for column in filled.columns:
        # Cells are either sets or the float NaN introduced by index alignment;
        # every NaN gets its own fresh set object.
        filled[column] = filled[column].map(
            lambda cell: set() if isinstance(cell, float) else cell
        )
    return filled


def load_dataset(dataset, list_k):
    """Load the train/valid/test splits of ``dataset`` as per-user item sets.

    Parameters
    ----------
    dataset : str
        Dataset name used both in the split file names
        (``train_val_test/{dataset}_{train,valid,test}.pickle``) and in the RecBole config.
    list_k : list of int
        Cut-offs passed to the RecBole ``Config`` as ``topk``.

    Returns
    -------
    (df, df_test) : tuple of DataFrame
        Both are indexed by ``user_id`` and have the columns ``train``, ``valid``
        and ``pure_test``, each cell being a set of item ids (empty set when the
        user has no interaction in that split).
        ``df`` is indexed by the users of the train split (the first column assigned
        fixes the index; the others are aligned to it).
        ``df_test`` keeps only the rows of ``df`` that have test interactions.
    """
    train = _read_split(dataset, "train")
    val = _read_split(dataset, "valid")
    test = _read_split(dataset, "test")

    # The config is only needed to find out how the item-id column is called.
    config = Config(
                model="Pop", 
                dataset=dataset, 
                config_file_list=["RecBole/recbole/properties/overall.yaml"],
                config_dict={"topk":list_k,"metrics":[
                                    "FixedIAA",
                                    "FixedIFD",
                                    "FixedIIF"]}
                )

    item_id = config.final_config_dict["ITEM_ID_FIELD"]

    df = pd.DataFrame()
    df["train"] = _items_per_user(train, item_id)
    df["valid"] = _items_per_user(val, item_id)
    df["pure_test"] = _items_per_user(test, item_id)

    # Must be taken before the NaNs are replaced: NaN marks "no test items".
    df_test = df[~df.pure_test.isna()]

    return _nan_to_empty_set(df), _nan_to_empty_set(df_test)

# Ranking cut-offs handed to every RecBole Config as "topk".
list_k = [1, 3, 5, 10]

# Names of the datasets the experiment loop iterates over.
list_dataset = [
    "Amazon-lb",
    "Lastfm",
    "QK-video",
    "ML-10M",
]

# Folder that receives the log files and the pickled results.
path = "experiments/most_unfair_fair"

In [4]:
def save_result(dataset, result, strategy):
    """Pickle ``result`` to ``{path}/{dataset}_{strategy}.pickle``.

    ``path`` is read from the global scope; an existing file is overwritten.
    """
    target = f"{path}/{dataset}_{strategy}.pickle"
    with open(target, "wb") as out_file:
        pickle.dump(result, out_file, protocol=pickle.HIGHEST_PROTOCOL)

def exist(dataset, strategy):
    """Tell whether a readable result for ``(dataset, strategy)`` is already stored.

    Looks for ``{path}/{dataset}_{strategy}.pickle`` (``path`` is a global).
    When the file can be loaded, its content is logged and ``True`` is returned.
    A missing file, or one that cannot be unpickled, yields ``False`` so that the
    caller recomputes the result.

    Only failures of the read itself are tolerated: interrupts, undefined globals
    and logging errors propagate instead of being silently reported as
    "no result yet".
    """
    result_file = f"{path}/{dataset}_{strategy}.pickle"
    if not os.path.isfile(result_file):
        return False

    try:
        res = pd.read_pickle(result_file)
    except Exception as err:
        # Truncated/corrupt file: treat as absent, but leave a trace of the reason.
        print(f"Ignoring unreadable result file {result_file}: {err!r}")
        return False

    print(f"Found existing result for {dataset}, {strategy}:")
    print(res)
    return True

In [None]:
# Experiment driver: for every dataset, build four synthetic full rankings per user
# (strategies 1-4 below), plug them into the pickled evaluation struct and score them
# with the fairness measures. Results are pickled via save_result(); a strategy whose
# result file already exists is skipped.

max_k = max(list_k)


def _user_item_pools(u, pos_items, train_val, all_choices):
    """Return the per-user building blocks shared by all ranking strategies.

    Parameters
    ----------
    u : int
        User index; also used as the seed of the shuffle.
    pos_items : sequence of arrays
        ``pos_items[u]`` holds the relevant items of user ``u``.
    train_val : sequence of arrays
        ``train_val[u]`` holds the items user ``u`` interacted with in train/valid.
    all_choices : array
        All item ids that may appear in a ranking.

    Returns
    -------
    (rel_item_u, choice_for_u, train_val_u)
        ``rel_item_u``: shuffled copy of ``pos_items[u]`` (legacy global NumPy RNG,
        re-seeded with ``u`` -- note that this resets the global NumPy seed);
        ``choice_for_u``: items of ``all_choices`` that are neither relevant nor in
        train/valid (sorted and unique, as returned by ``np.setdiff1d``);
        ``train_val_u``: ``train_val[u]`` unchanged.
    """
    rel_item_u = copy.deepcopy(pos_items[u]) #copying because shuffle is inplace
    np.random.seed(u) #seed follows user idx
    np.random.shuffle(rel_item_u)

    train_val_u = train_val[u]

    #fillers may be neither relevant nor already seen in train/val
    choice_for_u = np.setdiff1d(all_choices, rel_item_u)
    choice_for_u = np.setdiff1d(choice_for_u, train_val_u)
    return rel_item_u, choice_for_u, train_val_u


for dataset in list_dataset:
    df, df_test = load_dataset("new_"+dataset, list_k)

    #one array per test user with the items seen in train or valid
    train_val = df_test\
                    .apply(lambda x: x.train | x.valid, axis=1)\
                    .apply(lambda x: np.asarray(list(x)))\
                    .values #np array

    #exactly one pickled struct per dataset is expected in struct/
    list_file = os.listdir("struct/")
    file_for_dataset = [x for x in list_file if dataset in x]
    assert len(file_for_dataset) == 1

    # NOTE: pickle.load can execute arbitrary code -- only load struct files you produced yourself.
    with open("struct/"+file_for_dataset[0],"rb") as f:
        struct = pickle.load(f)

    struct.set("rec.topk", None) #most measures don't use this, so let's set this as None, we will update rec.topk for IFD later
    struct.set("data.name", dataset)


    item_matrix = struct.get('rec.items')
    num_items = struct.get('data.num_items') - 1 #item ids used below run from 1 to num_items
    pos_items = struct.get('data.pos_items')

    m = item_matrix.shape[0] #number of users
    all_choices = np.arange(1, num_items+1) #array built once, instead of converting a list for every user

    assert m == train_val.shape[0]

    all_rel_item_on_top = np.zeros((m,num_items))
    half_half = np.zeros((m,num_items))


    #===STRATEGY 1===
    #all rel item at the top = fairest for IAA, II-F

    for u in range(m):
        rel_item_u, choice_for_u, train_val_u = _user_item_pools(u, pos_items, train_val, all_choices)

        num_rel = rel_item_u.shape[0]
        num_train_val_u = train_val_u.shape[0]

        #random the rest from non-positive items (including the ones not in top k), in random order
        rng = np.random.default_rng(u) #seed follows user idx
        irrel_non_train_val = rng.choice(choice_for_u, size=num_items-num_rel-num_train_val_u, replace=False)

        #place items in train/val at the back 
        all_rel_item_on_top_u = np.concatenate([rel_item_u, irrel_non_train_val, train_val_u])

        assert all_rel_item_on_top_u.shape[0] == num_items
        all_rel_item_on_top[u] = all_rel_item_on_top_u

        #===STRATEGY 3===
        # half rel items in front, half rel items at the back: unfairest for IFD div
        # IDEA: modify from the all rel items at the top (cut until ~half//2, and concat it to the back)

        half_u = num_rel //2

        if half_u != 0:
            # suppose num_rel == 5, half_u = 2, same strategy as num_rel = 4 to place 1 more rel item at the front
            # place all train and val items at the back
            half_half_u = np.concatenate([all_rel_item_on_top_u[half_u:num_rel], irrel_non_train_val, all_rel_item_on_top_u[:half_u], train_val_u])
        else:
            # at most one rel item, and it's already in front, so leave it alone
            half_half_u = all_rel_item_on_top_u

        assert half_half_u.shape[0] == num_items

        half_half[u] = half_half_u

    #===STRATEGY 2===
    # all rel items at the bottom
    # - fairest for IFD div and IFD mul
    # - unfairest for IAA and II-F

    #just reverse the strategy 1 (on a copy, so strategy 1's matrix stays independent)
    all_rel_item_back = copy.deepcopy(all_rel_item_on_top)[:, ::-1]

    #0/1 relevance of every position of the reversed ranking
    rec_top_k_strategy2 = np.array([np.isin(all_rel_item_back[u], pos_items[u], assume_unique=True) for u in range(pos_items.shape[0])], dtype=int) 

    assert all_rel_item_back.shape == all_rel_item_on_top.shape

    #===EVALUATE STRATEGY 1===

    print("Start evaluating strategy 1")
 
    #update structs 
    struct.set("rec.items", torch.from_numpy(all_rel_item_on_top.astype(int)))
    struct.set("rec.score",torch.empty((m, num_items+1))) #needed for IAArerank, +1 to add a dummy col for pred_rel (pred_rel only taken from 1: onwards)
    struct.set("rec.all_items", all_rel_item_on_top)
    # we do not update rec.topk as it is not needed by any of these 4 measures

    if not exist(dataset, "strategy1"):
        config1 = Config(
                        model="Pop", 
                        dataset="new_"+dataset, 
                        config_file_list=["RecBole/recbole/properties/overall.yaml"],
                        config_dict={"topk":list_k,
                                    "metrics":
                                        ["IAArerank", 
                                        "FixedIAAinsert", 
                                        "IIF", 
                                        "FixedIIF"
                                        ]
                                    }
                        )
                        #IAA and FixedIAA use rerank/insert version as we don't have the actual pred_rel, #and we need to unfixed att

        evaluator1 = Evaluator(config1)

        start_time = time.time()
        result1 = evaluator1.evaluate(struct)
        print("Strategy 1: ", result1)
        print("total time taken: ", time.time() - start_time)

        save_result(dataset, result1, "strategy1")
 
    #===EVALUATE STRATEGY 2===
    print("Start evaluating strategy 2")
 
    #update structs, but not updating rec score as they are still empty
    struct.set("rec.items", torch.from_numpy(all_rel_item_back.astype(int)))
    struct.set("rec.all_items", all_rel_item_back)

    #update rec.topk as well, because there is IFD mul
    struct.set("rec.topk", torch.from_numpy(rec_top_k_strategy2)) # actually the relevance matrix based on full_item_matrix

    
    if not exist(dataset, "strategy2"):
        #another config - needs all measures
        config2 = Config(
                    model="Pop", 
                    dataset="new_"+dataset, 
                    config_file_list=["RecBole/recbole/properties/overall.yaml"],
                    config_dict={"topk":list_k,
                                 "metrics":
                                    ["IAArerank", 
                                     "FixedIAAinsert", #IAA and FixedIAA use rerank version as we don't have the actual pred_rel
                                     "IFDrerank",
                                     "FixedIFDrerank", #using rerank version for the same reason as IAA & FixedIAA
                                     "IIF", 
                                     "FixedIIF"
                                     ]
                                 }
                    )
        evaluator2 = Evaluator(config2)

        start_time = time.time()
        result2 = evaluator2.evaluate(struct)
        print(result2)
        print("total time taken: ", time.time() - start_time)
        save_result(dataset, result2, "strategy2")


    #===EVALUATE STRATEGY 3===
    print("Start evaluating strategy 3")
    #another config - only IFD div
    #update structs
    struct.set("rec.all_items", half_half)
    #rec.items is cleared and rec.topk is left as set for strategy 2: neither is used by IFD div
    struct.set("rec.items", None)

    if not exist(dataset, "strategy3"):
        # NOTE(review): config1/config2 pass "new_"+dataset while config3/config4 pass
        # the bare dataset name -- confirm that this difference is intended.
        config3 = Config(
                        model="Pop", 
                        dataset=dataset, 
                        config_file_list=["RecBole/recbole/properties/overall.yaml"],
                        config_dict={"topk":list_k,
                                    "metrics": ["FixedIFDdiv"]
                                    }
                        )
        evaluator3 = Evaluator(config3)

        start_time = time.time()
        result3 = evaluator3.evaluate(struct)
        print(result3)
        print("total time taken: ", time.time() - start_time)
        save_result(dataset, result3, "strategy3")

    print("Start evaluating strategy 4")
    for k in list_k:
        print(f"k = {k}")

        if exist(dataset, f"strategy4_{k}"):
            continue

        #===STRATEGY 4===
        # custom strategy (look-up precomputed): unfairest IFD mul
        precomputed_strategy = np.zeros((m,num_items))          
        #the look-up table for k = 10 is stored without the _{k} suffix
        if k != 10:
            precomputed = pd.read_pickle(f"experiments/precomputeIFD/precomputeIFD_{dataset}_{k}.pickle")
        else:
            precomputed = pd.read_pickle(f"experiments/precomputeIFD/precomputeIFD_{dataset}.pickle")

        #the strategy of a user is looked up by the user's number of relevant items
        strategies = [precomputed[len(pos_item_u)]["strategy"] for pos_item_u in pos_items]


        for u, strat in enumerate(strategies):
            rel_item_u, choice_for_u, train_val_u = _user_item_pools(u, pos_items, train_val, all_choices)

            num_rel = rel_item_u.shape[0]
            num_train_val_u = train_val_u.shape[0]

            top, bottom = strat

            item_at_top = rel_item_u[:top]

            # The fillers have to complete the row to exactly num_items entries, so their
            # number depends on how many relevant items are placed: all of them when the
            # remainder goes to the bottom, only `top` otherwise. Subtracting `top` in
            # both cases would over-fill the row whenever bottom != 0 and top < num_rel
            # (rng.choice cannot even draw that many items without replacement).
            num_rel_placed = num_rel if bottom != 0 else top

            rng = np.random.default_rng(u) #seed follows user idx
            irrel_non_train_val = rng.choice(choice_for_u, size=num_items-num_rel_placed-num_train_val_u, replace=False)

            #place items in train/val at the back 
            if bottom != 0:
                item_at_bottom = rel_item_u[top:] #i.e., the rest of the relevant items
                precomputed_strategy_u = np.concatenate([item_at_top, irrel_non_train_val, item_at_bottom, train_val_u])
            else: #everything on top
                precomputed_strategy_u = np.concatenate([item_at_top, irrel_non_train_val, train_val_u])

            assert precomputed_strategy_u.shape[0] == num_items

            precomputed_strategy[u] = precomputed_strategy_u


        # === EVALUATION === 

        # NOTE(review): bare dataset name here as in config3, unlike config1/config2 -- confirm.
        config4 = Config(
                    model="Pop", 
                    dataset=dataset, 
                    config_file_list=["RecBole/recbole/properties/overall.yaml"],
                    config_dict={"topk":[k],
                                "metrics":
                                    ["FixedIFDmul"]
                                }
                    )
        #update structs
        struct.set("rec.all_items", precomputed_strategy)
        #set dataset name for struct
        struct.set("data.name", dataset) 

        evaluator4 = Evaluator(config4)

        start_time = time.time()
        result4 = evaluator4.evaluate(struct)
        print(result4)
        print("total time taken: ", time.time() - start_time)

        save_result(dataset, result4, f"strategy4_{k}")
