In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import glob

from datetime import datetime

In [2]:
from warnings import filterwarnings
# Ignore every warning for the rest of the kernel session (keeps cell output short).
# NOTE(review): this also hides deprecation warnings emitted by the libraries used
# below; consider narrowing the filter to specific categories/modules.
filterwarnings("ignore")

In [None]:
# Ranking cut-off: get_simple_measure only looks at the first `top_k` columns.
top_k = 10

# Dataset identifiers; used both in the glob pattern for struct files and in the
# `dataset==@dataset` queries further down.
list_dataset = ["Lastfm", "Amazon-lb", "QK-video", "Jester", "ML-10M", "ML-20M"]

# Directory (relative to the notebook) that receives the CSV outputs.
experiment_name = "simple_measure"

def get_file_names(dataset, model_type):
    """Return the struct files of one dataset for the given model family.

    Parameters
    ----------
    dataset : str
        Dataset identifier; any file whose name contains it is matched.
    model_type : str
        ``"base"`` searches ``../cluster/best_struct``,
        ``"rerank"`` searches ``../reranking/rerank_struct``.

    Returns
    -------
    list of str
        Matching paths, sorted case-insensitively so that the row order of the
        results does not depend on the file system (glob order is arbitrary).

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"base"`` nor ``"rerank"`` (previously this
        surfaced as an ``UnboundLocalError`` on ``path``).
    """
    struct_dirs = {
        "base": "../cluster/best_struct",
        "rerank": "../reranking/rerank_struct",
    }
    if model_type not in struct_dirs:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(struct_dirs)}"
        )

    path = struct_dirs[model_type]
    file_names = glob.glob(f"{path}/*{dataset}*")

    return sorted(file_names, key=str.lower)

def load_struct(file_name):
    """Unpickle the object stored at ``file_name`` and return it."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    struct = pd.read_pickle(file_name)
    return struct

def clean_name(name, data):
    """Derive a short model label from a struct file path.

    Directory prefixes, the ``new_`` marker, the ``.pth`` extension and the
    dataset name are removed; the first ``_``-separated token that remains is
    the model name. For ``.pickle`` files a suffix is appended depending on
    which token the name contains: ``GS`` -> ``-GS``, ``borda`` -> ``-BC``,
    ``combmnz`` -> ``-CM`` (checked in that order).

    Parameters
    ----------
    name : str
        File path as returned by ``glob``.
    data : str
        Dataset identifier to strip from the file name.

    Returns
    -------
    str
        The model label, e.g. ``"BPR"`` or ``"NCL-BC"``.
    """
    # The prefixes used to be matched with a backslash before the file name only,
    # which is what glob yields on Windows. Normalising the separator first makes
    # the same prefixes match on POSIX paths as well.
    name = name.replace("\\", "/")

    for fragment in (
        "../cluster/best_struct/struct_",
        "../reranking/rerank_struct/",
        "new_",
        ".pth",
        data,
    ):
        name = name.replace(fragment, "")

    name = name.strip("_")
    name = name.replace("--", "_")

    model_name = name.split("_")[0]

    if ".pickle" in name:
        if "GS" in name:
            model_name += "-GS"
        elif "borda" in name:
            model_name += "-BC"
        elif "combmnz" in name:
            model_name += "-CM"

    return model_name

def get_simple_measure(struct, k=None):
    """Count distinct recommended items and summed hits within the top-k columns.

    Parameters
    ----------
    struct : mapping-like
        Must provide ``struct.get("rec.items")`` and ``struct.get("rec.topk")``.
        Both are sliced as ``[:, :k]`` and must support ``.unique()`` /
        ``.sum().item()`` respectively (presumably 2-D torch tensors with one
        row per user -- TODO confirm against the code that writes the structs).
    k : int, optional
        Number of leading columns to consider. Defaults to the module-level
        ``top_k`` so existing calls keep their behaviour.

    Returns
    -------
    tuple
        ``(num_unique_rec_items, num_rel_items)``: the number of distinct values
        in the first ``k`` columns of ``rec.items`` and the sum of the first
        ``k`` columns of ``rec.topk``.
    """
    if k is None:
        k = top_k

    rec_items_at_k = struct.get("rec.items")[:, :k]
    hits_at_k = struct.get("rec.topk")[:, :k]

    num_unique_rec_items = rec_items_at_k.unique().shape[0]
    num_rel_items = hits_at_k.sum().item()

    return num_unique_rec_items, num_rel_items

In [4]:
# Defines the result columns; the collection loop below fills in one row per struct file.
df_result = pd.DataFrame(columns=["dataset","source","num_unique_rec_items","num_rel_items"])

In [None]:
# Collect one record per struct file and build the frame once at the end.
# Growing df_result with pd.concat inside the loop copied the whole frame on
# every iteration and appended duplicate rows whenever this cell was re-run.
records = []

for dataset in list_dataset:
    file_names_base = get_file_names(dataset,"base")
    file_names_rerank = get_file_names(dataset,"rerank")

    # base models first, then their re-ranked variants
    file_names = file_names_base + file_names_rerank

    for file_name in file_names:
        struct = load_struct(file_name)
        model_name = clean_name(file_name, dataset)
        num_unique_rec_items, num_rel_items = get_simple_measure(struct)

        print(f"{dataset}, {model_name}, {num_unique_rec_items}, {num_rel_items}")
        records.append([dataset, model_name, num_unique_rec_items, num_rel_items])

# Reuse the schema of the (empty) df_result defined above; re-running is idempotent.
df_result = pd.DataFrame(records, columns=df_result.columns)

Lastfm, BPR, 1028, 3271
Lastfm, ItemKNN, 1343, 3159
Lastfm, MultiVAE, 1465, 3227
Lastfm, NCL, 1284, 3379
Lastfm, BPR-BC, 1392, 2579
Lastfm, BPR-CM, 1494, 1698
Lastfm, BPR-GS, 1141, 3099
Lastfm, ItemKNN-BC, 1817, 2704
Lastfm, ItemKNN-CM, 1919, 1630
Lastfm, ItemKNN-GS, 1508, 3070
Lastfm, MultiVAE-BC, 1860, 2370
Lastfm, MultiVAE-CM, 1916, 1390
Lastfm, MultiVAE-GS, 1567, 2964
Lastfm, NCL-BC, 1761, 2591
Lastfm, NCL-CM, 1859, 1599
Lastfm, NCL-GS, 1422, 3182
Amazon-lb, BPR, 439, 6
Amazon-lb, ItemKNN, 518, 22
Amazon-lb, MultiVAE, 183, 18
Amazon-lb, NCL, 189, 17
Amazon-lb, BPR-BC, 502, 6
Amazon-lb, BPR-CM, 474, 9
Amazon-lb, BPR-GS, 456, 6
Amazon-lb, ItemKNN-BC, 540, 10
Amazon-lb, ItemKNN-CM, 488, 8
Amazon-lb, ItemKNN-GS, 528, 21
Amazon-lb, MultiVAE-BC, 240, 4
Amazon-lb, MultiVAE-CM, 233, 7
Amazon-lb, MultiVAE-GS, 208, 23
Amazon-lb, NCL-BC, 256, 9
Amazon-lb, NCL-CM, 252, 5
Amazon-lb, NCL-GS, 217, 18
QK-video, BPR, 4017, 370
QK-video, ItemKNN, 5787, 144
QK-video, MultiVAE, 650, 408
QK-video, NCL,

In [6]:
# (rows, columns) of the collected results: one row per struct file.
df_result.shape

(96, 4)

In [7]:
# Show the collected counts per (dataset, source).
df_result 

Unnamed: 0,dataset,source,num_unique_rec_items,num_rel_items
0,Lastfm,BPR,1028,3271
1,Lastfm,ItemKNN,1343,3159
2,Lastfm,MultiVAE,1465,3227
3,Lastfm,NCL,1284,3379
4,Lastfm,BPR-BC,1392,2579
...,...,...,...,...
91,ML-20M,MultiVAE-CM,2531,1979
92,ML-20M,MultiVAE-GS,2069,2794
93,ML-20M,NCL-BC,1697,2640
94,ML-20M,NCL-CM,1846,2048


In [8]:
# NOTE: this line is active -- it overwrites simple_measure_results.csv on every run.
# Comment it out to keep the results of a previous run.
# NOTE(review): assumes the `experiment_name` directory already exists.
df_result.to_csv(f"{experiment_name}/simple_measure_results.csv", index=False)

In [9]:
# Column names of the relevance measures read by get_avg.
rel_measures = [
    "P@10",
    "MAP@10",
    "R@10",
    "NDCG@10",
]
# Column names of the fairness measures read by get_avg.
fair_measures = [
    "Jain_our@10",
    "Ent_our@10",
    "Gini_our@10",
]

def distance_based_rank_for_corr(model_distance_dict, data):
    """Turn the per-dataset distance table into a (measure pair x model) score frame.

    ``model_distance_dict[data]`` is unstacked into one row per
    (column label, index label) pair; the two labels are called ``rel`` and
    ``fair`` and the cell value ``models``. Each cell is indexed as
    ``cell[0]`` (model names) and ``cell[1]`` (scores), which are zipped
    together -- presumably a tuple of two equally long sequences; TODO confirm
    against the code that writes the distance pickles.

    Rows are kept only if ``rel`` matches ``^P|^R|NDCG|MAP``, ``fair`` matches
    ``Jain|Gini|Ent`` and contains ``our``, and ``cell[1]`` is not null.

    Parameters
    ----------
    model_distance_dict : mapping
        Maps a dataset name to a DataFrame of such cells.
    data : str
        Dataset key to look up.

    Returns
    -------
    for_corr : pandas.DataFrame
        One row per ``"<rel>-<fair>"`` label, one column per model, holding the
        negated scores.
    dict_rank_based_on_distance : dict
        ``{"<rel>-<fair>": {model: score}}`` with the scores as stored (not negated).
    """
    pairs = model_distance_dict[data].unstack().reset_index()
    pairs.columns = ["rel", "fair", "models"]

    # Successive filters, same order as before.
    pairs = pairs.loc[pairs.rel.str.contains("^P|^R|NDCG|MAP")]
    pairs = pairs.loc[pairs.fair.str.contains("Jain|Gini|Ent")]
    pairs = pairs.loc[pairs.fair.str.contains("our")]
    # Drop pairs whose score entry (cell[1]) is missing.
    pairs = pairs.loc[pairs.models.apply(lambda x: x[1]).dropna().index]

    labels = pairs.rel + "-" + pairs.fair

    # Iterate over the cells directly. The former transpose + iterrows + item[0]
    # round-trip relied on positional fallback of Series.__getitem__, which is
    # deprecated in pandas 2.x.
    dict_rank_based_on_distance = {}
    for label, cell in zip(labels, pairs.models):
        model_names = cell[0]
        scores = cell[1]
        dict_rank_based_on_distance[label] = dict(zip(model_names, scores))

    # Element-wise negation, column by column (DataFrame.applymap is deprecated).
    for_corr = pd.DataFrame(dict_rank_based_on_distance).T.apply(
        lambda column: column.map(lambda x: -x)
    )

    return for_corr, dict_rank_based_on_distance


def get_avg(this_data, rel_cols=None, fair_cols=None):
    """Average every relevance measure with every fairness measure, per row.

    Rows whose ``source`` equals ``"pareto"`` are excluded. Fairness columns
    whose name contains ``"Gini"`` are replaced by ``1 - value`` before
    averaging. For each fairness column ``f`` and relevance column ``r`` the
    score is ``(r + f) / 2``.

    Parameters
    ----------
    this_data : pandas.DataFrame
        Must contain a ``source`` column plus the relevance and fairness columns.
        It is not modified.
    rel_cols : list of str, optional
        Relevance columns; defaults to the module-level ``rel_measures``.
    fair_cols : list of str, optional
        Fairness columns; defaults to the module-level ``fair_measures``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``rel``, ``fair``, ``score``, ``source``.
        The index restarts for every fairness column.
    """
    if rel_cols is None:
        rel_cols = rel_measures
    if fair_cols is None:
        fair_cols = fair_measures
    rel_cols = list(rel_cols)
    fair_cols = list(fair_cols)

    out_columns = ["rel", "fair", "score", "source"]

    for_val = this_data.loc[this_data.source != "pareto"]
    rel_values = for_val[rel_cols].values

    # Work on an explicit copy so the Gini flip never writes into a slice of the input.
    for_val_fair = for_val[fair_cols].copy()
    is_gini = for_val_fair.columns.str.contains("Gini")
    for_val_fair.loc[:, is_gini] = 1 - for_val_fair.loc[:, is_gini]

    # Collect the per-fairness-column blocks and concatenate once; concatenating
    # onto an empty frame inside the loop relied on a deprecated dtype behaviour.
    blocks = []
    for col in for_val_fair.columns:
        avg_val_for_col = (rel_values + for_val_fair[col].values.reshape(-1, 1)) / 2
        df_avg_col = pd.DataFrame(avg_val_for_col, columns=rel_cols)
        df_avg_col["source"] = for_val.source.values
        df_avg_col["fair"] = col
        melted = df_avg_col.melt(["fair", "source"], var_name="rel", value_name="score")
        blocks.append(melted[out_columns])

    if not blocks:
        return pd.DataFrame(columns=out_columns)

    return pd.concat(blocks)

In [10]:
# Reload the counts from the CSV written by the save cell above; this rebinds
# df_result to the on-disk version, so the cells below use the file contents.
df_result = pd.read_csv(f"{experiment_name}/simple_measure_results.csv")
df_result

Unnamed: 0,dataset,source,num_unique_rec_items,num_rel_items
0,Lastfm,BPR,1028,3271
1,Lastfm,ItemKNN,1343,3159
2,Lastfm,MultiVAE,1465,3227
3,Lastfm,NCL,1284,3379
4,Lastfm,BPR-BC,1392,2579
...,...,...,...,...
91,ML-20M,MultiVAE-CM,2531,1979
92,ML-20M,MultiVAE-GS,2069,2794
93,ML-20M,NCL-BC,1697,2640
94,ML-20M,NCL-CM,1846,2048


In [11]:
# Per-dataset measure scores including the "pareto" rows; consumed by get_avg in
# the correlation loop. NOTE(review): this file is not written in this notebook --
# presumably it is produced by another one; confirm where.
combined_df_full = pd.read_csv("corr/combined_df_full.csv")
combined_df_full

Unnamed: 0,P@10,MAP@10,R@10,NDCG@10,Jain_our@10,Ent_our@10,Gini_our@10,dataset,source
0,0.382609,1.000000,0.991438,1.000000,0.355592,0.936316,0.213694,Amazon-lb,pareto
1,0.382380,0.999746,0.991184,0.999838,0.356685,0.936480,0.213537,Amazon-lb,pareto
2,0.382151,0.999517,0.990955,0.999693,0.357775,0.936644,0.213378,Amazon-lb,pareto
3,0.381922,0.999231,0.990669,0.999510,0.358859,0.936807,0.213219,Amazon-lb,pareto
4,0.381693,0.999002,0.990478,0.999364,0.359938,0.936970,0.213060,Amazon-lb,pareto
...,...,...,...,...,...,...,...,...,...
29085,0.010899,0.017391,0.047364,0.030122,0.014100,0.457582,0.975616,QK-video,MultiVAE-GS
29086,0.014001,0.021764,0.060801,0.038090,0.020076,0.507278,0.965565,QK-video,NCL
29087,0.010757,0.012823,0.045233,0.025319,0.075834,0.667371,0.909017,QK-video,NCL-BC
29088,0.008196,0.010063,0.033125,0.019387,0.071264,0.673934,0.902198,QK-video,NCL-CM


In [12]:
# One row per (dataset, source) with all measure scores; input of the correlation loop.
# NOTE(review): this file is not written in this notebook -- confirm its origin.
model_scores = pd.read_csv("corr/model_scores.csv")
model_scores

Unnamed: 0,dataset,source,AI-F_ori@10,Ent_our@10,Gini_our@10,IAA_true_ori@10,IBO_our@10,II-F_ori@10,Jain_our@10,MAP@10,MME_ori@10,NDCG@10,P@10,R@10
0,Amazon-lb,BPR,0.000327,0.746980,0.746800,0.011300,0.019139,0.005688,0.222596,0.002025,0.001003,0.003228,0.001373,0.005435
1,Amazon-lb,BPR-BC,0.000234,0.839115,0.600567,0.011292,0.028708,0.005682,0.431615,0.003485,0.000744,0.004925,0.001373,0.007532
2,Amazon-lb,BPR-CM,0.000244,0.809428,0.660077,0.011292,0.038278,0.005680,0.358523,0.004335,0.000760,0.006370,0.002059,0.010278
3,Amazon-lb,BPR-GS,0.000316,0.776303,0.702932,0.011300,0.019139,0.005688,0.258864,0.002025,0.000997,0.003228,0.001373,0.005435
4,Amazon-lb,ItemKNN,0.000294,0.802021,0.664811,0.011228,0.062201,0.005649,0.271328,0.006094,0.001011,0.010973,0.005034,0.013440
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,QK-video,MultiVAE-GS,0.000051,0.457582,0.975616,0.001144,0.032675,0.000656,0.014100,0.017391,0.000242,0.030122,0.010899,0.047364
92,QK-video,NCL,0.000035,0.507278,0.965565,0.001139,0.042916,0.000653,0.020076,0.021764,0.000243,0.038090,0.014001,0.060801
93,QK-video,NCL-BC,0.000005,0.667371,0.909017,0.001148,0.060229,0.000659,0.075834,0.012823,0.000189,0.025319,0.010757,0.045233
94,QK-video,NCL-CM,0.000005,0.673934,0.902198,0.001151,0.053645,0.000662,0.071264,0.010063,0.000191,0.019387,0.008196,0.033125


In [13]:
# get the big corr table
#
# For every dataset: stack (a) the measure scores, (b) the rel/fair averages,
# (c) the simple counts and (d) the distance-based scores for alpha = 0.00..1.00
# as rows (one column per model), then write the Kendall correlation between rows.

for dataset in list_dataset:
    print(dataset)
    #rel and fair measures
    model_scores_for_data = model_scores.query("dataset==@dataset")
    for_corr = model_scores_for_data.drop(columns=["dataset"])
    for_corr = for_corr.loc[:, ~for_corr.columns.str.contains("IBO_ori|IWO_ori")]
    for_corr = for_corr.T
    for_corr.columns = for_corr.loc["source"]
    for_corr = for_corr.drop(index=["source"])

    # flip the sign of these measures (presumably the lower-is-better ones) so
    # that every row points the same way
    rows_to_negate = for_corr.index.str.contains("AI|IAA|II|MME|Gini")
    for_corr.loc[rows_to_negate] = for_corr.loc[rows_to_negate].apply(lambda x: -x)


    #avg = higher score is better, so no need to invert
    this_data = combined_df_full.query("dataset==@dataset")
    avg = get_avg(this_data)
    avg["rel_fair"] = avg["rel"] + "-" + avg["fair"]
    avg = avg.drop(columns=["rel", "fair"])
    avg = avg.set_index("rel_fair")

    avg.index = avg.index.str.replace("@10","") + "-avg"
    for_corr_avg = avg.pivot(columns="source", values="score")

    #both num_unique_rec_items and num_rel_items are higher-is-better
    model_scores_for_data = df_result.query("dataset==@dataset")
    for_corr_fixed_quant = model_scores_for_data.drop(columns=["dataset"])
    for_corr_fixed_quant = for_corr_fixed_quant.T
    for_corr_fixed_quant.columns = for_corr_fixed_quant.loc["source"]
    for_corr_fixed_quant = for_corr_fixed_quant.loc["num_unique_rec_items":]
    
    for_corr_all_except_DPFR = pd.concat([for_corr, for_corr_avg, for_corr_fixed_quant], axis=0)

    #repeat this across different alpha
    # Collect the per-alpha rows and concatenate once below, instead of copying
    # the growing frame on each of the 101 iterations.
    distance_rows = []
    for step in range(101):
        alpha = step / 100
        model_distance_dict = pd.read_pickle(f"distance_dict/model_distance_dict_full-alpha-{alpha}.pickle")
        to_append, _ = distance_based_rank_for_corr(model_distance_dict, dataset)
        
        to_append.index = to_append.index + f"-{alpha}"

        distance_rows.append(to_append)

    for_corr_all = pd.concat([for_corr_all_except_DPFR] + distance_rows, axis=0)

    for_corr_all.index = for_corr_all.index.str.replace("@10","")
    for_corr_all.index = for_corr_all.index.str.replace("_our","")
    the_corr = for_corr_all.T.corr("kendall").round(2)

    # save the_corr
    the_corr.to_csv(f"{experiment_name}/corr_simple_measure_{dataset}.csv")


Lastfm
Amazon-lb
QK-video
Jester
ML-10M
ML-20M
