# All Data

In [2]:
import glob
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from collections import OrderedDict
from datetime import datetime

# Figure resolution: on-screen rendering vs. files written with savefig.
plt.rcParams['figure.dpi'] = 200
plt.rcParams['savefig.dpi'] = 600


# Directory that holds the pickled results; figures are saved here as well.
path = "most_unfair_fair"
# Cut-offs k, used as the x ticks of the plots.
list_k = [1,3,5,10]
# Order in which the datasets are arranged in the prepared frames.
list_dataset = ["Lastfm", "Amazon-lb", "QK-video", "ML-10M"]

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# load order (and therefore any later overwrite of a duplicate dataset entry)
# reproducible across machines and re-runs.
list_strat1 = sorted(glob.glob(f"{path}/*strategy1*.pickle"))
list_strat2 = sorted(glob.glob(f"{path}/*strategy2*.pickle"))
list_strat3 = sorted(glob.glob(f"{path}/*strategy3*.pickle"))
list_strat4 = sorted(glob.glob(f"{path}/*strategy4*.pickle"))


In [3]:
def read_file(file):
    """Load one pickled result file and derive its dataset name.

    The dataset name is the part of the file's base name before the first
    underscore. The directory part is stripped regardless of whether the
    path uses "/" or "\\" separators, so the result is the same on POSIX
    and Windows.

    Parameters
    ----------
    file : str
        Path to a pickle file, as returned by ``glob.glob``.

    Returns
    -------
    tuple
        ``(dataset, result_dict)`` where ``result_dict`` is whatever object
        the pickle contains.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    result_dict = pd.read_pickle(file)

    # Normalise separators first so the directory is dropped on any platform
    # (previously only backslashes were removed, leaving a leading "/" in the
    # dataset name on POSIX).
    file_name = file.replace("\\", "/").rsplit("/", 1)[-1]
    dataset = file_name.split("_")[0]

    return dataset, result_dict

In [4]:
# Per-dataset result dictionaries: {dataset: {result key: value}}.
most_unfair_results = OrderedDict()
most_fair_results = OrderedDict()

# Strategy 1: the whole result dict goes to the most-fair results.
for file in list_strat1:
    dataset, result_dict = read_file(file)
    most_fair_results[dataset] = result_dict

# Strategy 2: keys containing "IFD" go to the most-fair results, every other
# key goes to the most-unfair results.
for file in list_strat2:
    dataset, result_dict = read_file(file)
    for key, val in result_dict.items():
        target = most_fair_results if "IFD" in key else most_unfair_results
        # setdefault creates the per-dataset entry on first use, so a dataset
        # without an earlier entry no longer raises a bare KeyError.
        target.setdefault(dataset, OrderedDict())[key] = val

# Strategies 3 and 4: everything goes to the most-unfair results
# (all strategy-3 files are merged before any strategy-4 file).
for file in list_strat3 + list_strat4:
    dataset, result_dict = read_file(file)
    most_unfair_results.setdefault(dataset, OrderedDict()).update(result_dict)



In [5]:
def prepare_df(most_results, dataset_order=None):
    """Turn nested result dictionaries into a tidy long-format DataFrame.

    Parameters
    ----------
    most_results : mapping
        ``{dataset: {"<measure>@<k>": value}}``; the part after "@" must be
        convertible to ``int``.
    dataset_order : list of str, optional
        Dataset labels passed to ``reindex`` on the dataset level, which fixes
        the order of the datasets in the result. Defaults to the
        module-level ``list_dataset``.

    Returns
    -------
    pandas.DataFrame
        Columns ``dataset``, ``measure``, ``version``, ``cut-off $k$`` and
        ``unfairness``, stably sorted by ``measure``. Rows whose version
        contains "att" or "cut" are removed.
    """
    if dataset_order is None:
        dataset_order = list_dataset

    df = pd.DataFrame(most_results).reset_index()
    df[["measure", "k"]] = df["index"].str.split("@", expand=True)
    df = df.drop(columns="index")

    df = df.melt(["measure", "k"])
    df["k"] = df["k"].astype(int)

    # Normalise the measure names:
    #   * drop "_true";
    #   * names ending in "_div"/"_mul" carry no version suffix, so tag them
    #     "_ori" to let the rsplit below work uniformly;
    #   * swap "_div"/"_mul" for their math-text labels. The backslashes are
    #     escaped explicitly ("\d" is an invalid escape sequence).
    df["measure"] = (
        df["measure"]
        .str.replace("_true", "", regex=False)
        .str.replace("_div$", "_div_ori", regex=True)
        .str.replace("_mul$", "_mul_ori", regex=True)
        .str.replace("_div", "$_\\div$", regex=False)
        .str.replace("_mul", "$_\\times$", regex=False)
    )

    # The text after the last underscore is the version (e.g. "ori" / "our").
    df[["measure", "version"]] = df["measure"].str.rsplit("_", n=1, expand=True)

    df = df.rename(
        columns={"value": "unfairness", "variable": "dataset", "k": "cut-off $k$"}
    )

    # Arrange the datasets in the requested order, then group rows by measure
    # while keeping that order within each measure (stable sort).
    df = df.set_index(["dataset", "measure", "version", "cut-off $k$"])
    df = df.reindex(dataset_order, level=0)
    df = df.reset_index()
    df = df.sort_values("measure", kind="stable")

    # na=False keeps rows without a version instead of failing on `~` when
    # str.contains yields missing values.
    df = df[~df["version"].str.contains("att|cut", na=False)]

    return df

# Build the long-format frames that the inspection cells and plots below use.
df_most_fair = prepare_df(most_fair_results)
df_most_unfair = prepare_df(most_unfair_results)


In [6]:
# Inspect the prepared MostFair frame.
df_most_fair

Unnamed: 0,dataset,measure,version,cut-off $k$,unfairness
0,Lastfm,IAA,ori,1,
1,Lastfm,IAA,ori,3,0.002220
2,Lastfm,IAA,ori,5,0.001894
3,Lastfm,IAA,ori,10,0.001386
5,Lastfm,IAA,our,1,0.000000
...,...,...,...,...,...
135,ML-10M,II-F,ori,10,0.000205
136,ML-10M,II-F,our,1,0.000000
137,ML-10M,II-F,our,3,0.000000
138,ML-10M,II-F,our,5,0.000000


In [7]:
# Original ("ori") measures under MostFair: the values are close to 0 but mostly not exactly 0.
# The displayed output also contains a NaN and one exact 0; the next cell isolates the rows that are exactly 0.
df_most_fair.query("version=='ori'").unfairness.unique()

array([          nan, 2.2195520e-03, 1.8939701e-03, 1.3855693e-03,
       3.3977788e-03, 3.1620027e-03, 4.1809667e-03, 2.1067400e-04,
       2.4529890e-04, 5.1437030e-04, 7.9882624e-03, 7.8994434e-03,
       7.6995198e-03, 4.9266000e-06, 1.1512000e-05, 4.3920000e-07,
       1.1266100e-05, 0.0000000e+00, 6.9042640e-04, 5.1591890e-04,
       3.6451990e-04, 2.5181840e-04, 1.5188063e-03, 1.1347847e-03,
       1.1346535e-03, 1.2519355e-03, 1.2648480e-04, 1.1268640e-04,
       1.4964950e-04, 1.8441170e-04, 1.3677480e-04, 1.8911650e-04,
       2.0437320e-04, 2.0502240e-04])

In [8]:
# Which original ("ori") measures are exactly 0 under MostFair?
df_most_fair.query("version=='ori' & unfairness==0")

Unnamed: 0,dataset,measure,version,cut-off $k$,unfairness
21,Lastfm,IFD$_\times$,ori,1,0.0
23,Lastfm,IFD$_\times$,ori,3,0.0
25,Lastfm,IFD$_\times$,ori,5,0.0
27,Lastfm,IFD$_\times$,ori,10,0.0
61,Amazon-lb,IFD$_\times$,ori,1,0.0
63,Amazon-lb,IFD$_\times$,ori,3,0.0
65,Amazon-lb,IFD$_\times$,ori,5,0.0
67,Amazon-lb,IFD$_\times$,ori,10,0.0
101,QK-video,IFD$_\times$,ori,1,0.0
103,QK-video,IFD$_\times$,ori,3,0.0


In [9]:
# Original ("ori") IFD-div values under MostFair.
# A boolean mask replaces the query string, whose "\d" was an invalid escape sequence.
df_most_fair[(df_most_fair["version"] == "ori") & (df_most_fair["measure"] == "IFD$_\\div$")]

Unnamed: 0,dataset,measure,version,cut-off $k$,unfairness
20,Lastfm,IFD$_\div$,ori,1,4.9266e-06
22,Lastfm,IFD$_\div$,ori,3,4.9266e-06
24,Lastfm,IFD$_\div$,ori,5,4.9266e-06
26,Lastfm,IFD$_\div$,ori,10,4.9266e-06
60,Amazon-lb,IFD$_\div$,ori,1,1.1512e-05
62,Amazon-lb,IFD$_\div$,ori,3,1.1512e-05
64,Amazon-lb,IFD$_\div$,ori,5,1.1512e-05
66,Amazon-lb,IFD$_\div$,ori,10,1.1512e-05
100,QK-video,IFD$_\div$,ori,1,4.392e-07
102,QK-video,IFD$_\div$,ori,3,4.392e-07


In [10]:
# Original ("ori") measures under MostUnfair: none reaches 1, or gets anywhere close to it.
df_most_unfair.query("version=='ori'").unfairness.unique()

array([       nan, 0.0032782 , 0.00363243, 0.00451802, 0.00688958,
       0.0081538 , 0.01131436, 0.00061926, 0.00077495, 0.00116418,
       0.00828965, 0.00839147, 0.00864603, 0.16885736, 0.14616515,
       0.13192899, 0.0924191 , 0.00070847, 0.00156338, 0.0011574 ,
       0.00136783, 0.00252844, 0.00417776, 0.00365671, 0.00398787,
       0.00031138, 0.00043957, 0.00041706, 0.00043402, 0.00020365,
       0.00047931, 0.00032898, 0.00039084, 0.0010847 , 0.00145651,
       0.0016088 , 0.00170311, 0.00349154, 0.00481846, 0.00536197,
       0.00569856, 0.00039891, 0.00056232, 0.00062926, 0.00067071,
       0.00018489, 0.00029176, 0.00033553, 0.00036264])

In [11]:
# Our ("our") measures under MostFair: every value is exactly 0.
df_most_fair.query("version=='our'").unfairness.unique()

array([0.])

In [12]:
# Our ("our") measures under MostUnfair: the distinct values.
df_most_unfair.query("version=='our'").unfairness.unique()

array([1.        , 0.98856209, 0.76201373, 0.6243597 , 0.95994747])

In [13]:
# Our ("our") measures under MostUnfair: only IFD-div does not reach 1; list the rows that differ from 1.
df_most_unfair.query("version=='our' & unfairness!=1")

Unnamed: 0,dataset,measure,version,cut-off $k$,unfairness
22,Lastfm,IFD$_\div$,our,1,0.988562
25,Lastfm,IFD$_\div$,our,3,0.988562
28,Lastfm,IFD$_\div$,our,5,0.988562
31,Lastfm,IFD$_\div$,our,10,0.988562
62,Amazon-lb,IFD$_\div$,our,1,0.762014
65,Amazon-lb,IFD$_\div$,our,3,0.762014
68,Amazon-lb,IFD$_\div$,our,5,0.762014
71,Amazon-lb,IFD$_\div$,our,10,0.762014
102,QK-video,IFD$_\div$,our,1,0.62436
105,QK-video,IFD$_\div$,our,3,0.62436


In [14]:
# Original ("ori") IFD-div values under MostUnfair.
# A boolean mask replaces the query string, whose "\d" was an invalid escape sequence.
df_most_unfair[(df_most_unfair["version"] == "ori") & (df_most_unfair["measure"] == "IFD$_\\div$")]

Unnamed: 0,dataset,measure,version,cut-off $k$,unfairness
20,Lastfm,IFD$_\div$,ori,1,0.168857
23,Lastfm,IFD$_\div$,ori,3,0.168857
26,Lastfm,IFD$_\div$,ori,5,0.168857
29,Lastfm,IFD$_\div$,ori,10,0.168857
60,Amazon-lb,IFD$_\div$,ori,1,0.146165
63,Amazon-lb,IFD$_\div$,ori,3,0.146165
66,Amazon-lb,IFD$_\div$,ori,5,0.146165
69,Amazon-lb,IFD$_\div$,ori,10,0.146165
100,QK-video,IFD$_\div$,ori,1,0.131929
103,QK-video,IFD$_\div$,ori,3,0.131929


In [None]:
import matplotlib.ticker as ticker


# Third and fourth colours of the colour-blind palette, in reversed order;
# one colour per "version" line in the plots.
colors = sns.color_palette("colorblind")[2:4][::-1]

def mass_plot(df, exp_type, sharey=True):
    """Draw a measure-by-dataset grid of line plots and save it as a PDF.

    One facet row per ``measure`` and one facet column per ``dataset``; each
    facet plots ``unfairness`` against ``cut-off $k$`` with one line per
    ``version``. The figure is written to
    ``{path}/{exp_type}_<timestamp>.pdf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``measure``, ``dataset``, ``version``,
        ``cut-off $k$`` and ``unfairness``.
    exp_type : str
        ``"unfairest"`` or ``"fairest"``; selects facet height, marker order
        and the extra y-limit below, and is used in the output file name.
        Any other value takes the non-"unfairest" height/markers.
    sharey : bool, default True
        Forwarded to the facet grid's ``sharey`` option.

    Relies on the module-level names ``colors``, ``list_k``, ``path`` and
    ``ticker``.
    """
    # Facet height and marker order depend on the experiment type.
    # NOTE(review): 6 and 7 look like matplotlib's integer caret-up /
    # caret-down markers — confirm against matplotlib.markers.
    height = 1.15
    if exp_type=="unfairest":
        height -= .05
        markers = [7,6]
    else:
        markers = [6,7]

    grid = sns.relplot(
        data=df, row ="measure", x="cut-off $k$", y="unfairness",
        col="dataset", hue="version", style="version", legend="brief",
        kind="line", palette=colors, markers=markers, mew=0.4, mfc=None,
        facet_kws={'sharey': sharey, 'sharex': True,  "margin_titles":True}, 
        height=height, aspect=1.05
        )

    # First pass: create margin titles with a down-arrow prefix; their text is
    # copied to the y-axis labels further down.
    grid.set_titles(row_template = '$\\downarrow${row_name}', col_template = '{col_name}',size=6.6)

    sns.move_legend(grid, loc="upper center", ncols=2, bbox_to_anchor=(0.525, 1.025), frameon=True,fontsize=7, title=None, markerscale=0.8)
    # Per-facet formatting: scientific notation on y, at most 5 y bins,
    # small fonts, and all four spines visible.
    for ax in grid.axes.flatten():
        ax.ticklabel_format(axis='y', style='sci', scilimits=(0,0), useMathText=True) 
        ax.yaxis.get_offset_text().set_fontsize(4.8)
        ax.yaxis.set_major_locator(ticker.MaxNLocator(nbins=5))
        ax.xaxis.label.set_size(fontsize=6)
        ax.yaxis.label.set_size(fontsize=6)
        ax.tick_params(axis='both', which='major', labelsize=6, rotation=0)
        for _, spine in ax.spines.items():
            spine.set_visible(True) 
    grid.set(xticks=list_k)

    # Use each row's margin-title text as the y-label of the row's first facet.
    # NOTE(review): _margin_titles_texts is a private seaborn attribute and may
    # change between seaborn versions.
    for ax, margin in zip(grid.axes, grid._margin_titles_texts):
        ax[0].set_ylabel(margin.get_text())
        ax[0].yaxis.label.set_size(fontsize=7)

    # Fixed y-range for the third facet row of the "fairest" figure.
    # NOTE(review): the row index is hard-coded; presumably this is the measure
    # whose values are all zero — verify against the row order of `df`.
    if exp_type == "fairest":
        for ax in grid.axes[2]:
            ax.set_ylim(-0.1, 1)

    # Second pass: blank the row (margin) titles now that their text has been
    # moved to the y-labels; column titles stay.
    grid.set_titles(col_template = '{col_name}', row_template = "",size=6.6)
    grid.fig.tight_layout(w_pad=0.05, h_pad=0.2)
    
    # Timestamped file name so repeated runs do not overwrite earlier figures.
    now = datetime.now()
    time = str(now.strftime("%Y-%m-%d_%H%M%S"))
    grid.savefig(f"{path}/{exp_type}_{time}.pdf",bbox_inches="tight")   

# MostFair figure: y-axes are not shared between facets.
mass_plot(df_most_fair, "fairest", sharey=False)

# MostUnfair figure: all facets share one y-axis.
mass_plot(df_most_unfair, "unfairest", sharey=True)