In [None]:
import jsonlines
import os
import pandas as pd
import numpy as np

from statistics import mean
import warnings

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import gca
from matplotlib import figure
import seaborn as sb
#pip install numba==0.57.0
import numba as nb
from numba import njit
from numba.core import types
from numba.typed import Dict

In [None]:
# DCG discount denominators: log2(position + 2) for positions 0..rec_list-1.
rec_list = 10
base_logs = np.log2(np.arange(2, rec_list + 2))

@njit
def calculate_ndcg(histories: types.DictType(types.unicode_type, types.DictType(types.unicode_type, types.float64[:])),
                  recommender: types.DictType(types.unicode_type, types.UniTuple(types.float64[:], 2)),
                  base_logs: types.float64[:]) -> types.float64[:]:
    """Compute the mean binary-relevance NDCG of each history.

    Parameters
    ----------
    histories : typed Dict
        Maps a history name to a typed Dict of ``user -> array of item ids``
        (the list that was delivered to that user).
    recommender : typed Dict
        Maps a user to a ``(scores, items)`` pair of parallel arrays. Keys must
        be of the same type as the user keys in ``histories`` because they are
        looked up directly with them.
    base_logs : float64 array
        DCG discount denominators, one per list position. Must be at least as
        long as the longest delivered list.

    Returns
    -------
    list of float
        One average NDCG per history, in the iteration order of ``histories``.
        A history without users yields NaN.

    Raises
    ------
    KeyError
        If a user in a history has no entry in ``recommender``.
    """
    avg_of_ndcg = []
    for history in histories:
        user_count = 0
        ndcg_total = 0.0
        for user, items in histories[history].items():
            rec_scores, rec_items = recommender[user]
            n_items = len(items)
            # Items missing from the recommender output keep a score of 0.
            scores = np.zeros(n_items, dtype=np.float64)
            for pos, item in enumerate(items):
                hits = (rec_items == item).nonzero()[0]
                if hits.size != 0:
                    # NumPy arrays have no .append(); write by position instead.
                    scores[pos] = rec_scores[hits[0]]
            # Best achievable list: the user's top recommender scores.
            ideal_scores = np.sort(rec_scores)[::-1][:n_items]
            # Binarize relevance: any positive score counts as a hit.
            scores[scores > 0] = 1.0
            ideal_scores[ideal_scores > 0] = 1.0
            # Slice the discounts so lists shorter than base_logs still divide
            # element-wise (the ideal list can be shorter than the delivered one).
            recdcg = np.sum(scores / base_logs[:n_items])
            idealdcg = np.sum(ideal_scores / base_logs[:ideal_scores.size])
            if idealdcg == 0.0:
                ndcg = 0.0
            else:
                ndcg = recdcg / idealdcg
            user_count += 1
            ndcg_total += ndcg

        if user_count == 0:
            # No users in this history: the average is undefined.
            avg_of_ndcg.append(np.nan)
        else:
            avg_of_ndcg.append(ndcg_total / user_count)
    return avg_of_ndcg

In [None]:
### STATISTICS FUNCTIONS ###

# implicit type detection for pandas lookups
def typecast(series, var):
    """Cast ``var`` to the Python type that matches the inferred dtype of ``series``.

    The dtype is inferred with ``pd.api.types.infer_dtype``. Recognised kinds
    are "string" (str), "integer" (int), "floating" and "mixed-integer-float"
    (float). For any other inferred kind a warning is emitted and ``var`` is
    cast to ``int``.

    Parameters
    ----------
    series : pandas.Series
        Column whose values ``var`` will be compared against.
    var : object
        Lookup value to convert.

    Returns
    -------
    object
        ``var`` unchanged if it already has the target type, otherwise the
        converted value.
    """
    dtypes = {"string": str, "integer": int, "floating": float, "mixed-integer-float": float}
    dtype = pd.api.types.infer_dtype(series)
    target_type = dtypes.get(dtype)
    # The unknown-dtype check must come before the table lookup is used,
    # otherwise the fallback is unreachable behind a KeyError.
    if target_type is None:
        # f-string so an unnamed series (name is None) does not break the message.
        warnings.warn(f"Type of column {series.name} could not be implicitly determined. Defaulting to integer...")
        return int(var)
    if type(var) != target_type:
        var = target_type(var)
    return var

# given an item id return a list of its features as binary values
def get_item_features(item_features, item_id):
    """Select the "BV" entries of one item for the two tracked features.

    Rows of ``item_features`` are matched on the ``Item`` and ``Feature``
    columns; both lookup values go through ``typecast`` so they compare
    against the column with a matching type.

    Returns
    -------
    tuple
        ``(country, loan_size)``: the "BV" column of the rows matching the
        features "COUNTRY_low_pfr" and "loan_buck_5" respectively. Each
        element is a pandas selection, not a scalar.
    """
    def _binary_values(feature_name):
        # Build both masks per lookup, item first, then feature.
        item_match = item_features.Item == typecast(item_features.Item, item_id)
        feature_match = item_features.Feature == typecast(item_features.Feature, feature_name)
        return item_features.loc[item_match & feature_match]["BV"]

    country = _binary_values("COUNTRY_low_pfr")
    loan_size = _binary_values("loan_buck_5")
    return (country, loan_size)

In [None]:
### VISUALIZATION FUNCTIONS ###

def process_history(history, fair=True, compat=True, alloc=True, lists=True):
    """Split a list of history entries into per-time-step tables.

    Parameters
    ----------
    history : list of dict
        Entries with an ``'allocation'`` dict (keys ``'fairness scores'``,
        ``'compatibility scores'``, ``'output'``) and a ``'choice_out'`` dict
        holding ``'results'``. Only the keys needed for the enabled outputs
        are read.
    fair, compat, alloc, lists : bool
        Enable the corresponding output; a disabled output is ``None``.

    Returns
    -------
    tuple
        ``(fair_df, compat_df, alloc_df, results_list)``. ``alloc_df`` carries
        an extra boolean ``'none'`` column that is True for rows where every
        allocation value equals 0.
    """
    fair_df = compat_df = alloc_df = results_list = None
    if fair:
        fair_df = pd.DataFrame([entry['allocation']['fairness scores'] for entry in history])
    if compat:
        compat_df = pd.DataFrame([entry['allocation']['compatibility scores'] for entry in history])
    if alloc:
        alloc_df = pd.DataFrame([entry['allocation']['output'] for entry in history])
        # Derive 'none' from whatever agent columns are present rather than
        # from two hard-coded agent names, so histories with other agents
        # (e.g. "0"/"1") do not raise KeyError.
        alloc_df['none'] = (alloc_df == 0).all(axis=1)
    if lists:
        results_list = [process_results(entry['choice_out']['results']) for entry in history]
    return fair_df, compat_df, alloc_df, results_list

def process_results(result_structs):
    """Reduce each result record to an ``(item, score)`` tuple, keeping order."""
    pairs = []
    for record in result_structs:
        pairs.append((record['item'], record['score']))
    return pairs

def plot_fairness_time(experiment_data, include_none=False, image_prefix=None):
    """Plot the fairness scores over time.

    Parameters
    ----------
    experiment_data : sequence
        Output of ``process_history``; element 0 (the fairness frame) is plotted.
    include_none : bool
        Not used by this plot; accepted so all plot functions share a signature.
    image_prefix : str or None
        When given, the figure is saved to ``image_prefix + '-fairness.png'``.
        When None, the figure is only drawn.
    """
    fair_df = experiment_data[0]
    fig, ax = plt.subplots(figsize=(10, 6))
    sb.set(font_scale=2)
    ax.set_xlabel("Time")
    ax.set_ylabel("Fairness")
    sb.lineplot(data=fair_df, ax=ax)
    # The default prefix is None; concatenating it with a suffix would raise
    # TypeError, so saving is skipped in that case.
    if image_prefix is not None:
        fig.savefig(image_prefix + '-fairness.png')

def plot_allocation(experiment_data, include_none=False, image_prefix=None):
    """Plot the cumulative allocation over time.

    Parameters
    ----------
    experiment_data : sequence
        Output of ``process_history``; element 2 (the allocation frame,
        including its ``'none'`` column) is plotted as a cumulative sum.
    include_none : bool
        When False, the ``'none'`` column is dropped if it is never True after
        the first row. When True, the column is always kept.
    image_prefix : str or None
        When given, the figure is saved to ``image_prefix + '-allocation.png'``.
        When None, the figure is only drawn.
    """
    alloc_df = pd.DataFrame(experiment_data[2])
    if include_none is False:
        # The first row is excluded from the check.
        # NOTE(review): presumably the first step never has an allocation — confirm.
        if not alloc_df['none'].iloc[1:].any():
            alloc_df = alloc_df.drop(columns='none')
    fig, ax = plt.subplots(figsize=(10, 6))
    sb.set(font_scale=2)
    ax.set_xlabel("Time")
    ax.set_ylabel("Allocation")
    sb.lineplot(data=alloc_df.cumsum(), ax=ax)
    # The default prefix is None; concatenating it with a suffix would raise
    # TypeError, so saving is skipped in that case.
    if image_prefix is not None:
        fig.savefig(image_prefix + '-allocation.png')

def plot_fairness_regret(experiment_data, include_none=False, image_prefix=None):
    """Plot the cumulative fairness regret (running sum of ``1 - fairness``).

    Parameters
    ----------
    experiment_data : sequence
        Output of ``process_history``; element 0 (the fairness frame) is used.
    include_none : bool
        Not used by this plot; accepted so all plot functions share a signature.
    image_prefix : str or None
        When given, the figure is saved to ``image_prefix + '-regret.png'``.
        When None, the figure is only drawn.
    """
    fair_df = experiment_data[0]
    regret = 1 - fair_df
    fig, ax = plt.subplots(figsize=(10, 6))
    sb.set(font_scale=2)
    ax.set_xlabel("Time")
    ax.set_ylabel("Fairness Regret")
    sb.lineplot(data=regret.cumsum(), ax=ax)
    # The default prefix is None; concatenating it with a suffix would raise
    # TypeError, so saving is skipped in that case.
    if image_prefix is not None:
        fig.savefig(image_prefix + '-regret.png')

def do_plots(experiment_data, include_none=False, image_prefix=None):
    """Draw the fairness, allocation and fairness-regret plots, in that order."""
    plotters = (plot_fairness_time, plot_allocation, plot_fairness_regret)
    for plotter in plotters:
        plotter(experiment_data, include_none, image_prefix)

def process(experiment, include_none=False, image_prefix=None):
    """Run ``process_history`` on one experiment's history and plot the result."""
    do_plots(process_history(experiment), include_none, image_prefix)

def process_names(name):
    """Split an experiment name into ``(choice, allocation)`` parts.

    The name is searched for the known allocation labels in a fixed order
    ("Baseline", "Lottery", "Weighted Product", "Least Fair"). The first label
    found is removed from the name; the remainder, right-stripped, is returned
    as the choice part.

    Returns
    -------
    tuple
        ``(choice, allocation)``. If no known label occurs in ``name`` the
        result is ``(name.rstrip(), None)``, so callers can always unpack two
        values instead of receiving a bare ``None``.
    """
    for alloc in ["Baseline", "Lottery", "Weighted Product", "Least Fair"]:
        if alloc in name:
            return name.replace(alloc, "").rstrip(), alloc
    return name.rstrip(), None

## Read in Synthetic Data

In [None]:
base_path = "data/"
# Specify the base names for each experiment
base_names = [
    "history_file_baseline",
    "history_file_least",
    "history_file_ofair",
    "history_file_product",
    "history_file_weighted"

]

# NOTE(review): both CSVs are read relative to the working directory, not
# base_path — confirm that is where they live.
item_features = pd.read_csv('items_s1.csv')  # was pd.read_cvs, which does not exist
recommender = pd.read_csv('recs_s1.csv', names=["User","Item","Score"])

# One history file path (under base_path) and one mechanism label per base name.
history_files = [os.path.join(base_path, f"{base_name}.json") for base_name in base_names]
mechanisms = [f"{base_name}" for base_name in base_names]

In [None]:
# Per mechanism: "History" holds the raw JSON-lines records, "Statistics" maps
# each record's user to the list of item ids in its choice results.
processed_histories = {mechanism: {"History": []} for mechanism in mechanisms}

for history_file, mechanism in zip(history_files, mechanisms):
    with jsonlines.open(history_file) as reader:
        processed_histories[mechanism]["History"].extend(reader)

for mechanism in mechanisms:
    processed_histories[mechanism]["Statistics"] = {}

for mechanism in mechanisms:
    statistics = processed_histories[mechanism]["Statistics"]
    for line in processed_histories[mechanism]["History"]:
        # A later record for the same user replaces the earlier one.
        statistics[line['user']] = [result['item'] for result in line['choice_out']['results']]

In [None]:
# convert history + recommender info to typed dicts for numba
light_histories = Dict.empty(
    key_type=types.unicode_type,
    value_type=types.DictType(types.unicode_type, types.float64[:]),
)
for history in processed_histories:
    consolidated_statistics = Dict.empty(
        key_type=types.unicode_type,
        value_type=types.float64[:],
    )
    for user, items in processed_histories[history]["Statistics"].items():
        # Keys are declared as unicode, so coerce the user id to str.
        consolidated_statistics[str(user)] = np.asarray(items, dtype='f8')
    light_histories[history] = consolidated_statistics

light_recommender = Dict.empty(
    key_type=types.unicode_type,
    value_type=types.UniTuple(types.float64[:], 2),
)
# Group once instead of re-filtering the whole frame twice per user.
for user, user_recs in recommender.groupby("User", sort=False):
    scores = user_recs["Score"].to_numpy(dtype='f8')
    items = user_recs["Item"].to_numpy(dtype='f8')
    # str() makes CSV-parsed ids usable as unicode keys and comparable with the
    # history user keys.
    # NOTE(review): assumes both sources render ids identically (e.g. "12", not "12.0") — confirm.
    light_recommender[str(user)] = (scores, items)
avg_of_ndcg = calculate_ndcg(light_histories, light_recommender, base_logs)
# creates dataframe of average NDCG values
ndcg_table = pd.DataFrame(data=avg_of_ndcg, index=mechanisms, columns=["NDCG"])
ndcg_table


In [None]:
# calculates adj proportional fairness for the representation of item features
num_features = 10
feature_names = ["0","1","2","3","4","5","6","7","8","9"]
protected_features = ["0", "1"]
fairness_targets = [0., 0.]

proportional_fairness = []
adj_fairness = []
for history in processed_histories:
    item_counter = 0
    features = [0.0] * num_features
    for items in processed_histories[history]["Statistics"].values():
        for item in items:
            item_counter += 1
            for idx, val in enumerate(get_item_features(item_features, item)):
                # get_item_features returns pandas selections; reduce each to a
                # scalar before accumulating, because adding Series with
                # different row labels aligns on the index and produces NaN.
                features[idx] += float(np.sum(val))
    # A history without delivered items has no defined proportion.
    proportional = [x / item_counter if item_counter else np.nan for x in features]
    proportional_fairness.append(proportional)
    target_idx = 0
    calc_adj_fairness = []
    for idx, feature_name in enumerate(feature_names):
        if feature_name in protected_features:
            fair_target = fairness_targets[target_idx]
            # A target of 0 makes the adjusted value undefined; record NaN
            # instead of dividing by zero.
            calc_adj_fairness.append(proportional[idx] / fair_target if fair_target else np.nan)
            target_idx += 1
    adj_fairness.append(calc_adj_fairness)
prop_fairness_results = pd.DataFrame(data=adj_fairness, columns=protected_features, index=mechanisms)
ndcg_fairness = prop_fairness_results.merge(ndcg_table, left_index = True, right_index = True)

In [None]:
# get fairness data to create boxplots
# create df w/ allocation/choice mechs and fairness scores over time
experiment_data = []
for history in processed_histories:
    parsed_name = process_names(history)
    # process_names may yield nothing for names without a known allocation
    # label; fall back to the raw name instead of failing on the unpack.
    choice, alloc = parsed_name if parsed_name is not None else (history, None)
    if alloc is None:
        warnings.warn(f"No allocation mechanism recognised in history name {history!r}")
    # Only the fairness frame is needed here, so skip building the other outputs.
    fair_df = process_history(
        processed_histories[history]["History"],
        compat=False,
        alloc=False,
        lists=False,
    )[0]
    experiment = pd.DataFrame(fair_df)
    experiment["Choice Mech"] = choice
    experiment["Allocation Mech"] = alloc
    experiment_data.append(experiment)
experiments = pd.concat(experiment_data)
# Each experiment keeps its own 0..n-1 index, which serves as the time step.
experiments["Time"] = experiments.index
experiments = pd.melt(experiments, id_vars=['Allocation Mech',"Choice Mech","Time"], value_vars=["0","1"],var_name='Agent', value_name="Fairness")

In [None]:
# store baseline means, then remove baseline
# Combine both conditions into one mask on the full frame; applying a second
# full-length mask to an already filtered frame makes pandas reindex it and warn.
is_baseline = experiments["Allocation Mech"] == "Baseline"
baseline1_mean = experiments.loc[is_baseline & (experiments["Agent"] == "0"), "Fairness"].mean()
baseline2_mean = experiments.loc[is_baseline & (experiments["Agent"] == "1"), "Fairness"].mean()
boxplots = experiments[experiments["Allocation Mech"] != "Baseline"]

In [None]:
sb.set(font_scale=1.5)

# Box plots of fairness per agent, faceted by allocation (rows) and choice (columns).
catplot_options = dict(
    row="Allocation Mech",
    col="Choice Mech",
    x="Fairness",
    y="Agent",
    order=["0","1"],
    kind="box",
    height=2,
    aspect=2,
    margin_titles=True,
)
g = sb.catplot(data=boxplots, **catplot_options)

# Mark both baseline means on every facet.
baseline_lines = ((baseline1_mean, 'tab:blue'), (baseline2_mean, 'tab:orange'))
for ax in g.axes.flat:
    for mean_value, line_color in baseline_lines:
        ax.axvline(x=mean_value, color=line_color, ls="dashed", lw=3)

g.set_titles(row_template='{row_name}', col_template='{col_name}')
g.tight_layout()

In [None]:
def make_scatterplot(fairness_df, base_ndcg, filename=None):
    """Scatter NDCG against average fairness score, with a baseline NDCG line.

    Parameters
    ----------
    fairness_df : pandas.DataFrame
        Must provide the columns 'Average Fairness Score', 'NDCG',
        'Allocation' (marker style) and 'Choice' (colour).
    base_ndcg : float
        Height of the dashed horizontal reference line.
    filename : str or None
        When given, the figure is saved there before it is shown.
    """
    sb.set_style("white")
    allocation_markers = {"Lottery": "^", "Weighted": "X", "Least Fair": "o"}
    ax = sb.scatterplot(
        data=fairness_df,
        x='Average Fairness Score',
        y='NDCG',
        hue="Choice",
        style="Allocation",
        markers=allocation_markers,
        s=100,
    )

    sb.set(font_scale = 1.25)
    ax.axhline(y=base_ndcg, linestyle="dashed")
    ax.set_xlabel("Average Fairness Score", fontsize=12, labelpad=10)
    ax.set_ylabel("nDCG", fontsize=12, labelpad=11)
    # Legend position chosen for the plots in the paper; may need to change with other data.
    ax.legend(loc="lower left", fontsize=12)
    plt.tight_layout()
    if filename is not None:
        plt.savefig(filename)
    plt.show()


In [None]:
# NOTE(review): `baseline_accuracy` is not assigned in any cell visible in this
# notebook — confirm where it comes from before relying on Restart & Run All.
# NOTE(review): make_scatterplot reads 'Average Fairness Score', 'Allocation'
# and 'Choice'; ndcg_fairness appears to hold only the protected-feature
# columns plus 'NDCG' — verify the frame passed here.
make_scatterplot(
    fairness_df=ndcg_fairness,
    base_ndcg=baseline_accuracy,
    filename='synthetic_scatter.png',
)