In [None]:
# This notebook is meant to be executed on the "neurips24-plots" branch of the Tournesol repository.

# Outside of Tournesol, solidago can be installed with the following command:
# %pip install "git+https://github.com/tournesol-app/tournesol.git@neurips24-plots#egg=solidago&subdirectory=solidago"

In [1]:
import logging
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from solidago.pipeline import Pipeline
from solidago.pipeline.inputs import TournesolInputFromPublicDataset
from solidago.pipeline.outputs import PipelineOutputInMemory

# Configure the root logger so that messages at INFO level and above are shown.
logging.basicConfig(level=logging.INFO)

In [2]:
# Public dataset export dated 2024-05-20; it is loaded from the URL below.
DATASET_URL = "https://kdrive.infomaniak.com/2/app/898769/share/fd11392f-ee1d-4fd5-aec2-eac8c27a5740/files/55/download"
dataset = TournesolInputFromPublicDataset(DATASET_URL)

In [3]:
def run_pipeline(pipeline: Pipeline, criterion: str):
    """Run `pipeline` on the module-level `dataset` and annotate the resulting scores.

    Args:
        pipeline: the pipeline to execute.
        criterion: criterion name, passed both to the pipeline run and to the
            comparisons query used for counting.

    Returns:
        A pair `(indiv_scores, global_scores)`:

        * `indiv_scores` is `output.individual_scores` with two extra columns:
          "comparisons" (how many times the user's comparisons on this criterion
          involve the entity) and "n_comparisons" (that count bucketed into the
          intervals (0, 1], (1, 2], (2, 4], (4, 8], (8, inf)).
        * `global_scores` is `output.entity_scores` with two extra columns:
          "contributors" (number of distinct users having an individual score
          for the entity, 0 when there is none) and "n_contributors" (the same
          number as a string, with everything above 3 collapsed into "4+").
    """
    output = PipelineOutputInMemory()
    pipeline.run(input=dataset, criterion=criterion, output=output)

    # Count, for every (user, entity) pair, how many comparisons involve the entity.
    pair_counts = Counter()
    for c in dataset.get_comparisons(criteria=criterion).itertuples():
        pair_counts.update([(c.user_id, c.entity_a), (c.user_id, c.entity_b)])

    indiv_scores = output.individual_scores
    # A plain lookup per row avoids the slow row-wise DataFrame.apply, which also
    # misbehaves on an empty frame. Counter returns 0 for pairs never compared.
    indiv_scores["comparisons"] = [
        pair_counts[(user_id, entity_id)]
        for user_id, entity_id in zip(indiv_scores["user_id"], indiv_scores["entity_id"])
    ]
    indiv_scores["n_comparisons"] = pd.cut(indiv_scores["comparisons"], bins=[0, 1, 2, 4, 8, np.inf])

    global_scores = output.entity_scores
    contributors_per_entity = indiv_scores.groupby("entity_id")["user_id"].nunique()
    # Entities absent from the individual scores map to NaN: without the fill they
    # would be labelled "4+" (NaN <= 3 is False) and would turn the whole column
    # into floats, producing labels such as "1.0".
    global_scores["contributors"] = (
        global_scores["entity_id"].map(contributors_per_entity).fillna(0).astype(int)
    )
    global_scores["n_contributors"] = global_scores["contributors"].map(
        lambda n: str(n) if n <= 3 else "4+"
    )

    return indiv_scores, global_scores

def plot_individual_scores(indiv_scores, subtitle="", dir=None):
    """Plot a stacked histogram of individual scores, coloured by "n_comparisons".

    Args:
        indiv_scores: table providing a "score" column and an "n_comparisons" column.
        subtitle: text appended on a second line of the figure title.
        dir: when not None, the figure is also written to "indiv_scores.png"
            inside this directory. The path is built with the `/` operator, so
            a `pathlib.Path`-like object is expected.
    """
    fig, ax = plt.subplots(figsize=(15, 4))
    # Hue levels are listed in descending order.
    bucket_order = sorted(indiv_scores["n_comparisons"].unique(), reverse=True)
    sns.histplot(
        data=indiv_scores,
        x="score",
        hue="n_comparisons",
        hue_order=bucket_order,
        palette="coolwarm_r",
        multiple="stack",
        linewidth=.5,
        binwidth=2,
        binrange=[-100, 100],
        ax=ax,
    )
    ax.set_title("Displayed user scores $\\theta_{ue}^{\\bf display}$\n" + subtitle)
    if dir is None:
        return
    fig.tight_layout()
    fig.savefig(dir / "indiv_scores.png", dpi=150)


def plot_global_scores(global_scores, subtitle="", dir=None):
    """Plot a stacked histogram of global scores, coloured by "n_contributors".

    Args:
        global_scores: table providing a "score" column and an "n_contributors" column.
        subtitle: text appended on a second line of the figure title.
        dir: when not None, the figure is also written to "global_scores.png"
            inside this directory. The path is built with the `/` operator, so
            a `pathlib.Path`-like object is expected.
    """
    fig, ax = plt.subplots(figsize=(15, 4))
    # Hue levels are listed in ascending order.
    contributor_order = sorted(global_scores["n_contributors"].unique())
    sns.histplot(
        data=global_scores,
        x="score",
        hue="n_contributors",
        hue_order=contributor_order,
        palette="coolwarm",
        multiple="stack",
        binwidth=2,
        binrange=[-30, 70],
        linewidth=.5,
        ax=ax,
    )
    ax.set_title("Displayed global scores $\\rho_e^{\\bf display}$\n" + subtitle)
    if dir is None:
        return
    fig.tight_layout()
    fig.savefig(dir / "global_scores.png", dpi=150)

In [None]:
import json
from math import sqrt
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

from solidago.aggregation import EntitywiseQrQuantile
from solidago.preference_learning import UniformGBT
from solidago.scaling import ScalingCompose, Mehestan, QuantileZeroShift, Standardize


# Each entry pairs a pipeline configuration with the subtitle shown on its plots.
# Apart from the first entry (default pipeline), every run overrides a single
# pipeline component: aggregation, preference learning or scaling.
runs = [
    {"pipeline": Pipeline(), "subtitle": "default pipeline"},
    # Aggregation: quantile
    {
        "pipeline": Pipeline(aggregation=EntitywiseQrQuantile(quantile=0.3)),
        "subtitle": "$ \\alpha = 0.3 $",
    },
    {
        "pipeline": Pipeline(aggregation=EntitywiseQrQuantile(quantile=0.4)),
        "subtitle": "$ \\alpha = 0.4 $",
    },
    {
        "pipeline": Pipeline(aggregation=EntitywiseQrQuantile(quantile=0.5)),
        "subtitle": "$ \\alpha = 0.5 $",
    },
    # Preference learning: prior standard deviation
    {
        "pipeline": Pipeline(preference_learning=UniformGBT(prior_std_dev=1/sqrt(0.002))),
        "subtitle": "$ \\alpha_{user}^{prior} = 0.002 $",
    },
    {
        "pipeline": Pipeline(preference_learning=UniformGBT(prior_std_dev=1/sqrt(0.2))),
        "subtitle": "$ \\alpha_{user}^{prior} = 0.2 $",
    },
    # Scaling: zero-shift quantile
    {
        "pipeline": Pipeline(
            scaling=ScalingCompose(Mehestan(), QuantileZeroShift(zero_quantile=0.1), Standardize())
        ),
        "subtitle": "$ q^{zero}_{shift} = 0.1 $",
    },
    {
        "pipeline": Pipeline(
            scaling=ScalingCompose(Mehestan(), QuantileZeroShift(zero_quantile=0.2), Standardize())
        ),
        "subtitle": "$ q^{zero}_{shift} = 0.2 $",
    },
    # Scaling: standardization deviation quantile
    {
        "pipeline": Pipeline(
            scaling=ScalingCompose(Mehestan(), QuantileZeroShift(), Standardize(dev_quantile=0.5))
        ),
        "subtitle": "$ q^{sss}_{dev} = 0.5 $",
    },
    {
        "pipeline": Pipeline(
            scaling=ScalingCompose(Mehestan(), QuantileZeroShift(), Standardize(dev_quantile=0.9))
        ),
        "subtitle": "$ q^{sss}_{dev} = 0.9 $",
    },
    # Aggregation: Lipschitz constant
    {
        "pipeline": Pipeline(aggregation=EntitywiseQrQuantile(lipschitz=0.05)),
        "subtitle": "$ L = 0.05 $",
    },
    {
        "pipeline": Pipeline(aggregation=EntitywiseQrQuantile(lipschitz=0.2)),
        "subtitle": "$ L = 0.2 $",
    },
]

# Execute every configured run in a pool of worker processes, then store each
# run's pipeline configuration, score tables and histograms in its own
# "run_XX" directory. The `with` block shuts the pool down once all results
# have been collected (the original never released the worker processes).
with ProcessPoolExecutor(max_workers=8) as pool:
    # Submit everything first so the runs proceed in parallel.
    submissions = {
        idx: pool.submit(run_pipeline, pipeline=run["pipeline"], criterion="largely_recommended")
        for idx, run in enumerate(runs)
    }
    for idx, future in submissions.items():
        run = runs[idx]
        pipeline = run["pipeline"]

        # `run_dir` rather than `dir`, to avoid shadowing the builtin at module level.
        # exist_ok=True lets the cell be re-run; earlier outputs are overwritten.
        run_dir = Path(f"./run_{idx:02}")
        run_dir.mkdir(exist_ok=True)

        # Close the file handle deterministically instead of leaking it.
        with (run_dir / "pipeline.json").open("w") as config_file:
            json.dump(pipeline.to_json(), config_file)

        # Blocks until this run has finished; re-raises any exception from the worker.
        indiv_scores, global_scores = future.result()
        indiv_scores.to_csv(run_dir / "indiv_scores.csv")
        global_scores.to_csv(run_dir / "global_scores.csv")
        plot_individual_scores(indiv_scores, subtitle=run["subtitle"], dir=run_dir)
        plot_global_scores(global_scores, subtitle=run["subtitle"], dir=run_dir)
