## Plots

This notebook contains the code used to build the plots.

In [None]:
import os
import pandas as pd
import plotly.graph_objects as go

In [None]:
# --- Configuration ---
# `res_to_plot` selects which results file is read below;
# `to_plot` selects which metric column the plotting cell uses.
res_to_plot = "single" # "single" or "multi"
to_plot = "Recall" # "Recall" or "NDCG"

# Read the result records and compute mean / std of both metrics for every
# (parser, chunker) pair, rounded to two decimals.
retrieval_perfs = pd.read_json(f"results/recalls_ndcg/recall_ndcg_{res_to_plot}.json", orient="records")
agg = retrieval_perfs.groupby(["parser", "chunker"])[["recall", "ndcg"]].aggregate(["mean", "std"]).round(2)
agg = agg.reset_index()

# Display name -> identifier used in the results file.
parser_name_mapping = {
    "PyPDF": "base",
    "ChunkNorris": "chunknorris",
    "Docling": "docling",
    "Marker": "marker",
    "Open-Parse-P": "openparsecpu",
    "Open-Parse-U": "openparsegpu",
}
# The mapping is inverted so the raw identifiers are replaced by display names.
agg["parser"] = agg["parser"].map({v: k for k, v in parser_name_mapping.items()})

# Chunker identifier -> short label shown on the x axis.
chunker_name_mapping = {
    "PageChunker": "PC",
    "RecursiveCharacterChunker": "RCC",
    "Default": "Default"
}
agg["chunker"] = agg["chunker"].map(chunker_name_mapping)

# Rows that use the parser's default chunker are labelled with the parser name.
uses_default_chunker = agg["chunker"] == "Default"
agg.loc[uses_default_chunker, "chunker"] = agg[uses_default_chunker]["parser"].tolist()

# Strip only the trailing "-P" / "-U" variant suffix. A plain replace of "-P"
# would also hit the "-P" inside "Open-Parse" and yield "Openarse".
agg["chunker"] = agg["chunker"].str.replace(r"-[PU]$", "", regex=True)

agg = agg.rename({"recall": "Recall", "ndcg": "NDCG"}, axis=1)
agg = agg.sort_values(by=["chunker"])
agg

In [None]:
# Bar fill colour for each parser display name.
colors_mapping = dict(
    [
        ("ChunkNorris", "#2A3466"),
        ("Docling", "#5849BB"),
        ("Marker", "#5C00B8"),
        ("Open-Parse-P", "#477CE4"),
        ("Open-Parse-U", "#4CCCE6"),
        ("PyPDF", "#8FF5F6"),
    ]
)

# Bar fill pattern for each chunker label ("" means a plain fill).
pattern_mapping = dict(Default="", PC="x", RCC="+")

In [None]:
# Bar chart with one bar per (parser, chunker) row of `agg`.
# The x axis is two-level (parser, then chunker label); bar height is the mean
# of the selected metric and the error bar is its standard deviation.
fig = go.Figure(
    go.Bar(
        x=[agg["parser"], agg["chunker"]],
        y=agg[to_plot, "mean"],
        error_y=dict(
            type='data',
            array=agg[to_plot, "std"],
            visible=True,
            color='lightgray',
            thickness=1,
            width=3,
            ),
        # Colour encodes the parser, fill pattern encodes the chunker.
        marker_color=[colors_mapping[x] for x in agg["parser"]],
        # Chunker labels without a dedicated pattern get a plain fill.
        marker_pattern_shape=[pattern_mapping.get(x, "") for x in agg["chunker"]],
        marker_line_color="black",
        textangle=0
    )
)

# Write the mean value as a rotated label near the bottom of each bar.
for parser, chunker, value in zip(agg["parser"], agg["chunker"], agg[to_plot, "mean"]):
    bar_color = colors_mapping[parser]
    fig.add_annotation(
        x=(parser, chunker),
        y=.07,
        # Fixed two-decimal formatting keeps every label the same width
        # (e.g. "0.50" rather than "0.5").
        text=f"{value:.2f}",
        showarrow=False,
        textangle=90,
        # Black text on the lightest bar colour, white on all others.
        font_color="black" if bar_color == "#8FF5F6" else "white",
        font_size=14,
        bgcolor=bar_color,
        opacity=0.85,
    )

fig.update_layout(
    xaxis_title_text="",
    yaxis_title_text=f"{to_plot}@10",
    yaxis_range=[0,1.1],
    xaxis_color="black",
    yaxis_color="black",
    width=600,
    margin=dict(l=0, r=0, t=0, b=0),
    barmode='group',
    showlegend=False,
    # Legend placement; only takes effect if `showlegend` is switched on.
    legend=dict(
        yanchor="top",
        y=1.,
        xanchor="left",
        x=.6
        ),
    template="plotly_white",
    bargap=0.2,
    bargroupgap=0.1,
    font_size=14,
    )

fig.show()

In [None]:
# Export the figure as a PDF named after the plotted metric and result set.
# The output directory is created first so the export does not fail when it
# is missing (e.g. on a fresh checkout).
output_dir = "outputs/plots"
os.makedirs(output_dir, exist_ok=True)
fig.write_image(f"{output_dir}/{to_plot}at10_{res_to_plot}chunkdataset.pdf")