In [None]:
import pandas as pd
import re
import plotly.graph_objects as go

In [None]:
# Raw CSV of the combined European leaderboard in the EuroEval/leaderboards
# GitHub repository (fetched over the network on every run).
leaderboard_url = (
    "https://raw.githubusercontent.com/EuroEval/leaderboards/refs/heads/main/leaderboards/european_all.csv"
)
df = pd.read_csv(leaderboard_url)

# Show the first two rows as a quick sanity check of the raw layout.
df.head(2)

In [None]:
# Columns that must not end up in the plots: every "*_version" column, the
# column obtained by deleting "_version" from each of those names, and a fixed
# set of metadata columns.
version_columns = [name for name in df.columns if name.endswith("_version")]
columns_to_drop = [
    *version_columns,
    *(name.replace("_version", "") for name in version_columns),
    "generative_type",
    "parameters",
    "vocabulary_size",
    "context",
    "commercial",
    "merge",
    "rank",
]


def _for_strings(transform):
    """Wrap ``transform`` so it is applied to string cells only.

    Non-string cells are returned unchanged.
    """

    def apply(cell):
        return transform(cell) if isinstance(cell, str) else cell

    return apply


# Element-wise clean-up passes, run over the whole frame in this order.
_cell_cleaners = [
    # Keep only the text before the first "@@".
    _for_strings(lambda text: text.split("@@")[0]),
    # Replace an HTML-style element with its inner text.
    _for_strings(lambda text: re.sub(r"<.*?>(.*?)</.*>", r"\1", text)),
    # Remove a leading "gemini/" or "xai/" prefix.
    _for_strings(lambda text: re.sub(r"^(gemini|xai)/", "", text)),
    # Treat a bare "-" as missing so the dropna() below discards that row.
    lambda cell: None if cell == "-" else cell,
]

cleaned_cells = df
for cleaner in _cell_cleaners:
    cleaned_cells = cleaned_cells.map(cleaner)

# Discard the unwanted columns and every row with a missing value, index the
# frame by model name, and convert all remaining cells to float.
clean_df = (
    cleaned_cells.drop(columns=columns_to_drop)
    .dropna()
    .set_index("model")
    .map(float)
)

def _zero_shot_cell(cell):
    """Clean one cell for the zero-shot table.

    Non-string cells are returned unchanged. A string containing "few-shot"
    becomes ``None``; any other string has its parenthesised part (and the
    spaces before it) removed.
    """
    if not isinstance(cell, str):
        return cell
    if "few-shot" in cell:
        return None
    return re.sub(r" *\(.*\)", "", cell)


# Rows whose string cell was blanked above are removed by dropna(), leaving
# only the entries that do not mention "few-shot".
zero_shot_df = (
    clean_df.reset_index()
    .map(_zero_shot_cell)
    .dropna()
    .set_index("model")
)

def _few_shot_cell(cell):
    """Clean one cell for the few-shot table.

    Non-string cells are returned unchanged. A string that does not contain
    "few-shot" becomes ``None``; any other string loses its parenthesised part
    (and the spaces before it) and then everything up to the last "/".
    """
    if not isinstance(cell, str):
        return cell
    if "few-shot" not in cell:
        return None
    without_suffix = re.sub(r" *\(.*\)", "", cell)
    return re.sub(r"^.*/", "", without_suffix)


# Rows whose string cell was blanked above are removed by dropna(), leaving
# only the entries that mention "few-shot".
few_shot_df = (
    clean_df.reset_index()
    .map(_few_shot_cell)
    .dropna()
    .set_index("model")
)

In [None]:
def plot_models(
    df: pd.DataFrame, models: list[str], title: str, max_score: float
) -> None:
    """Create a spider plot of a list of models.

    Each model becomes one filled polygon whose radial values are the values in
    that model's row of ``df``. The angular axis has one spoke per column of
    ``df``, labelled with the capitalised column name.

    Args:
        df:
            Frame indexed by model name, with one column per spoke.
        models:
            Index labels of the rows to plot; one trace is added per label.
        title:
            Figure title. Surrounding whitespace is stripped and
            " (smaller is better)" is appended.
        max_score:
            Lower end of the radial axis range. The axis is given the range
            ``[max_score, 1]``, i.e. it runs from ``max_score`` to 1.

    Raises:
        KeyError:
            If any label in ``models`` is missing from ``df.index``.
    """
    # Fail before drawing anything, and report every missing label at once.
    missing = [model for model in models if model not in df.index]
    if missing:
        raise KeyError(f"Models not found in the dataframe index: {missing}")

    # Take the spoke labels from the frame that is actually being plotted
    # rather than from a notebook-level global, so that labels and values
    # always line up and the function works on any frame passed in.
    axis_labels = [column.capitalize() for column in df.columns]

    fig = go.Figure()

    for model in models:
        model_scores = df.loc[model, :].tolist()
        trace = go.Scatterpolar(
            r=model_scores,
            theta=axis_labels,
            name=model,
            fill="toself",
        )
        fig.add_trace(trace)

    fig.update_layout(
        polar=dict(radialaxis=dict(range=[max_score, 1])),
        showlegend=True,
        title=title.strip() + " (smaller is better)",
        width=800,
        height=500,
    )
    # scale=6 applies to the toolbar's image-export button.
    fig.show(config=dict(toImageButtonOptions=dict(scale=6)))

In [None]:
# Reasoning models compared on the zero-shot table; radial axis spans 3 to 1.
reasoning_models = [
    "o3-2025-04-16",
    "gemini-2.5-pro-preview-03-25",
    "gemini-2.5-flash-preview-04-17",
]
plot_models(
    df=zero_shot_df,
    models=reasoning_models,
    title="Zero-shot Performance of SOTA Reasoning LLMs",
    max_score=3,
)

In [None]:
# Non-reasoning models compared on the zero-shot table; radial axis spans 3 to 1.
non_reasoning_models = [
    "gpt-4.1-2025-04-14",
    "grok-3-beta",
    "claude-3-5-sonnet-20241022",
    "gemini-2.0-flash-001",
]
plot_models(
    df=zero_shot_df,
    models=non_reasoning_models,
    title="Zero-shot Performance of SOTA Non-Reasoning LLMs",
    max_score=3,
)

In [None]:
# ~300M models compared on the few-shot table; radial axis spans 7 to 1.
models_300m = [
    "SmolLM2-360M",
    "Pleias-Pico",
    "gpt-sw3-356m",
]
plot_models(
    df=few_shot_df,
    models=models_300m,
    title="Few-shot Performance of ~300M LMs",
    max_score=7,
)

In [None]:
# ~1B models compared on the few-shot table; radial axis spans 6 to 1.
models_1b = [
    "Llama-3.2-1B-Instruct",
    "gemma-3-1b-it",
    "Pleias-1.2b-Preview",
]
plot_models(
    df=few_shot_df,
    models=models_1b,
    title="Few-shot Performance of ~1B LMs",
    max_score=6,
)

In [None]:
# ~8B models compared on the few-shot table; radial axis spans 5 to 1.
models_8b = [
    "gemma-3-12b-it",
    "cogito-v1-preview-llama-8B",
    "Llama-3.1-8B-Instruct",
    "EuroLLM-9B-Instruct",
    "occiglot-7b-eu5-instruct",
    "Teuken-7B-instruct-commercial-v0.4",
]
plot_models(
    df=few_shot_df,
    models=models_8b,
    title="Few-shot Performance of ~8B LMs",
    max_score=5,
)

In [None]:
# ~30B models compared on the few-shot table; radial axis spans 4 to 1.
models_30b = [
    "gemma-3-27b-it",
    "Mistral-Small-24B-Instruct-2501",
    "aya-expanse-32b",
]
plot_models(
    df=few_shot_df,
    models=models_30b,
    title="Few-shot Performance of ~30B LMs",
    max_score=4,
)

In [None]:
# ~70B models compared on the few-shot table; radial axis spans 3 to 1.
models_70b = [
    "Meta-Llama-3-70B",
    "Llama-3.3-70B-Instruct",
    "Qwen2.5-72B-Instruct",
]
plot_models(
    df=few_shot_df,
    models=models_70b,
    title="Few-shot Performance of ~70B LMs",
    max_score=3,
)