In [2]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import ipynbname
# Root directory used to locate the result CSVs: `parents[1]` of the resolved notebook path,
# i.e. the parent of the directory that contains this notebook.
# NOTE(review): relies on `ipynbname.path()` returning this notebook's file path — confirm it
# works in the environment where the notebook is run.
ROOT = Path(ipynbname.path()).resolve().parents[1]

In [2]:

# Scenario definitions: short key (used in the table label) -> LaTeX text (inserted in the caption)
scenarios = {
    "si_si": "(S1, I1) $\\leftrightarrow$ (S2, I2)",
}

# Classifiers to keep; the table rows follow this order
classifiers_to_keep = ['RandomForest', 'LogisticRegression', 'KNN', 'MLP', 'CatBoost', 'XGBoost']

def format_latex(mean, std):
    """Render a mean/std pair as an inline-math LaTeX table cell.

    Args:
        mean: Average value, printed with two decimals.
        std: Standard deviation, printed with two decimals.

    Returns:
        ``"$<mean>\\pm<std>$"`` when both values are finite,
        ``"$<mean>$"`` when only ``std`` is NaN/inf (e.g. the std of a single sample),
        and ``"--"`` when ``mean`` itself is NaN/inf.
    """
    import math

    if not math.isfinite(mean):
        return "--"
    if not math.isfinite(std):
        # Formatting NaN/inf after "\pm" would emit "\pmnan" / "\pminf", which LaTeX
        # parses as a single undefined control sequence and refuses to compile.
        return f"${mean:.2f}$"
    return f"${mean:.2f}\\pm{std:.2f}$"


In [5]:

# Metric label -> (mean column, std column) in the aggregated frame; dict order is the row order.
metric_columns = {
    "Precision": ("mean_precision", "std_precision"),
    "Recall": ("mean_recall", "std_recall"),
    "F1": ("mean_f1", "std_f1"),
}


def _latex_cell(rows, mean_col, std_col, average=False):
    """Format one table cell from the aggregated rows `rows`.

    Args:
        rows: Slice of the aggregated DataFrame the cell is built from.
        mean_col: Name of the column holding the mean of the metric.
        std_col: Name of the column holding the std of the metric.
        average: When False, use the values of the first row; when True, use the
            column means over all of `rows`.

    Returns:
        The string produced by `format_latex`, or "--" when `rows` is empty
        (instead of raising IndexError on `.values[0]`).
    """
    if rows.empty:
        return "--"
    if average:
        return format_latex(rows[mean_col].mean(), rows[std_col].mean())
    return format_latex(rows[mean_col].values[0], rows[std_col].values[0])


for abr, scenario in scenarios.items():
    # Load the results.
    # NOTE(review): the path does not depend on `abr`, so every scenario reads the same file — confirm.
    path = ROOT/'RQ1_Effectiveness/f1_prec_recall_per_cls_model.csv'
    df = pd.read_csv(path)

    # Keep only the selected classifiers
    df = df[df['classifier'].isin(classifiers_to_keep)]

    # Mean and standard deviation of each metric per (classifier, embedding model)
    grouped = df.groupby(['classifier', 'embedding_model']).agg(
        mean_precision=('precision', 'mean'),
        std_precision=('precision', 'std'),
        mean_recall=('recall', 'mean'),
        std_recall=('recall', 'std'),
        mean_f1=('f1', 'mean'),
        std_f1=('f1', 'std')
    ).reset_index()

    # One table column per embedding model present in the aggregated data
    embeddings = grouped['embedding_model'].unique()

    # Build the LaTeX by hand (requested style)
    header = (
        "\\begin{table}[!ht]\n"
        "\\footnotesize\n"
        "\\centering\n"
        f"\\caption{{Precision, Recall, and F1-score (mean $\\pm$ std) per classifier and embedding model for scenario {scenario}.}}\n"
        f"\\label{{tab:prec_rec_f1_{abr}}}\n"
        "\\resizebox{\\textwidth}{!}{%\n"
        "\\begin{tabular}{l c " + "c" * len(embeddings) + " c}\n"
        "\\toprule\n"
        "\\textbf{Classifier} & \\textbf{Metric} & " + " & ".join(embeddings) + " & \\textbf{Classifier Mean} \\\\\n"
        "\\midrule\n"
    )

    body = ""
    for clf in classifiers_to_keep:
        sub = grouped[grouped['classifier'] == clf]
        for i, (metric, (mcol, scol)) in enumerate(metric_columns.items()):
            # Only the first metric row of a classifier carries the \multirow label.
            row = "  & " + metric if i > 0 else f"\\multirow{{3}}{{*}}{{{clf}}} & {metric}"
            # A missing classifier/embedding pair yields "--" rather than an IndexError.
            values = [
                _latex_cell(sub[sub['embedding_model'] == emb], mcol, scol)
                for emb in embeddings
            ]
            # Per-classifier average over embeddings.
            # NOTE(review): the reported std is the mean of the per-embedding stds,
            # not a std computed over the pooled runs — confirm this is intended.
            values.append(_latex_cell(sub, mcol, scol, average=True))
            row += " & " + " & ".join(values) + " \\\\\n"
            body += row
        body += "\\midrule\n"

    # Embedding Mean: per-embedding average over the classifiers
    for i, (metric, (mcol, scol)) in enumerate(metric_columns.items()):
        row = "  & " + metric if i > 0 else "\\multirow{3}{*}{\\textbf{Embedding Mean}} & " + metric
        values = [
            _latex_cell(grouped[grouped['embedding_model'] == emb], mcol, scol, average=True)
            for emb in embeddings
        ]
        # No overall mean is reported in the "Classifier Mean" column of this block.
        values.append("--")
        row += " & " + " & ".join(values) + " \\\\\n"
        body += row
    body += "\\bottomrule\n"

    footer = "\\end{tabular}\n}\n\\end{table}\n"

    latex_code = header + body + footer
    print("\n")
    print(latex_code)




\begin{table}[!ht]
\footnotesize
\centering
\caption{Precision, Recall, and F1-score (mean $\pm$ std) per classifier and embedding model for scenario (S1, I1) $\leftrightarrow$ (S2, I2).}
\label{tab:prec_rec_f1_si_si}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c cccccc c}
\toprule
\textbf{Classifier} & \textbf{Metric} & albert & bart & bert & distilbert & minilm & roberta & \textbf{Classifier Mean} \\
\midrule
\multirow{3}{*}{RandomForest} & Precision & $0.99\pm0.00$ & $0.99\pm0.00$ & $0.99\pm0.00$ & $0.99\pm0.00$ & $0.99\pm0.00$ & $0.99\pm0.00$ & $0.99\pm0.00$ \\
  & Recall & $0.82\pm0.01$ & $0.82\pm0.01$ & $0.82\pm0.01$ & $0.82\pm0.01$ & $0.83\pm0.01$ & $0.82\pm0.01$ & $0.82\pm0.01$ \\
  & F1 & $0.90\pm0.00$ & $0.90\pm0.00$ & $0.90\pm0.00$ & $0.90\pm0.00$ & $0.90\pm0.00$ & $0.90\pm0.00$ & $0.90\pm0.00$ \\
\midrule
\multirow{3}{*}{LogisticRegression} & Precision & $0.30\pm0.00$ & $0.30\pm0.00$ & $0.30\pm0.00$ & $0.30\pm0.00$ & $0.30\pm0.00$ & $0.30\pm0.00$ & $0.30\pm0.00$ \\
  & 

In [7]:
import pandas as pd

# Display names for the embeddings: raw key (as compared against `embedding_model`) -> pretty name
# used in the table header. The dict order is the column order of the table.
embedding_names = {
    "albert": "ALBERT",
    "bart": "BART",
    "bert": "BERT",
    "distilbert": "DistilBERT",
    "minilm": "MiniLM",
    "roberta": "RoBERTa"
}

# Scenario key (used in the table label) -> LaTeX text (inserted in the caption).
# NOTE(review): same content as the `scenarios` mapping defined in the earlier configuration cell.
scenarios = {
    "si_si": "(S1, I1) $\\leftrightarrow$ (S2, I2)",
}

def format_latex(mean, std):
    """Render a mean/std pair as an inline-math LaTeX table cell.

    Args:
        mean: Average value, printed with two decimals.
        std: Standard deviation, printed with two decimals.

    Returns:
        ``"$<mean>\\pm<std>$"`` when both values are finite,
        ``"$<mean>$"`` when only ``std`` is NaN/inf (e.g. the std of a single sample),
        and ``"--"`` when ``mean`` itself is NaN/inf.
    """
    import math

    if not math.isfinite(mean):
        return "--"
    if not math.isfinite(std):
        # Formatting NaN/inf after "\pm" would emit "\pmnan" / "\pminf", which LaTeX
        # parses as a single undefined control sequence and refuses to compile.
        return f"${mean:.2f}$"
    return f"${mean:.2f}\\pm{std:.2f}$"

# Metric label -> (mean column, std column) in the aggregated frame; dict order is the row order.
metric_columns = {
    "Precision": ("mean_precision", "std_precision"),
    "Recall": ("mean_recall", "std_recall"),
    "F1": ("mean_f1", "std_f1"),
}

for abr, scenario in scenarios.items():
    # Load the results.
    # NOTE(review): the path does not depend on `abr`, so every scenario reads the same file — confirm.
    path =  ROOT/'RQ1_Effectiveness/f1_prec_recall_per_cls_model_cat.csv'
    df = pd.read_csv(path)

    # Keep XGBoost only
    df = df[df["classifier"] == "XGBoost"]

    # Mean and standard deviation of each metric per (category, embedding model)
    grouped = df.groupby(["category", "embedding_model"]).agg(
        mean_precision=("precision", "mean"),
        std_precision=("precision", "std"),
        mean_recall=("recall", "mean"),
        std_recall=("recall", "std"),
        mean_f1=("f1", "mean"),
        std_f1=("f1", "std")
    ).reset_index()

    categories = grouped["category"].unique()
    # Columns follow the key order of `embedding_names`, not the order found in the data.
    embeddings = list(embedding_names.keys())

    # Build the table header
    header = (
        "\\begin{table}[!ht]\n"
        "\\footnotesize\n"
        "\\centering\n"
        f"\\caption{{Precision, Recall, and F1-score (mean $\\pm$ std) with XGBoost across categories for scenario {scenario}.}}\n"
        f"\\label{{tab:xgboost_{abr}}}\n"
        "\\resizebox{\\textwidth}{!}{%\n"
        "\\begin{tabular}{l c " + "c" * len(embedding_names) + "}\n"
        "\\toprule\n"
        "\\textbf{Category} & \\textbf{Metric} & "
        + " & ".join(embedding_names.values()) + " \\\\\n"
        "\\midrule\n"
    )

    # One block of three metric rows per category.
    category_blocks = []
    for cat in categories:
        sub = grouped[grouped["category"] == cat]
        block = ""
        for i, (metric, (mcol, scol)) in enumerate(metric_columns.items()):
            # Only the first metric row of a category carries the category name.
            row = f"{cat} & {metric}" if i == 0 else f" & {metric}"
            values = []
            for emb_key in embeddings:
                tmp = sub[sub["embedding_model"] == emb_key]
                if not tmp.empty:
                    mean = tmp[mcol].values[0]
                    std = tmp[scol].values[0]
                    values.append(format_latex(mean, std))
                else:
                    # No aggregated row for this category/embedding pair.
                    values.append("--")
            row += " & " + " & ".join(values) + " \\\\\n"
            block += row
        category_blocks.append(block)

    # Put \midrule only BETWEEN categories: the footer already opens with \bottomrule,
    # so a trailing \midrule would draw two stacked rules at the bottom of the table.
    body = "\\midrule\n".join(category_blocks)

    footer = "\\bottomrule\n\\end{tabular}\n}\n\\end{table}\n"

    latex_code = header + body + footer
    print("\n\n")
    print(latex_code)





\begin{table}[!ht]
\footnotesize
\centering
\caption{Precision, Recall, and F1-score (mean $\pm$ std) with XGBoost across categories for scenario (S1, I1) $\leftrightarrow$ (S2, I2).}
\label{tab:xgboost_si_si}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c cccccc}
\toprule
\textbf{Category} & \textbf{Metric} & ALBERT & BART & BERT & DistilBERT & MiniLM & RoBERTa \\
\midrule
ChEMBL & Precision & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.94\pm0.01$ & $0.95\pm0.01$ \\
 & Recall & $0.96\pm0.01$ & $0.96\pm0.01$ & $0.96\pm0.01$ & $0.96\pm0.01$ & $0.96\pm0.01$ & $0.96\pm0.01$ \\
 & F1 & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.95\pm0.01$ & $0.95\pm0.00$ & $0.95\pm0.01$ \\
\midrule
Magellan & Precision & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ \\
 & Recall & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0.00$ \\
 & F1 & $1.00\pm0.00$ & $1.00\pm0.00$ & $1.00\pm0