In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import ipynbname
# Project root: two directory levels above the resolved location of this notebook.
_notebook_file = Path(ipynbname.path()).resolve()
ROOT = _notebook_file.parents[1]

In [5]:
# Scenario key; the same value is used as the short label suffix (`abr`).
scenario = 'si_si'
abr = scenario

# Per-classifier / per-embedding results file, read into `df`.
path = ROOT / 'RQ1_Effectiveness' / 'f1_prec_recall_per_cls_model.csv'
df = pd.read_csv(path)

# Classifiers reported in the timing table, in row order.
classifiers_to_keep = [
    'RandomForest',
    'LogisticRegression',
    'KNN',
    'MLP',
    'CatBoost',
    'XGBoost',
]
def format_latex(mean, std):
    """Return ``$<mean>\\pm<std>$`` with both numbers rounded to two decimals."""
    template = "${:.2f}\\pm{:.2f}$"
    return template.format(mean, std)

In [8]:
# Mean and standard deviation of the train/test elapsed times for every
# (classifier, embedding_model) pair. Output column -> (source column, reducer).
_timing_aggregations = {
    "mean_train": ("elapsed_train", "mean"),
    "std_train": ("elapsed_train", "std"),
    "mean_test": ("elapsed_test", "mean"),
    "std_test": ("elapsed_test", "std"),
}
grouped = (
    df.groupby(['classifier', 'embedding_model'])
    .agg(**_timing_aggregations)
    .reset_index()
)

# Embedding models in order of first appearance; they become the table columns.
embeddings = grouped['embedding_model'].unique()

# Build the LaTeX by hand (requested style).
# The scenario key contains underscores (e.g. "si_si"). A bare "_" is only
# valid in LaTeX math mode, so it is escaped for the caption text. The label
# keeps the raw key because \label accepts underscores.
scenario_tex = scenario.replace("_", "\\_")
# One left-aligned classifier column, one metric column, one column per
# embedding model and a final per-classifier mean column.
column_spec = "l c " + "c" * len(embeddings) + " c"
embedding_columns = " & ".join(embeddings)
header = (
    "\\begin{table}[!ht]\n"
    "\\footnotesize\n"
    "\\centering\n"
    f"\\caption{{Training and testing time (in seconds, mean $\\pm$ std) per classifier and embedding model for scenario {scenario_tex}.}}\n"
    f"\\label{{tab:time_{abr}}}\n"
    "\\resizebox{\\textwidth}{!}{%\n"
    f"\\begin{{tabular}}{{{column_spec}}}\n"
    "\\toprule\n"
    f"\\textbf{{Classifier}} & \\textbf{{Metric}} & {embedding_columns} & \\textbf{{Classifier Mean}} \\\\\n"
    "\\midrule\n"
)

# One pair of rows (train time, test time) per classifier.
# Each entry is (row label, mean column, std column) in `grouped`.
time_metrics = (
    ("Train Time", "mean_train", "std_train"),
    ("Test Time", "mean_test", "std_test"),
)

body = ""
for clf in classifiers_to_keep:
    sub = grouped[grouped['classifier'] == clf]
    for i, (metric, mcol, scol) in enumerate(time_metrics):
        # Only the first of the two rows carries the \multirow classifier name.
        row = f"\\multirow{{2}}{{*}}{{{clf}}} & {metric}" if i == 0 else "  & " + metric
        values = []
        for emb in embeddings:
            cell = sub.loc[sub['embedding_model'] == emb, [mcol, scol]]
            if cell.empty:
                # Missing (classifier, embedding) pair: print a placeholder
                # instead of failing with IndexError on an empty selection.
                values.append("--")
            else:
                values.append(format_latex(cell[mcol].iloc[0], cell[scol].iloc[0]))
        # Per-classifier summary: mean of the per-embedding means and
        # mean of the per-embedding standard deviations.
        if sub.empty:
            values.append("--")
        else:
            values.append(format_latex(sub[mcol].mean(), sub[scol].mean()))
        row += " & " + " & ".join(values) + " \\\\\n"
        body += row
    body += "\\midrule\n"

# Embedding Mean: two closing rows averaging each embedding over all rows of
# `grouped` (mean of means, and mean of standard deviations).
_summary_metrics = [
    ("Train Time", "mean_train", "std_train"),
    ("Test Time", "mean_test", "std_test"),
]
for position, (metric, mcol, scol) in enumerate(_summary_metrics):
    if position > 0:
        row = "  & " + metric
    else:
        row = "\\multirow{2}{*}{\\textbf{Embedding Mean}} & " + metric
    values = [
        format_latex(per_embedding[mcol].mean(), per_embedding[scol].mean())
        for per_embedding in (
            grouped.loc[grouped['embedding_model'] == emb] for emb in embeddings
        )
    ]
    # The bottom-right cell (mean of means) is intentionally left blank.
    values.append("--")
    row += " & " + " & ".join(values) + " \\\\\n"
    body += row
body += "\\bottomrule\n"

# Close tabular, the \resizebox group and the table environment.
footer = "\n".join(["\\end{tabular}", "}", "\\end{table}", ""])

latex_code = "".join([header, body, footer])

print(f"\n\n#====={abr} =====\n")
print(latex_code)




#=====si_si =====

\begin{table}[!ht]
\footnotesize
\centering
\caption{Training and testing time (in seconds, mean $\pm$ std) per classifier and embedding model for scenario si_si.}
\label{tab:time_si_si}
\resizebox{\textwidth}{!}{%
\begin{tabular}{l c cccccc c}
\toprule
\textbf{Classifier} & \textbf{Metric} & albert & bart & bert & distilbert & minilm & roberta & \textbf{Classifier Mean} \\
\midrule
\multirow{2}{*}{RandomForest} & Train Time & $226.13\pm2.37$ & $225.76\pm2.81$ & $227.01\pm5.22$ & $226.75\pm3.83$ & $227.16\pm2.77$ & $227.85\pm5.22$ & $226.78\pm3.70$ \\
  & Test Time & $2.49\pm0.16$ & $2.47\pm0.13$ & $2.48\pm0.16$ & $2.43\pm0.11$ & $2.51\pm0.16$ & $2.48\pm0.14$ & $2.48\pm0.14$ \\
\midrule
\multirow{2}{*}{LogisticRegression} & Train Time & $80.45\pm17.61$ & $80.56\pm17.81$ & $81.15\pm18.60$ & $81.26\pm18.58$ & $87.78\pm20.48$ & $81.11\pm18.41$ & $82.05\pm18.58$ \\
  & Test Time & $0.02\pm0.00$ & $0.02\pm0.00$ & $0.02\pm0.00$ & $0.02\pm0.00$ & $0.02\pm0.00$ & $0.02\pm0

In [13]:
import pandas as pd

# Display names for the embedding models (file-name key -> label).
embedding_names = {
    "albert": "ALBERT",
    "bart": "BART",
    "bert": "BERT",
    "distilbert": "DistilBERT",
    "minilm": "MiniLM",
    "roberta": "RoBERTa"
}
# Scenario key -> LaTeX description used in captions.
# Every backslash is doubled on purpose: in a regular string literal "\v" is
# the vertical-tab escape, so a single-backslash "\varnothing" would lose its
# backslash and inject a control character into the generated LaTeX.
scenarios = {
    "si_si": "(S1, I1) $\\leftrightarrow$ (S2, I2)",
    "s_s": "(S1, $\\varnothing$) $\\leftrightarrow$ (S2, $\\varnothing$)",
    "i_i": "($\\varnothing$, I1) $\\leftrightarrow$ ($\\varnothing$, I2)",
    "si_s":  "(S1, I1) $\\leftrightarrow$ (S2, $\\varnothing$)",
    "si_i":  "(S1, I1) $\\leftrightarrow$ ($\\varnothing$, I2)",
    "s_i":   "(S1, $\\varnothing$) $\\leftrightarrow$ ($\\varnothing$, I2)"
}


# Model keys, in the order of `embedding_names`.
models = list(embedding_names.keys())

# Format one table cell from a (source, target) pair of timings.
def format_cell(src, tgt):
    """Return ``$<src>$ / $<tgt>$`` with both numbers rounded to two decimals."""
    return " / ".join(f"${value:.2f}$" for value in (src, tgt))

# Load one timings CSV per embedding model, tag its rows with the model's
# display name, then stack everything into a single frame.
all_results = []
for model in models:
    path = ROOT / f"RQ2_Effeciency/timings/{model}_embeddings_timings.csv"
    df = pd.read_csv(path).assign(model=embedding_names[model])
    all_results.append(df)

df_all = pd.concat(all_results)

# Categories shown as table columns (including TPC-DI).
categories = ["ChEMBL", "Magellan", "OpenData", "TPC-DI", "Wikidata"]

# Emit one LaTeX table per scenario.
# `abr` is the scenario key as stored in the "scenario" column;
# `scenario` is its LaTeX description, used only in the caption.
for abr,scenario in scenarios.items():
    df_scenario = df_all[df_all["scenario"] == abr]

    # Mean source/target embedding time per (model, category).
    grouped = df_scenario.groupby(["model", "category"]).agg(
        mean_src=("t_embed_src_s", "mean"),
        mean_tgt=("t_embed_tgt_s", "mean")
    ).reset_index()

    # Build the formatted table: one row per model, one column per category.
    formatted = pd.DataFrame(index=embedding_names.values(), columns=categories)
    for model in embedding_names.values():
        for cat in categories:
            sub = grouped[(grouped["model"] == model) & (grouped["category"] == cat)]
            if not sub.empty:
                formatted.loc[model, cat] = format_cell(sub["mean_src"].values[0], sub["mean_tgt"].values[0])
            else:
                # No measurement for this (model, category) pair.
                formatted.loc[model, cat] = "--"

    # Build the caption separately to avoid brace-escaping problems in the f-string below.
    caption_text = (
        f"Embedding time per model and category for scenario {scenario}. "
        "Each cell shows $(\\mathcal{S}_1^{\\mathrm{gen}}, \\mathcal{I}_1^{\\mathrm{gen}})|"
        "(\\mathcal{S}_2^{\\mathrm{gen}}, \\mathcal{I}_2^{\\mathrm{gen}})$ time (seconds)."
    )

    # Raw tabular body; escape=False keeps the $...$ cells untouched.
    latex_body = formatted.to_latex(
        escape=False,
        index=True,
        column_format="l" + "c" * len(categories)
    )

    # The label is built from the short key `abr`, not from the LaTeX
    # description: the description contains spaces, "$" and macros, which
    # makes an unusable \label / \ref key.
    latex_code = (
        "\\begin{table}[!ht]\n"
        "\\footnotesize\n"
        "\\centering\n"
        f"\\caption{{{caption_text}}}\n"
        f"\\label{{tab:embedding_times_{abr}}}\n"
        "\\resizebox{\\textwidth}{!}{%\n"
        + latex_body +
        "}\n"
        "\\end{table}\n"
    )

    print("\n\n")
    print(latex_code)





\begin{table}[!ht]
\footnotesize
\centering
\caption{Embedding time per model and category for scenario (S1, I1) $\leftrightarrow$ (S2, I2). Each cell shows $(\mathcal{S}_1^{\mathrm{gen}}, \mathcal{I}_1^{\mathrm{gen}})|(\mathcal{S}_2^{\mathrm{gen}}, \mathcal{I}_2^{\mathrm{gen}})$ time (seconds).}
\label{tab:embedding_times_(S1, I1) $\leftrightarrow$ (S2, I2)}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccc}
\toprule
 & ChEMBL & Magellan & OpenData & TPC-DI & Wikidata \\
\midrule
ALBERT & $8.29$ / $8.46$ & $2.73$ / $4.35$ & $19.21$ / $19.47$ & $7.83$ / $5.57$ & $7.30$ / $7.25$ \\
BART & $9.15$ / $9.48$ & $2.99$ / $4.56$ & $20.40$ / $20.60$ & $7.77$ / $5.43$ & $7.29$ / $7.29$ \\
BERT & $8.29$ / $8.57$ & $2.49$ / $3.95$ & $18.68$ / $18.61$ & $7.47$ / $5.26$ & $6.63$ / $6.81$ \\
DistilBERT & $4.02$ / $4.06$ & $1.25$ / $2.35$ & $8.98$ / $8.92$ & $3.50$ / $2.47$ & $3.15$ / $3.17$ \\
MiniLM & $2.23$ / $2.24$ & $0.53$ / $1.88$ & $4.58$ / $4.50$ & $1.77$ / $1.19$ & $1.33$ / $1.40$ \\
RoBE

In [25]:
from pathlib import Path
import pandas as pd

# Directory scanned for per-model result files.
BASE = ROOT / "MetaSpace"
# Matches files three directory levels below BASE whose name ends in _meta_CPU.csv.
GLOB_SUFFIX = "*/*/*/*_meta_CPU.csv"

def collect_meta_cpu_files():
    """Return the sorted list of paths under BASE that match GLOB_SUFFIX."""
    return sorted(BASE.glob(GLOB_SUFFIX))

files = collect_meta_cpu_files()
print(f"{len(files)} files found")


# ---------- (optionnel) Concaténer tout dans un seul CSV ----------
def parse_context(p: Path, base=None):
    """
    Split a result-file path into its context fields.

    Expected layout below the base directory:
        <scenario>/<Cat>/<Rel>/<Run>/{model}_meta_<suffix>.csv

    Parameters
    ----------
    p : Path
        Path of the result file; must be located under ``base``.
    base : path-like, optional
        Directory the path is made relative to. Defaults to the module-level
        ``BASE``.

    Returns
    -------
    tuple
        ``(scenario, Cat, Rel, Run, model)``. Directory levels that are absent
        are returned as ``"NA"``. ``model`` is the file name with its
        ``_meta_<suffix>.csv`` tail removed, or the unchanged file name when
        it does not follow that pattern.

    Raises
    ------
    ValueError
        If ``p`` is not located under the base directory.
    """
    root = BASE if base is None else Path(base)
    parts = p.relative_to(root).parts
    filename = parts[-1]

    # Only directory components are context fields; the file name must never
    # be reported as Run (or any other level) when the path is shallower than
    # the documented layout.
    directories = list(parts[:-1][:4])
    directories += ["NA"] * (4 - len(directories))
    scenario, Cat, Rel, Run = directories

    # str.endswith does not understand wildcards, so the "_meta_<suffix>.csv"
    # tail is located explicitly instead of comparing against "_meta_*.csv".
    stem, marker, _ = filename.rpartition("_meta_")
    model = stem if marker and filename.endswith(".csv") else filename
    return scenario, Cat, Rel, Run, model

def concat_all_meta_cpu(files):
    """
    Read every CSV in ``files``, annotate it with its path context and write
    the concatenation to ``BASE / "ALL_meta_CPU_concatenated.csv"``.

    Files that cannot be read are reported on stdout and skipped. Returns the
    concatenated DataFrame, or ``None`` when no file could be read.
    """
    context_columns = ("Scenario", "Category", "Relation", "Run", "Model")
    frames = []
    for csv_path in files:
        try:
            frame = pd.read_csv(csv_path)
        except Exception as err:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"[skip] {csv_path} → {err}")
            continue
        for column, value in zip(context_columns, parse_context(csv_path)):
            frame[column] = value
        frame["FilePath"] = str(csv_path)
        frames.append(frame)

    if len(frames) == 0:
        print("Aucun CSV lisible.")
        return None

    big = pd.concat(frames, ignore_index=True)
    out_path = BASE / "ALL_meta_CPU_concatenated.csv"
    big.to_csv(out_path, index=False)
    print(f"OK: {len(big)} lignes → {out_path}")
    return big



3330 files found


In [27]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Read a single result file.
def read_file(path, scenario_index=7):
    """
    Load one CSV and add a ``Scenario`` column taken from its path.

    Parameters
    ----------
    path : str or Path
        CSV file to read.
    scenario_index : int, default 7
        Position, within the path components, of the component stored in the
        ``Scenario`` column. The default keeps the previously hard-coded
        value, which counts from the filesystem root and therefore depends on
        where the project is checked out.
        NOTE(review): confirm that this component really is the scenario; a
        negative index relative to the file name is more portable.

    Returns
    -------
    pandas.DataFrame
        The file contents with the extra ``Scenario`` column.
    """
    small_df = pd.read_csv(path)
    # Path.parts is separator-agnostic, unlike splitting the string on "/".
    small_df['Scenario'] = Path(path).parts[scenario_index]
    return small_df

# Read all files on a thread pool, showing a progress bar while results arrive.
with ThreadPoolExecutor() as executor:
    _pending_frames = executor.map(read_file, files)
    dfs = list(tqdm(_pending_frames, total=len(files)))

# Stack the per-file frames row-wise under a fresh 0..n-1 index.
df_ = pd.concat(dfs, axis=0, ignore_index=True)
df_.head()

100%|██████████| 3330/3330 [00:48<00:00, 69.01it/s] 


Unnamed: 0,Attribute1,Attribute2,true_match,Category,Relation,Dataset,Model,Euclidean,Cosine,Pearson,...,syn_jaccard_tokens,syn_dice_tokens,syn_jaccard_bigrams,syn_jaccard_trigrams,syn_cosine_bigrams,syn_cosine_trigrams,syn_jaro,syn_jaro_winkler,ExecutionTime,Scenario
0,assay_id,assays_assay_id,1.0,ChEMBL,Joinable,assays_both_50_1_ac1_ev,albert,0.903006,0.40771,0.590609,...,0.666667,0.8,0.857143,0.625,0.916667,0.867722,0.716667,0.83,2.703047,ChEMBL
1,assay_id,assays_tid,0.0,ChEMBL,Joinable,assays_both_50_1_ac1_ev,albert,0.925387,0.428171,0.569629,...,0.0,0.0,0.555556,0.333333,0.721688,0.507093,0.933333,0.96,3.248749,ChEMBL
2,assay_id,assays_relationship_type,0.0,ChEMBL,Joinable,assays_both_50_1_ac1_ev,albert,1.323032,0.875206,0.12163,...,0.0,0.0,0.173913,0.136364,0.356348,0.3,0.722222,0.833333,2.796039,ChEMBL
3,assay_id,assays_confidence_score,0.0,ChEMBL,Joinable,assays_both_50_1_ac1_ev,albert,1.036815,0.537493,0.459911,...,0.0,0.0,0.263158,0.15,0.416667,0.29277,0.782609,0.869565,2.936749,ChEMBL
4,assay_id,assays_curated_by,0.0,ChEMBL,Joinable,assays_both_50_1_ac1_ev,albert,1.35152,0.913303,0.085617,...,0.0,0.0,0.25,0.2,0.436436,0.372104,0.762255,0.857353,2.992144,ChEMBL


In [32]:
import pandas as pd

# Display names for the embedding models (key -> label).
embedding_names = dict(
    albert="ALBERT",
    bart="BART",
    bert="BERT",
    distilbert="DistilBERT",
    minilm="MiniLM",
    roberta="RoBERTa",
)

# Scenario key -> LaTeX description used in the caption.
scenarios = dict(si_si="(S1, I1) $\\leftrightarrow$ (S2, I2)")

# Categories of interest, in column order.
categories = "ChEMBL Magellan OpenData TPC-DI Wikidata".split()

# Format one table cell as mean ± std.
def format_cell(mean, std):
    """Return ``$<mean>\\pm<std>$`` with both numbers rounded to two decimals."""
    return "$" + format(mean, ".2f") + "\\pm" + format(std, ".2f") + "$"

for abr, scenario in scenarios.items():
    # NOTE(review): every row of df_ is used here; nothing filters on the
    # scenario key `abr` — confirm df_ only holds data for this scenario.
    df_scenario = df_.copy()

    # Step 1: total execution time per (Dataset, Category, Model).
    summed = (
        df_scenario
        .groupby(["Dataset", "Category", "Model"])
        .agg(total_time=pd.NamedAgg(column="ExecutionTime", aggfunc="sum"))
        .reset_index()
    )

    # Step 2: mean and std of those totals per (Category, Model).
    grouped = (
        summed
        .groupby(["Category", "Model"])
        .agg(
            mean_time=pd.NamedAgg(column="total_time", aggfunc="mean"),
            std_time=pd.NamedAgg(column="total_time", aggfunc="std"),
        )
        .reset_index()
    )

    # Formatted table: one row per model (display name when one is known),
    # one column per category of interest.
    row_labels = [embedding_names.get(m, m) for m in grouped["Model"].unique()]
    formatted = pd.DataFrame(index=row_labels, columns=categories)
    for cat, raw_model, mean_time, std_time in zip(
        grouped["Category"], grouped["Model"], grouped["mean_time"], grouped["std_time"]
    ):
        if cat not in categories:
            continue
        model = embedding_names.get(raw_model, raw_model)
        formatted.loc[model, cat] = format_cell(mean_time, std_time)

    # Cells that were never filled get a placeholder.
    formatted = formatted.fillna("--")

    # Caption for the LaTeX table.
    caption_text = (
        f"Execution time (s) per embedding model and category for scenario {scenario}. "
        "Execution times are first summed per dataset, then averaged (mean $\\pm$ std) per category."
    )

    # Raw tabular body; escape=False keeps the $...$ cells untouched.
    latex_body = formatted.to_latex(
        escape=False,
        index=True,
        column_format="l" + "c" * len(categories)
    )

    # Wrap the tabular in the full table environment.
    latex_code = "".join([
        "\\begin{table}[!ht]\n",
        "\\footnotesize\n",
        "\\centering\n",
        f"\\caption{{{caption_text}}}\n",
        f"\\label{{tab:exec_time_{abr}}}\n",
        "\\resizebox{\\textwidth}{!}{%\n",
        latex_body,
        "}\n",
        "\\end{table}\n",
    ])

    print("\n\n")
    print(latex_code)





\begin{table}[!ht]
\footnotesize
\centering
\caption{Execution time (s) per embedding model and category for scenario (S1, I1) $\leftrightarrow$ (S2, I2). Execution times are first summed per dataset, then averaged (mean $\pm$ std) per category.}
\label{tab:exec_time_si_si}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lccccc}
\toprule
 & ChEMBL & Magellan & OpenData & TPC-DI & Wikidata \\
\midrule
ALBERT & $905.92\pm393.80$ & $105.22\pm61.57$ & $4252.48\pm1858.30$ & $843.58\pm369.91$ & $1083.81\pm963.98$ \\
BART & $913.54\pm399.62$ & $107.65\pm63.82$ & $4280.64\pm1855.47$ & $839.50\pm367.58$ & $1086.20\pm960.00$ \\
BERT & $912.36\pm400.40$ & $105.19\pm62.44$ & $4272.54\pm1864.34$ & $839.52\pm367.35$ & $1076.37\pm958.85$ \\
DistilBERT & $908.67\pm397.01$ & $103.51\pm60.23$ & $4264.11\pm1867.86$ & $840.53\pm367.51$ & $1083.67\pm957.08$ \\
MiniLM & $892.69\pm393.47$ & $102.97\pm59.06$ & $4207.99\pm1837.58$ & $818.70\pm358.35$ & $1046.20\pm927.29$ \\
RoBERTa & $905.15\pm393.90$ & $105.04