In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv("merge.tsv", sep="\t")

# Define LLM pairs to compare
LLM_PAIRS = [
    ('Gemma', 'Gemma-large'),
    ('Llama', 'Llama-large'),
    ('Mistral', 'Mistral-large'),
    ('Qwen', 'Qwen-large')
]

# -------------------------------
# Analysis for all metrics
# -------------------------------
TARGET_METRICS = ["L_rate", "L_nonword", "C_rate", "C_outlier", 
                 "R_rate", "R_duplicate", "D_rate", "A_ir-tw", 
                 "A_missing-theme"]

for TARGET_METRIC in TARGET_METRICS:
    print(f"\n{'='*40}\nAnalyzing {TARGET_METRIC}\n{'='*40}")
    
    # Filter and clean data
    df_metric = df[df["LLM-based Metric"] == TARGET_METRIC].copy()
    df_metric = df_metric.drop(columns=["LLM-based Metric"])

    # Melt to long format
    melted = df_metric.melt(
        id_vars=["LLM", "K"],
        var_name="Method_Dataset",
        value_name="Score"
    )
    melted["Score"] = pd.to_numeric(melted["Score"], errors="coerce")

    # Pivot to wide format
    pivoted = melted.pivot_table(
        index=["Method_Dataset", "K"],
        columns="LLM",
        values="Score",
        aggfunc="mean"
    ).reset_index()

    # Clean columns
    pivoted.columns.name = None
    llm_columns = [col for col in pivoted.columns 
                  if col not in ["Method_Dataset", "K"]]
    pivoted_llms = pivoted[llm_columns].dropna()

    # Calculate correlation matrix
    corr_matrix = pivoted_llms.corr(method="pearson")

    # Extract specific base-large pairs
    results = []
    for base, large in LLM_PAIRS:
        if base in corr_matrix.columns and large in corr_matrix.columns:
            corr = corr_matrix.loc[base, large]
            results.append({
                'Base_LLM': base,
                'Large_LLM': large,
                'Correlation': round(corr, 4)
            })

    # Create and display results
    result_df = pd.DataFrame(results)
    if not result_df.empty:
        print(f"Base vs Large LLM Correlations for {TARGET_METRIC}:")
        print(result_df.to_string(index=False))
    else:
        print("No valid LLM pairs found for this metric")


Analyzing L_rate
Base vs Large LLM Correlations for L_rate:
Base_LLM     Large_LLM  Correlation
   Gemma   Gemma-large       0.9708
   Llama   Llama-large       0.7235
 Mistral Mistral-large       0.9284
    Qwen    Qwen-large       0.9198

Analyzing L_nonword
Base vs Large LLM Correlations for L_nonword:
Base_LLM     Large_LLM  Correlation
   Gemma   Gemma-large       0.0578
   Llama   Llama-large       0.9120
 Mistral Mistral-large       0.8599
    Qwen    Qwen-large       0.8456

Analyzing C_rate
Base vs Large LLM Correlations for C_rate:
Base_LLM     Large_LLM  Correlation
   Gemma   Gemma-large       0.5385
   Llama   Llama-large       0.9666
 Mistral Mistral-large       0.9842
    Qwen    Qwen-large       0.9958

Analyzing C_outlier
Base vs Large LLM Correlations for C_outlier:
Base_LLM     Large_LLM  Correlation
   Gemma   Gemma-large       0.7897
   Llama   Llama-large       0.5127
 Mistral Mistral-large       0.7991
    Qwen    Qwen-large       0.7732

Analyzing R_rate
Base v

In [1]:
import pandas as pd

# Load data
df = pd.read_csv("merge.tsv", sep="\t")

# Define datasets and topic models
datasets = ["20NG", "AGRIS", "TWEETS_NYR"]
topic_models = ["LDA", "ProdLDA", "CombineTM", "BERTopic"]

# Melt to long format
melted = df.melt(
    id_vars=["LLM-based Metric", "LLM", "K"],
    var_name="Dataset_TopicModel",
    value_name="Value"
)

# Split Dataset_TopicModel into components
melted[["TopicModel", "Dataset"]] = melted["Dataset_TopicModel"].str.split("-", n=1, expand=True)

# Filter valid datasets and average across all datasets, topic models, and K values
average_df = (
    melted.groupby(["LLM-based Metric", "LLM"])
    .Value.mean()
    .reset_index()
    .pivot(index="LLM-based Metric", columns="LLM", values="Value")
    .round(2)
)

# Rename metrics for clarity
metric_names = {
    "L_rate": "Lexical Validity Rate (↑)",
    "L_nonword": "Non-word Errors (↓)",
    "C_rate": "Coherence Rate (↑)",
    "C_outlier": "Outliers (↓)",
    "R_rate": "Repetitiveness Rate (↑)",
    "R_duplicate": "Duplicates (↓)",
    "D_rate": "Diversity Rate (↑)",
    "A_ir-tw": "Irrelevant Terms (↓)",
    "A_missing-theme": "Missing Themes (↓)",
}
average_df.rename(index=metric_names, inplace=True)

# Reorder rows logically
row_order = [
    "Lexical Validity Rate (↑)", "Non-word Errors (↓)",
    "Coherence Rate (↑)", "Outliers (↓)",
    "Repetitiveness Rate (↑)", "Duplicates (↓)",
    "Diversity Rate (↑)",
    "Irrelevant Terms (↓)", "Missing Themes (↓)"
]
average_df = average_df.loc[row_order]

latex_table = average_df.to_latex(
    caption="Average Performance Across All Datasets (20NG, AGRIS, TWEETS\\_NYR) and K Values (50, 100)",
    label="tab:llm_averages",
    position="ht",
    column_format="l" + "c" * len(average_df.columns),
    escape=False
)

# Add LaTeX formatting
latex_table = latex_table.replace(
    "LLM-based Metric", 
    "\\textbf{Metric} & \\textbf{" + "} & \\textbf{".join(average_df.columns) + "}"
).replace("_", "\\_")

print(latex_table)

\begin{table}[ht]
\caption{Average Performance Across All Datasets (20NG, AGRIS, TWEETS\\_NYR) and K Values (50, 100)}
\label{tab:llm\_averages}
\begin{tabular}{lccccccccc}
\toprule
LLM & Gemma & Gemma-large & Llama & Llama-large & Mistral & Mistral-large & Qwen & Qwen-large & automated \\
\textbf{Metric} & \textbf{Gemma} & \textbf{Gemma-large} & \textbf{Llama} & \textbf{Llama-large} & \textbf{Mistral} & \textbf{Mistral-large} & \textbf{Qwen} & \textbf{Qwen-large} & \textbf{automated} &  &  &  &  &  &  &  &  &  \\
\midrule
Lexical Validity Rate (↑) & 2.780000 & 2.770000 & 2.300000 & 2.840000 & 2.800000 & 2.740000 & 2.770000 & 2.860000 & NaN \\
Non-word Errors (↓) & 0.260000 & 0.210000 & 1.220000 & 1.150000 & 0.170000 & 0.720000 & 1.680000 & 0.540000 & NaN \\
Coherence Rate (↑) & 2.260000 & 2.330000 & 2.600000 & 2.430000 & 2.560000 & 2.340000 & 2.000000 & 2.020000 & NaN \\
Outliers (↓) & 1.690000 & 1.710000 & 1.670000 & 2.190000 & 1.010000 & 1.630000 & 2.210000 & 2.900000 & NaN \\
Repet