In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [93]:
import os

# Directory searched for the `*_metrics_log.csv` and `*_interpretation_log.csv`
# files loaded later in this notebook.
# Set the MUSEUM_LOGS_DIR environment variable to point the notebook at another
# location; when it is unset the original local path is used unchanged.
logs_dir_path = os.environ.get(
    "MUSEUM_LOGS_DIR", "/Users/azamkhan/Downloads/museum_metric_logs"
)

In [None]:
import pandas as pd
import glob

import os

# Load the log files.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the summary table reproducible from run to run.
log_files = sorted(glob.glob(os.path.join(logs_dir_path, "*_metrics_log.csv")))

if not log_files:
    raise ValueError("At least one log file is required.")

# Load all datasets, keyed by model name. The model name is the file name with
# the "_metrics_log.csv" suffix removed; os.path.basename is used instead of
# splitting on "/" so the key is correct whatever separator the OS uses.
datasets = {
    os.path.basename(file).replace("_metrics_log.csv", ""): pd.read_csv(file)
    for file in log_files
}



# Define a function to summarize metrics
def summarize_metrics(
    datasets,
    exclude_columns=("input", "llm_output", "num_total_output_tokens"),
):
    """Average every numeric metric column for each model.

    Parameters
    ----------
    datasets : dict
        Mapping of model name to the DataFrame loaded from that model's log.
        The frames are left unmodified, so the function can be called
        repeatedly on the same mapping.
    exclude_columns : iterable of str, optional
        Column names left out of the summary. Defaults to the three columns
        this function has always excluded. Names that are absent from a
        frame are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per model: a "Model" column followed by one ``avg_<column>``
        column for each remaining numeric column.
    """
    summary_results = []
    excluded = list(exclude_columns)

    for name, data in datasets.items():
        # drop() without inplace returns a new frame; the caller's DataFrame
        # keeps all of its columns. errors="ignore" tolerates logs that do not
        # contain every excluded column.
        metrics = data.drop(columns=excluded, errors="ignore")

        summary = {"Model": name}
        for column in metrics.columns:
            if pd.api.types.is_numeric_dtype(metrics[column]):
                summary[f"avg_{column}"] = metrics[column].mean()
        summary_results.append(summary)

    return pd.DataFrame(summary_results)


# Build the per-model averages table from the loaded metrics logs.
results_df = summarize_metrics(datasets)

# Print the heading followed by the table rendered without its index.
print("\nSummary Table:", results_df.to_string(index=False), sep="\n")


Summary Table:
 Model  avg_time_to_first_token  avg_total_time  avg_time_per_output_token  avg_throughput
gemini                 0.318182        0.680909                 117.285455       80.958182
  qwen                26.988000       36.577000                 103.038000        2.446000
 gemma               107.775000      131.928000                 332.512000        0.579000


In [97]:
# Write the metrics summary table to CSV, leaving out the DataFrame index.
results_df.to_csv(
    path_or_buf="/Users/azamkhan/columbia/deep_learning/museum-ai/evaluation_logs/analysis/metrics_summary.csv",
    index=False,
)

In [41]:
import pandas as pd
import glob

import os

# Load the interpretation log files.
# glob returns matches in arbitrary order, so sort them to keep the row order
# of the summary table reproducible from run to run.
log_files = sorted(
    glob.glob(os.path.join(logs_dir_path, "*_interpretation_log.csv"))
)

if not log_files:
    raise ValueError("At least one interpretation log file is required.")

# Load all datasets, keyed by model name. The model name is the file name with
# the "_interpretation_log.csv" suffix removed; os.path.basename is used
# instead of splitting on "/" so the key is correct on any OS.
datasets = {
    os.path.basename(file).replace("_interpretation_log.csv", ""): pd.read_csv(file)
    for file in log_files
}

# Ensure all datasets have the same columns
columns_set = [set(data.columns) for data in datasets.values()]
if not all(cols == columns_set[0] for cols in columns_set):
    raise ValueError(
        "The columns in the datasets do not match. Please verify the data."
    )

# Take the column order from the first log rather than from a set: set
# iteration order for strings can differ between kernel sessions, which would
# shuffle the columns of the summary table.
columns_to_analyze = list(next(iter(datasets.values())).columns)


# Define a function to summarize metrics
def summarize_interpretation_logs(datasets, columns):
    """Compute the mean and standard deviation of numeric columns per model.

    Parameters
    ----------
    datasets : dict
        Mapping of model name to that model's DataFrame.
    columns : iterable of str
        Column names to consider; non-numeric columns are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per model: a "Model" column followed by ``avg_<column>`` and
        ``std_<column>`` for each numeric column, in the order given.
    """
    rows = []

    for model_name, frame in datasets.items():
        # Keep only the requested columns whose dtype is numeric.
        numeric_columns = [
            col for col in columns if pd.api.types.is_numeric_dtype(frame[col])
        ]

        row = {"Model": model_name}
        for col in numeric_columns:
            series = frame[col]
            row.update(
                {f"avg_{col}": series.mean(), f"std_{col}": series.std()}
            )
        rows.append(row)

    return pd.DataFrame(rows)


# Build the per-model mean/std table from the loaded interpretation logs.
interpretation_results_df = summarize_interpretation_logs(
    datasets, columns_to_analyze
)

# Print the heading followed by the table rendered without its index.
print(
    "\nSummary Table:",
    interpretation_results_df.to_string(index=False),
    sep="\n",
)


Summary Table:
 Model  avg_response_time  std_response_time
gemini           0.693636           0.074870
 gemma          51.012000           8.290423
  qwen           9.994000           1.559068


In [None]:
# Write the interpretation summary table to CSV, leaving out the DataFrame index.
interpretation_results_df.to_csv(
    path_or_buf="/Users/azamkhan/columbia/deep_learning/museum-ai/evaluation_logs/analysis/interpretation_metrics_summary.csv",
    index=False,
)