In [124]:
import re
import os
import json
import numpy as np
import matplotlib.pyplot as plt

In [125]:
def parse_log_file(filepath):
    sections = {"prompt": "", "response": "", "scores": []}
    current_section = None
    ground_truth_line = None

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            if line == "=== Generated Prompt ===":
                current_section = "prompt"
                continue
            elif line == "=== GPT Response ===":
                current_section = "response"
                continue
            elif line == "=== Collecting and Comparing Assertions ===":
                current_section = "scores"
                continue

            if current_section == "prompt":
                sections["prompt"] += line + "\n"
            elif current_section == "response":
                sections["response"] += line + "\n"
            elif current_section == "scores":
                if line.startswith("=== Ground Oracle Truth:"):
                    ground_truth_line = (
                        line.replace("=== Ground Oracle Truth:", "")
                        .strip()
                        .replace(" ===", "")
                    )
                elif re.match(r"^\d+\.\d+\s*-\s*", line):
                    match = re.match(r"^(\d+\.\d+)", line)
                    if match:
                        sections["scores"].append(float(match.group(1)))

    sections["ground_truth"] = ground_truth_line
    return sections

In [126]:
def calc_log_data(log_folder):
    log_data = []
    for filename in os.listdir(log_folder):
        if filename.endswith(".log"):
            filepath = os.path.join(log_folder, filename)
            parsed_data = parse_log_file(filepath)
            if parsed_data["scores"]:
                log_data.append(
                    {
                        "file": filename,
                        "ground_truth": parsed_data.get("ground_truth"),
                        "top_score": parsed_data["scores"][0]
                        if parsed_data["scores"]
                        else None,
                        "avg_score": np.mean(parsed_data["scores"])
                        if parsed_data["scores"]
                        else None,
                        "full_log": parsed_data,
                    }
                )
    return log_data

In [127]:
def save_logs_as_json(log_data, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    for entry in log_data:
        output_path = os.path.join(output_folder, f"{entry['file']}.json")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(entry["full_log"], f, indent=4)

In [128]:
def plot_log_data(log_data, output_path):
    all_scores = [entry["top_score"] for entry in log_data]
    overall_avg = np.mean(all_scores)
    overall_std = np.std(all_scores)

    plt.figure(figsize=(10, 6))
    plt.hist(all_scores, bins=20, color="skyblue", edgecolor="black")
    plt.title(
        f"Distribution of Similarity Scores\n(Overall Avg: {overall_avg:.3f}, Std: {overall_std:.3f})"
    )
    plt.xlabel("Similarity Score")
    plt.ylabel("Number of Log Files")
    plt.savefig(output_path)
    plt.close()
    return overall_avg, overall_std

In [129]:
def print_stats(log_data):
    print(f"Processed {len(log_data)} log files")
    for entry in log_data:
        print(f"File: {entry['file']}")
        print(f"  Ground Oracle Truth: {entry['ground_truth']}")
        print(f"  Average Similarity Score: {entry['avg_score']:.3f}")
        print(f"  Top Score: {entry['top_score']:.3f}")
        print("------")

In [130]:
models = ["gpt-35-turbo", "gpt-4o-mini"]
temps = [0.0, 0.2, 0.7]
log_folders = [f"./assets/logs/{model}/{temp}" for temp in temps for model in models]

print(log_folders)

for log_folder in log_folders:
    if not os.path.exists(log_folder):
        print(f"Log folder does not exist: {log_folder}")
        continue

    log_data = calc_log_data(log_folder)

    output_json_folder = (
        f"./assets/processed_logs/{log_folder.replace('./assets/logs/', '')}"
    )
    save_logs_as_json(log_data, output_json_folder)

    plot_output_path = f"../results/{log_folder.replace('./logs/', '').replace('/', '_')}_distribution.png"
    overall_avg, overall_std = plot_log_data(log_data, plot_output_path)

    print(f"Processed logs from {log_folder}")
    print(f"  Overall Average Similarity Score: {overall_avg:.3f}")
    print(f"  Overall Standard Deviation: {overall_std:.3f}")
    print_stats(log_data)

['./assets/logs/gpt-35-turbo/0.0', './assets/logs/gpt-4o-mini/0.0', './assets/logs/gpt-35-turbo/0.2', './assets/logs/gpt-4o-mini/0.2', './assets/logs/gpt-35-turbo/0.7', './assets/logs/gpt-4o-mini/0.7']
Processed logs from ./assets/logs/gpt-35-turbo/0.0
  Overall Average Similarity Score: 0.833
  Overall Standard Deviation: 0.195
Processed 441 log files
File: 20250607_173544_testGetContext.log
  Ground Oracle Truth: assertNotNull(actual);
  Average Similarity Score: 0.461
  Top Score: 1.000
------
File: 20250607_180107_testFriendsOfFriends.log
  Ground Oracle Truth: assertEquals(1,qResult.size());
  Average Similarity Score: 0.538
  Top Score: 0.984
------
File: 20250607_180545_testGetProperty.log
  Ground Oracle Truth: assertEquals("STR",create(env).getProperty("STR"));
  Average Similarity Score: 0.553
  Top Score: 0.844
------
File: 20250607_174857_testGetPolledObjects.log
  Ground Oracle Truth: assertEquals(this.polledObjects,this.poller.getPolledObjects());
  Average Similarity Sco