# In [25]:
import csv
import random
import numpy as np
import os


def kendall_tau(gold_idx, pred_idx):
    """Return Kendall's tau (tau-a) between two equally long rank sequences.

    Every pair of positions ``i < j`` is classified as concordant (both
    sequences order the pair the same way), discordant (they order it in
    opposite ways), or tied (at least one sequence has equal values at the
    two positions). Tied pairs count toward neither side.

    Args:
        gold_idx: Reference ranks, one per item.
        pred_idx: Predicted ranks, aligned position-by-position with
            ``gold_idx``.

    Returns:
        ``(concordant - discordant) / (n * (n - 1) / 2)`` as a float in
        ``[-1, 1]``. Returns ``0.0`` when there are fewer than two items,
        because no pair exists to compare.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    n = len(gold_idx)
    if len(pred_idx) != n:
        raise ValueError(
            f"gold_idx and pred_idx must have the same length "
            f"(got {n} and {len(pred_idx)})"
        )
    if n < 2:
        # No pairs to compare; avoid dividing by zero below.
        return 0.0

    concordant = 0
    discordant = 0
    pairs = list(zip(gold_idx, pred_idx))
    for i, (gold_a, pred_a) in enumerate(pairs):
        for gold_b, pred_b in pairs[i + 1:]:
            # Sign of each ordering: -1, 0 (tie) or +1.
            gold_order = (gold_a > gold_b) - (gold_a < gold_b)
            pred_order = (pred_a > pred_b) - (pred_a < pred_b)
            agreement = gold_order * pred_order
            if agreement > 0:
                concordant += 1
            elif agreement < 0:
                discordant += 1
            # agreement == 0: a tie, neither concordant nor discordant.

    total_pairs = n * (n - 1) // 2
    return (concordant - discordant) / total_pairs

def normalize(s):
    """Trim *s* and collapse each internal run of whitespace to one space."""
    words = s.strip().split()
    return " ".join(words)

def get_sentence_list(story_string):
    """Split a story on the literal ``' | '`` separator into normalized sentences.

    Each piece is passed through ``normalize`` before being returned, and the
    pieces keep their original order.
    """
    raw_sentences = story_string.split(' | ')
    return list(map(normalize, raw_sentences))

def calculate_metrics(gold_story_list, pred_story_list):
    """Score a predicted sentence order against the gold order.

    Args:
        gold_story_list: Sentences in the correct order.
        pred_story_list: The same sentences in the predicted order.
            Sentences must be hashable (e.g. strings).

    Returns:
        A ``(pmr, tau)`` tuple. ``pmr`` is 1 when the prediction equals the
        gold list exactly and 0 otherwise. ``tau`` is the value returned by
        ``kendall_tau`` for the gold positions of the predicted sentences.
        ``tau`` is 0 when the prediction is not a rearrangement of the gold
        sentences (different length, an unknown sentence, or a sentence
        repeated more often than in gold). Stories with at most one sentence
        yield ``(0, 0)``.
    """
    N = len(gold_story_list)
    if N <= 1:
        return 0, 0

    pmr = 1 if pred_story_list == gold_story_list else 0

    # A prediction of a different length cannot be aligned with gold.
    if len(pred_story_list) != N:
        return pmr, 0

    # Map each sentence to all of its gold positions so that repeated
    # sentences get distinct indices instead of all collapsing onto the
    # first occurrence (which is what list.index would do).
    gold_positions = {}
    for position, sentence in enumerate(gold_story_list):
        gold_positions.setdefault(sentence, []).append(position)

    next_unused = {}  # per sentence: how many gold positions are consumed
    pred_idx = []
    for sentence in pred_story_list:
        candidates = gold_positions.get(sentence)
        consumed = next_unused.get(sentence, 0)
        if candidates is None or consumed >= len(candidates):
            # Unknown sentence, or more copies than gold contains.
            return pmr, 0
        pred_idx.append(candidates[consumed])
        next_unused[sentence] = consumed + 1

    tau = kendall_tau(list(range(N)), pred_idx)
    return pmr, tau


def evaluate_baseline(csv_path, baseline_type):
    """Compute mean PMR and mean Kendall tau of a baseline over a CSV file.

    The CSV must have ``gold`` and ``shuffled`` columns, each holding a story
    whose sentences are joined with ``' | '``. Rows whose gold story has at
    most one sentence are skipped.

    Args:
        csv_path: Path to the CSV file.
        baseline_type: ``'identity'`` predicts the shuffled order unchanged;
            ``'random'`` predicts a random permutation of the shuffled
            sentences (drawn from the module-level ``random`` generator).

    Returns:
        ``(mean_pmr, mean_tau)`` over the evaluated rows, or ``(0, 0)`` when
        the file does not exist or no row was evaluated.

    Raises:
        ValueError: If ``baseline_type`` is not one of the supported names.
    """
    # Reject unknown baselines up front; otherwise the prediction variable
    # would be unbound (or stale from a previous row) inside the loop.
    if baseline_type not in ('identity', 'random'):
        raise ValueError(
            f"Unknown baseline_type {baseline_type!r}; "
            "expected 'identity' or 'random'"
        )

    all_pmr = []
    all_tau = []

    try:
        # newline='' is what the csv module requires so that quoted fields
        # containing line breaks are parsed correctly.
        with open(csv_path, encoding='utf-8', newline='') as f:
            for row in csv.DictReader(f):
                gold_sentences = get_sentence_list(row['gold'])
                shuffled_sentences = get_sentence_list(row['shuffled'])

                if len(gold_sentences) <= 1:
                    continue

                if baseline_type == 'identity':
                    pred_sentences = shuffled_sentences
                else:
                    # Shuffle a copy of the sentences themselves so every
                    # shuffled sentence is kept even if the row's gold and
                    # shuffled sentence counts differ.
                    pred_sentences = list(shuffled_sentences)
                    random.shuffle(pred_sentences)

                pmr, tau = calculate_metrics(gold_sentences, pred_sentences)
                all_pmr.append(pmr)
                all_tau.append(tau)
    except FileNotFoundError:
        # Deliberate best-effort: a missing file scores as zero.
        return 0, 0

    if not all_pmr:
        return 0, 0
    return np.mean(all_pmr), np.mean(all_tau)


def main():
    """Print PMR and Kendall tau for the identity and random baselines.

    The identity baseline is evaluated once. The random baseline is evaluated
    NUM_RUNS times and the per-run means are averaged. If the data file is
    missing, an error message is printed and nothing is evaluated.
    """
    CANONICAL_DATA_PATH = "data/final_outputs/train_gpt5_nano_reordered.csv"
    NUM_RUNS = 20

    # Guard clause: bail out early when the dataset is not available.
    if not os.path.exists(CANONICAL_DATA_PATH):
        print("ERROR: Canonical data file not found. Ensure it is at the expected path.")
        return

    identity_pmr, identity_tau = evaluate_baseline(CANONICAL_DATA_PATH, 'identity')

    # Each run yields one (pmr, tau) pair for a fresh random permutation.
    random_runs = [
        evaluate_baseline(CANONICAL_DATA_PATH, 'random')
        for _ in range(NUM_RUNS)
    ]
    random_pmr = np.mean([run_pmr for run_pmr, _ in random_runs])
    random_tau = np.mean([run_tau for _, run_tau in random_runs])

    print(f"Identity Baseline PMR: {identity_pmr:.4f}")
    print(f"Identity Baseline Tau: {identity_tau:.4f}")
    print(f"Random Baseline PMR: {random_pmr:.4f}")
    print(f"Random Baseline Tau: {random_tau:.4f}")


# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()

# Output:
# Identity Baseline PMR: 0.0094
# Identity Baseline Tau: -0.0019
# Random Baseline PMR: 0.0084
# Random Baseline Tau: 0.0007
