# In [1]:
import csv
import logging
import os

# CSV read by main(); its "reordered_story" column supplies one order string
# per row.
ORDERS_PATH = "data/processed/train_reordered_pairs_gpt5_nano_numeric.csv"
# CSV read by main(); the "story_id", "gold" and "shuffled" columns are used.
SHUFFLED_PATH = "data/processed/train_processed.csv"
# CSV written by main() with the columns
# story_id, gold, shuffled, model_reordered.
OUTPUT_PATH = "data/processed/train_gpt5_nano_reordered.csv"


def apply_order_to_shuffled(shuffled_story: str, order_str: str) -> str:
    """Rearrange the sentences of a shuffled story following an order string.

    ``shuffled_story`` is split on ``"|"`` and every piece is stripped.  Each
    decimal digit found in ``order_str`` is read as one 1-based sentence
    index; every other character (commas, spaces, brackets, ...) is ignored.
    Because each digit is its own index, only indices 1-9 can be expressed.

    Args:
        shuffled_story: Sentences separated by ``"|"``.
        order_str: Text containing the 1-based sentence indices as digits.

    Returns:
        The selected sentences joined with ``" | "``.  A digit that does not
        refer to an existing sentence (including ``0``) produces the
        placeholder ``[INDEX_OUT_OF_RANGE_<digit>]`` at its position.
    """
    sentences = [part.strip() for part in shuffled_story.split("|")]

    # str.isdigit() also accepts characters such as superscripts that int()
    # rejects with ValueError; str.isdecimal() only accepts characters that
    # int() can convert.
    digits = [ch for ch in order_str if ch.isdecimal()]

    if len(digits) != len(sentences):
        # Best effort: still build the output, but make the mismatch visible
        # instead of ignoring it.
        logging.getLogger(__name__).warning(
            "order %r has %d indices but the story has %d sentences",
            order_str,
            len(digits),
            len(sentences),
        )

    reordered = []
    for digit in digits:
        idx = int(digit) - 1
        if 0 <= idx < len(sentences):
            reordered.append(sentences[idx])
        else:
            reordered.append(f"[INDEX_OUT_OF_RANGE_{digit}]")

    return " | ".join(reordered)


def main():
    """Write a CSV pairing every shuffled story with its model reordering.

    Reads the order strings from the ``reordered_story`` column of
    ``ORDERS_PATH``, then walks ``SHUFFLED_PATH`` row by row.  A row's
    ``story_id`` is used as a 0-based position into the list of order
    strings; rows whose id has no matching order are skipped.  For every
    remaining row the order is applied to the ``shuffled`` text and a row
    ``story_id, gold, shuffled, model_reordered`` is written to
    ``OUTPUT_PATH`` (all fields quoted).
    """
    with open(ORDERS_PATH, "r", newline="", encoding="utf-8") as f_orders:
        orders = [
            row["reordered_story"].strip() for row in csv.DictReader(f_orders)
        ]

    skipped = 0

    with open(SHUFFLED_PATH, "r", newline="", encoding="utf-8") as f_in, \
         open(OUTPUT_PATH, "w", newline="", encoding="utf-8") as f_out:

        reader = csv.DictReader(f_in)
        writer = csv.writer(f_out, quoting=csv.QUOTE_ALL)

        writer.writerow(["story_id", "gold", "shuffled", "model_reordered"])

        for row in reader:
            story_id = int(row["story_id"])
            gold = row["gold"]
            shuffled = row["shuffled"]

            # Check both bounds: a negative id would otherwise wrap around
            # and silently pick an order from the end of the list.
            if not 0 <= story_id < len(orders):
                skipped += 1
                continue

            model_reordered = apply_order_to_shuffled(
                shuffled, orders[story_id]
            )

            writer.writerow([story_id, gold, shuffled, model_reordered])
            # Push each row to disk immediately.
            # NOTE(review): presumably so partial output survives an
            # interruption; a per-row fsync is slow on large inputs -- confirm
            # it is still needed.
            f_out.flush()
            os.fsync(f_out.fileno())

    if skipped:
        logging.getLogger(__name__).warning(
            "skipped %d row(s) whose story_id has no matching order", skipped
        )


# Run the conversion only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
