# ECD-Eye POC: Fine-Tuning Preparation

This notebook converts ECD rankings into the JSONL format required for OpenAI fine-tuning.

In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
# NOTE(review): load_dotenv comes from python-dotenv and presumably reads a
# local .env file into the process environment — confirm which file is picked
# up. None of the cells visible in this notebook read an environment variable.
load_dotenv()

## 1. Load Data

First, let's load the ECD rankings and baseline taglines.

In [None]:
# Input locations, all relative to the shared data directory
DATA_DIR = Path("../data")
RANKINGS_FILE = DATA_DIR.joinpath("rankings.csv")
BASELINE_FILE = DATA_DIR.joinpath("baseline.csv")

# Read the rankings table and report its row count
rankings_df = pd.read_csv(RANKINGS_FILE)
n_rankings = len(rankings_df)
print(f"Loaded {n_rankings} rankings")

# Read the baseline taglines table and report its row count
baseline_df = pd.read_csv(BASELINE_FILE)
n_baseline = len(baseline_df)
print(f"Loaded {n_baseline} baseline taglines")

# Preview the rankings here; the baseline preview lives in the next cell
print("\nRankings:")
rankings_df.head(5)

In [None]:
# Preview the first five rows of the baseline taglines table
print("\nBaseline Taglines:")
baseline_df.head(5)

## 2. Analyze Rankings

Let's analyze the ECD rankings to see which taglines were preferred.

In [None]:
# Count how often each value (1-5) occurs in each rank_N column of rankings_df.
#
# The five value_counts() Series are aligned on the union of their indexes when
# the DataFrame is built, so a value that never occurs in some column shows up
# as NaN and silently turns the whole frame into floats. Fill those gaps with 0
# and cast back to int: the table then reads as plain counts and stays
# compatible with integer formatting (e.g. fmt="d" in the heatmap below).
rank_counts = (
    pd.DataFrame({
        f"Rank {n}": rankings_df[f"rank_{n}"].value_counts().sort_index()
        for n in range(1, 6)
    })
    .fillna(0)
    .astype(int)
)

rank_counts

In [None]:
# Draw the rank_counts table as an annotated heatmap on an explicit Axes.
# NOTE(review): seaborn places the DataFrame's columns ("Rank 1".."Rank 5")
# along x and its index along y, so the x label below sits on columns that are
# named as ranks — confirm which axis is the rank and which is the tagline
# position, then align either the labels or the column names.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(rank_counts, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_title("Distribution of Rankings")
ax.set_xlabel("Tagline Position")
ax.set_ylabel("Rank (1 = Best, 5 = Worst)")
plt.show()

## 3. Prepare Fine-Tuning Data

Now, let's prepare the fine-tuning data in the format required by OpenAI.

In [None]:
# Prepare fine-tuning data
finetune_data = []

for _, row in rankings_df.iterrows():
    brief_id = row["brief_id"]
    brief = row["brief"]
    
    # Get the taglines in order of ranking
    baseline_row = baseline_df[baseline_df["brief_id"] == brief_id].iloc[0]
    taglines = [
        baseline_row["tagline_1"],
        baseline_row["tagline_2"],
        baseline_row["tagline_3"],
        baseline_row["tagline_4"],
        baseline_row["tagline_5"]
    ]
    
    # Get the rankings (1 to 5, where 1 is best)
    rankings = [
        row["rank_1"],
        row["rank_2"],
        row["rank_3"],
        row["rank_4"],
        row["rank_5"]
    ]
    
    # Create a mapping of tagline index to rank
    tagline_ranks = {i: rank for i, rank in enumerate(rankings)}
    
    # Sort taglines by rank
    sorted_taglines = [taglines[i] for i in sorted(tagline_ranks, key=tagline_ranks.get)]
    
    # Get the top-ranked tagline
    best_tagline = sorted_taglines[0]
    
    # Create fine-tuning example
    finetune_example = {
        "messages": [
            {
                "role": "system",
                "content": "You are a punchy award-winning copywriter."
            },
            {
                "role": "user",
                "content": f"Write a punchy tagline (≤7 words) for: {brief}"
            }
        ],
        "response": best_tagline
    }
    
    finetune_data.append(finetune_example)

# Display the first few examples
finetune_data[:3]

## 4. Save Fine-Tuning Data

Finally, let's save the fine-tuning data to a JSONL file.

In [None]:
# Save fine-tuning data to JSONL file
finetune_file = DATA_DIR / "fine_tune.jsonl"
with open(finetune_file, "w") as f:
    for example in finetune_data:
        f.write(json.dumps(example) + "\n")

print(f"Fine-tuning data saved to {finetune_file}")
print(f"Number of examples: {len(finetune_data)}")

## 5. Verify Fine-Tuning Data

Let's verify that the fine-tuning data is in the correct format.

In [None]:
# Read the JSONL file back in
with open(finetune_file, "r") as f:
    lines = f.readlines()

# Parse each line as JSON
examples = [json.loads(line) for line in lines]

# Display the first example
examples[0]

In [None]:
# Check that all examples have the required fields
for i, example in enumerate(examples):
    if "messages" not in example or "response" not in example:
        print(f"Example {i} is missing required fields")
    if len(example["messages"]) != 2:
        print(f"Example {i} has {len(example['messages'])} messages instead of 2")
    if example["messages"][0]["role"] != "system" or example["messages"][1]["role"] != "user":
        print(f"Example {i} has incorrect message roles")

print(f"All {len(examples)} examples are in the correct format")

## 6. Next Steps

Now that we have prepared the fine-tuning data, we can submit a fine-tuning job to OpenAI using the `submit_finetune.py` script.

```bash
python ../scripts/submit_finetune.py
```

This will submit a fine-tuning job to OpenAI and save the model ID to `data/model_id.txt`.