# OpenAI Supervised Fine-Tuning

This recipe allows TensorZero users to fine-tune OpenAI models using their own data.
Since TensorZero automatically logs all inferences and feedback, it is straightforward to fine-tune a model on your own historical inferences with any prompt you want.

To use this recipe, you need to set the following parameters:


In [None]:
from pathlib import Path

# Note: you should also set the CLICKHOUSE_URL environment variable to your ClickHouse URL
# Name of the metric (a key under `metrics` in the config) whose feedback selects the examples
METRIC_NAME = "haiku_score"
# Name of the function (a key under `functions` in the config) whose inferences are used
FUNCTION_NAME = "write_haiku"
# The name of the variant to use to grab the templates used for fine-tuning
TEMPLATE_VARIANT_NAME = "initial_prompt_gpt4o_mini"
# Path to the TensorZero config file; template paths are resolved relative to its directory
CONFIG_PATH = Path("../../examples/haiku_hidden_preferences/config/tensorzero.toml")
# Only relevant if the metric is a float metric
# (examples are kept when their value is above/below this, depending on the optimize direction)
FLOAT_METRIC_THRESHOLD = 0.5
# Fraction of the data to use for validation
VAL_FRACTION = 0.2
# Max number of samples to use for fine-tuning
MAX_SAMPLES = 100_000
# The name of the model to fine-tune
# (supported models: https://platform.openai.com/docs/guides/fine-tuning)
MODEL_NAME = "gpt-4o-mini-2024-07-18"

In [None]:
import json
import os
import tempfile
import time
from typing import Any, Dict, List

import openai
import pandas as pd
import toml
from clickhouse_driver import Client
from IPython.display import clear_output
from minijinja import Environment

In [None]:
# Read the TensorZero configuration file and parse it into a plain dict
config = toml.loads(CONFIG_PATH.read_text())

In [None]:
# Metrics are always float- or boolean-valued (we handle "comment" and "demonstration" specifically)
# Look up the config entry for the chosen metric; its "type" and "optimize" fields are
# read later when building the feedback query.
metric = config["metrics"][METRIC_NAME]

In [None]:
# Locate the variant whose prompt templates will be reused for fine-tuning
variant = config["functions"][FUNCTION_NAME]["variants"][TEMPLATE_VARIANT_NAME]
# Template paths in the config are relative to the directory holding the config file
config_dir = CONFIG_PATH.parent

# Each template is optional: the path stays None when the variant does not declare it
system_template_path = None
if "system_template" in variant:
    system_template_path = config_dir / variant["system_template"]

user_template_path = None
if "user_template" in variant:
    user_template_path = config_dir / variant["user_template"]

assistant_template_path = None
if "assistant_template" in variant:
    assistant_template_path = config_dir / variant["assistant_template"]

In [None]:
# Show the selected variant's configuration as the cell output for a quick sanity check
variant

In [None]:
# Map each role to its template file (None when the variant has no template for that role)
template_paths = {
    "system": system_template_path,
    "user": user_template_path,
    "assistant": assistant_template_path,
}
# Load the source text of every template that exists, keyed by role
templates = {
    role: template_path.read_text()
    for role, template_path in template_paths.items()
    if template_path
}
# The MiniJinja environment renders templates by role name
env = Environment(templates=templates)

In [None]:
# Connect to ClickHouse using the URL from the CLICKHOUSE_URL environment variable
# (raises KeyError if the variable is not set)
clickhouse_client = Client.from_url(os.environ["CLICKHOUSE_URL"])

In [None]:
# Pick the feedback table that stores values for this metric's type
metric_type = metric["type"]
if metric_type == "float":
    metric_table_name = "FloatMetricFeedback"
elif metric_type == "boolean":
    metric_table_name = "BooleanMetricFeedback"
else:
    # Only float and boolean metrics can be thresholded into good/bad examples
    raise ValueError(f"Unsupported metric type: {metric['type']}")

In [None]:
# Query the inferences and feedback from the database and join them on the inference id
# We do a bit of conditional logic to handle float and boolean metrics
threshold = FLOAT_METRIC_THRESHOLD if metric["type"] == "float" else 0.5
optimize_direction = metric["optimize"]

# A table name is an identifier, not a value: binding it as a query parameter makes the
# driver render it as a quoted string literal. It is therefore interpolated into the SQL
# text directly, after checking it against the only two tables this recipe supports.
if metric_table_name not in ("FloatMetricFeedback", "BooleanMetricFeedback"):
    raise ValueError(f"Unexpected feedback table name: {metric_table_name}")

# Restrict the feedback rows to METRIC_NAME; otherwise feedback recorded for any other
# metric of the same type would also be used to select examples.
query = f"""SELECT
    i.variant_name,
    i.input,
    i.output,
    f.value
FROM
    Inference i
JOIN
    {metric_table_name} f ON i.id = f.target_id
WHERE
    i.function_name = %(function_name)s
    AND f.metric_name = %(metric_name)s
    AND (
        (%(optimize_direction)s = 'max' AND f.value > %(threshold)s)
        OR (%(optimize_direction)s = 'min' AND f.value < %(threshold)s)
    )
LIMIT %(max_samples)s"""

filtered_df = clickhouse_client.query_dataframe(
    query,
    {
        "function_name": FUNCTION_NAME,
        "metric_name": METRIC_NAME,
        "optimize_direction": optimize_direction,
        "threshold": threshold,
        "max_samples": MAX_SAMPLES,
    },
)

In [None]:
def render_message(content: List[Dict[str, Any]], role: str) -> str:
    """Render the content of a single input message to plain text.

    Args:
        content: The message's content blocks. Exactly one block of type
            "text" is supported; its "value" is either a ready-made string
            or a mapping of template arguments.
        role: The role of the message, either "user" or "assistant". It also
            selects which template is rendered when "value" is not a string.

    Returns:
        The string value as-is, or the role's template rendered with the
        value's entries as keyword arguments.

    Raises:
        ValueError: If the role is unsupported, if there is not exactly one
            content block, or if the block is not of type "text".
    """
    # Raise instead of assert: assertions are stripped when Python runs with -O
    if role not in ("user", "assistant"):
        raise ValueError(f"Unsupported message role: {role!r}")
    if len(content) != 1:
        raise ValueError(f"Message must have exactly one content block: {content}")
    block = content[0]
    if block["type"] != "text":
        raise ValueError(f"Content block must be of type text: {content}")
    value = block["value"]
    if isinstance(value, str):
        # Already plain text, no template needed
        return value
    # Structured input: fill in the template registered under this role
    return env.render_template(role, **value)

In [None]:
def example_to_openai_messages(example) -> Dict[str, List[Dict[str, str]]]:
    """Convert one inference row into an OpenAI chat fine-tuning example.

    Args:
        example: A row (or mapping) with JSON-encoded "input" and "output"
            fields.

    Returns:
        A dict of the form {"messages": [...]} holding the optional system
        message, the rendered input messages, and the output as the final
        assistant message.

    Raises:
        ValueError: If the output does not consist of exactly one text
            content block (or if an input message cannot be rendered).
    """
    function_input = json.loads(example["input"])
    # "system" may be missing or JSON null; both mean there is no system data
    system = function_input.get("system") or {}
    rendered_messages = []
    # Add the system message to the rendered messages
    # If there is data passed in or a system template there must be a system message
    if system_template_path:
        system_message = env.render_template("system", **system)
        rendered_messages.append({"role": "system", "content": system_message})
    elif system:
        # No template: the system data is used verbatim as the message content
        rendered_messages.append({"role": "system", "content": system})
    # Add the input messages to the rendered messages
    for message in function_input["messages"]:
        rendered_message = render_message(message["content"], message["role"])
        rendered_messages.append({"role": message["role"], "content": rendered_message})
    # Add the output to the messages
    output = json.loads(example["output"])
    if len(output) != 1:
        raise ValueError(f"Output {output} has a not-one number of content blocks.")
    if output[0]["type"] != "text":
        raise ValueError(f"Output {output} has a not-text content block.")
    rendered_messages.append({"role": "assistant", "content": output[0]["text"]})
    return {"messages": rendered_messages}

In [None]:
# Convert every row into an OpenAI chat fine-tuning example and store it in a new column
filtered_df["openai_messages"] = filtered_df.apply(example_to_openai_messages, axis=1)

In [None]:
# Hold out part of the data as a validation set for fine-tuning
# Shuffle all rows with a fixed seed so the split is reproducible
shuffled_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)

# The first (1 - VAL_FRACTION) share of the shuffled rows is used for training
train_fraction = 1 - VAL_FRACTION
split_index = int(len(shuffled_df) * train_fraction)

# Everything before the split index trains, everything from it onwards validates
train_df, val_df = shuffled_df.iloc[:split_index], shuffled_df.iloc[split_index:]

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

In [None]:
def upload_openai_training_data(df: pd.DataFrame, openai_client: openai.OpenAI) -> str:
    """Write the examples in `df` to a JSONL file and upload it to OpenAI.

    Args:
        df: DataFrame whose "openai_messages" column holds one
            JSON-serializable fine-tuning example per row.
        openai_client: Client used to upload the file.

    Returns:
        The id of the uploaded file object.
    """
    # delete=False lets us close the file before reopening it for upload
    # (required on Windows); we remove it ourselves in the finally clause.
    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
    temp_file_path = temp_file.name
    try:
        # Write one JSON document per line; leaving the block flushes and closes the file
        with temp_file:
            for item in df["openai_messages"]:
                json.dump(item, temp_file)
                temp_file.write("\n")

        # Upload the file to OpenAI
        with open(temp_file_path, "rb") as file:
            file_object = openai_client.files.create(file=file, purpose="fine-tune")
    finally:
        # Do not leave the training data behind in the temp directory
        os.remove(temp_file_path)

    return file_object.id

In [None]:
# Create the OpenAI client (credentials come from its default configuration)
openai_client = openai.OpenAI()
# Upload the training split first, then the validation split, keeping each file id
train_file_object_id, val_file_object_id = [
    upload_openai_training_data(split_df, openai_client)
    for split_df in (train_df, val_df)
]

In [None]:
# Start the fine-tuning job for MODEL_NAME on the uploaded training and validation files
ft_job = openai_client.fine_tuning.jobs.create(
    training_file=train_file_object_id,
    validation_file=val_file_object_id,
    model=MODEL_NAME,
)

In [None]:
# Poll the fine-tuning job until it reaches a terminal status (this will take a while)
terminal_statuses = ("succeeded", "failed", "cancelled")
job_finished = False
while not job_finished:
    # Replace the previous status printout instead of appending to it
    clear_output(wait=True)
    job_status = openai_client.fine_tuning.jobs.retrieve(ft_job.id)
    print(f"job_status: {job_status}")
    job_finished = job_status.status in terminal_statuses
    if not job_finished:
        # Wait a little before asking again
        time.sleep(10)

## Add the following block to your config file to include the fine-tuned model in your gateway:


In [None]:
fine_tuned_model = job_status.fine_tuned_model
# The polling loop also stops on "failed" and "cancelled"; in that case there is no
# fine-tuned model, so fail loudly instead of printing a config with an empty model name.
if not fine_tuned_model:
    raise RuntimeError(
        f"Fine-tuning job {ft_job.id} ended with status {job_status.status!r} "
        "and did not produce a fine-tuned model."
    )

# Model entry for the gateway config: the fine-tuned model, served through OpenAI
model_config = {
    "models": {
        fine_tuned_model: {
            "routing": ["openai"],
            "providers": {"openai": {"type": "openai", "model_name": fine_tuned_model}},
        }
    }
}

# Wrap the TOML in a fenced block so it can be copied straight into the config file
model_instructions = f"```toml\n{toml.dumps(model_config)}\n```"

print(model_instructions)

## Add the following block to your config file to include the fine-tuned model in your function:


In [None]:
variant_name = f"haiku_{fine_tuned_model}"
# Settings of the new variant; weight 0 means it is configured but receives no traffic yet
new_variant = {
    "type": "chat_completion",
    "weight": 0,
    "model": fine_tuned_model,
}

# Reuse whichever templates the source variant declares so the prompts match the training data
system_template = variant.get("system_template")
user_template = variant.get("user_template")
assistant_template = variant.get("assistant_template")
declared_templates = (
    ("system_template", system_template),
    ("user_template", user_template),
    ("assistant_template", assistant_template),
)
for template_key, template_value in declared_templates:
    if template_value:
        new_variant[template_key] = template_value

# Nest the variant under its function so the printed TOML can be pasted into the config
variant_config = {
    "functions": {FUNCTION_NAME: {"variants": {variant_name: new_variant}}}
}

variant_instructions = f"```toml\n{toml.dumps(variant_config)}\n```"
print(variant_instructions)
print("")

You can change the weight to enable a gradual rollout of the new model.
You might also add other parameters (max_tokens, temperature, etc.) to the variant TOML block.
