In [2]:
import json
import tiktoken  # for token counting
import numpy as np
from collections import defaultdict
import os
from dotenv import load_dotenv
load_dotenv()

True

### Data Preparation and Analysis for GPT-4o Mini Fine-Tuning

In [2]:
# Data loading
data_path = "data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 1173
First example:
{'role': 'system', 'content': 'You are an assistant for question-answering tasks about amazon web service.'}
{'role': 'user', 'content': 'Q: What is Amazon EC2 Auto Scaling?'}
{'role': 'assistant', 'content': 'Amazon EC2 Auto Scaling is a fully managed service designed to launch or terminate Amazon EC2 instances automatically to help ensure you have the correct number of Amazon EC2 instances available to handle the load for your application. Amazon EC2 Auto Scaling helps you maintain application availability through fleet management for EC2 instances, which detects and replaces unhealthy instances, and by scaling your Amazon EC2 capacity up or down automatically according to conditions you define. You can use Amazon EC2 Auto Scaling to automatically increase the number of Amazon EC2 instances during demand spikes to maintain performance and decrease capacity during lulls to reduce costs.'}


### Format Validation

In [3]:
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
    messages = ex.get("messages", None)
    if messages is None:
        format_errors["missing_messages_list"] += 1
    else:
        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1
            if message.get("role") not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1
            if not isinstance(message.get("content", ''), str):
                format_errors["missing_content"] += 1
        if not any(message.get("role") == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


### Token Counting Utilities

In [4]:
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        num_tokens += len(encoding.encode(message['content']))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

### Data Warnings and Token Counts

In [5]:
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    system_present = any(m["role"] == "system" for m in messages)
    user_present = any(m["role"] == "user" for m in messages)
    n_missing_system += not system_present
    n_missing_user += not user_present
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_tokens_from_messages([m for m in messages if m["role"] == "assistant"]))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 36, 972
mean / median: 125.70332480818415, 109.0
p5 / p95: 49.0, 264.7999999999997

#### Distribution of num_assistant_tokens_per_example:
min / max: 4, 946
mean / median: 95.42540494458653, 78.0
p5 / p95: 18.0, 234.0


## Cost Estimation

In [6]:
# Assume each example's token count does not exceed the maximum context length for the model.
n_epochs = 3  # typically a good starting point
n_train_examples = len(dataset)
n_billing_tokens_in_dataset = sum(l for l in convo_lens)

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~147450 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~442350 tokens


### Uploading a Training File for Fine-Tuning

In [3]:
from openai import OpenAI

client = OpenAI()

file_upload_response = client.files.create(
    file=open("data.jsonl", "rb"),  # Make sure the file path and name are correct
    purpose="fine-tune"
)
print(file_upload_response)  # This will print the response from the server including the file ID

### Start Fine-Tuning Job

In [17]:
# Following code will start the finetuning job which you can check in the OpenAI Dashboard
fine_tuning_response = client.fine_tuning.jobs.create(
    training_file=file_upload_response.id,  # File id from the above response
    model=os.environ['OPENAI_FINETUNING_MODEL'],
    hyperparameters={'n_epochs':n_epochs},
    suffix=os.environ['OPENAI_FINETUNING_MODEL_SUFFIX']
)
print(fine_tuning_response)

FineTuningJob(id='ftjob-1QPUJpBQzVQbHQO7G9qu9Iml', created_at=1723569390, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-LUz8bhCnZYA1TqgxH6GUEo06', result_files=[], seed=2087838318, status='validating_files', trained_tokens=None, training_file='file-Mnvub7g7KEiJ5LSfOU8yLkam', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='awsage')


In [19]:
# Retrieve the state of a fine-tune job
job_status = client.fine_tuning.jobs.retrieve(fine_tuning_response.id)
print(job_status)

FineTuningJob(id='ftjob-1QPUJpBQzVQbHQO7G9qu9Iml', created_at=1723569390, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:personal:awsage:9vq7igpe', finished_at=1723572085, hyperparameters=Hyperparameters(n_epochs=3, batch_size=2, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-LUz8bhCnZYA1TqgxH6GUEo06', result_files=['file-cnBsuFqsoQDTOpComXJa0doR'], seed=2087838318, status='succeeded', trained_tokens=480009, training_file='file-Mnvub7g7KEiJ5LSfOU8yLkam', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='awsage')


'ft:gpt-4o-mini-2024-07-18:personal:awsage:9vq7igpe'