## Fine-tuning OpenAI models with Weights & Biases

https://colab.research.google.com/github/wandb/examples/blob/master/colabs/openai/Fine_tune_OpenAI_with_Weights_and_Biases.ipynb#scrollTo=nMaTVSdD6Fpc

- Data preparation: training data; validation data; Upload files
- Fine-tuned model: fine-tuned model with job_id
- Evaluation
- Inference

In [31]:
from openai import OpenAI
import wandb

import os
import glob
import json
import random
import tiktoken
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from collections import defaultdict
from tenacity import retry, stop_after_attempt, wait_fixed

from openai import OpenAI
import wandb




In [4]:
WANDB_PROJECT = "OpenAI-Fine-Tune-test"

# # Enter credentials
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

### Dataset Preparation

In [8]:
# download a dataset from LegalBench, a project to curate tasks for evaluating legal reasoning, 
# specifically the Contract NLI Explicit Identification task.

from datasets import load_dataset

# Download the data, merge into a single dataset and shuffle
dataset = load_dataset("nguha/legalbench", "contract_nli_explicit_identification")

data = []
for d in dataset["train"]:
  data.append(d)

for d in dataset["test"]:
  data.append(d)

random.shuffle(data)

for idx, d in enumerate(data):
  d["new_index"] = idx

Generating train split: 100%|██████████| 8/8 [00:00<00:00, 43.97 examples/s]
Generating test split: 100%|██████████| 109/109 [00:00<00:00, 930.20 examples/s]


In [9]:
len(data), data[0:2]

(117,
 [{'answer': 'No',
   'index': '77',
   'text': 'The obligations of the Receiving Party pursuant to the provisions of this Agreement shall not apply to any Confidential Information that – 8.3 is developed independently of the Disclosing Party by the Receiving Party in circumstances that do not amount to a breach of the provisions of this Agreement; ',
   'document_name': 'AfriGIS_Client-NDA_Template_2019.pdf',
   'new_index': 0},
  {'answer': 'No',
   'index': '97',
   'text': 'Except as expressly authorized by prior written consent of the Disclosing Party, the Receiving Party shall: (e) use all Confidential Information received by it for the purposes described in subsection  (a) of this Section 2 and for no other purpose whatsoever. ',
   'document_name': '1173495_0001047469-03-033872_a2118144zex-10_12.txt',
   'new_index': 1}])

### Format our Data for Chat Completion Models

In [12]:
base_prompt_zero_shot = "Identify if the clause provides that all Confidential Information shall be expressly identified by the Disclosing Party. Answer with only `Yes` or `No`"

In [10]:
n_train = 30
n_test = len(data) - n_train

In [13]:
train_messages = []
test_messages = []

for d in data:
  prompts = []
  prompts.append({"role": "system", "content": base_prompt_zero_shot})
  prompts.append({"role": "user", "content": d["text"]})
  prompts.append({"role": "assistant", "content": d["answer"]})

  if int(d["new_index"]) < n_train:
    train_messages.append({'messages': prompts})
  else:
    test_messages.append({'messages': prompts})

len(train_messages), len(test_messages), n_test, train_messages[5]

(30,
 87,
 87,
 {'messages': [{'role': 'system',
    'content': 'Identify if the clause provides that all Confidential Information shall be expressly identified by the Disclosing Party. Answer with only `Yes` or `No`'},
   {'role': 'user',
    'content': 'The definition of Information shall not include information that: (e) is developed by the Receiving Party independently of Information disclosed by the Disclosing Party and without breach of this Agreement. '},
   {'role': 'assistant', 'content': 'No'}]})

### Save the data to jsonl first


In [14]:
# Save the data in a train and test file first
train_file_path = 'encoded_train_data.jsonl'
with open(train_file_path, 'w') as file:
    for item in train_messages:
        line = json.dumps(item)
        file.write(line + '\n')

test_file_path = 'encoded_test_data.jsonl'
with open(test_file_path, 'w') as file:
    for item in test_messages:
        line = json.dumps(item)
        file.write(line + '\n')

#### Validate that our training data is in the correct format

In [15]:
# Next, we specify the data path and open the JSONL file

def openai_validate_data(dataset_path):
  data_path = dataset_path

  # Load dataset
  with open(data_path) as f:
      dataset = [json.loads(line) for line in f]

  # We can inspect the data quickly by checking the number of examples and the first item

  # Initial dataset stats
  print("Num examples:", len(dataset))
  print("First example:")
  for message in dataset[0]["messages"]:
      print(message)

  # Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

  # Format error checks
  format_errors = defaultdict(int)

  for ex in dataset:
      if not isinstance(ex, dict):
          format_errors["data_type"] += 1
          continue

      messages = ex.get("messages", None)
      if not messages:
          format_errors["missing_messages_list"] += 1
          continue

      for message in messages:
          if "role" not in message or "content" not in message:
              format_errors["message_missing_key"] += 1

          if any(k not in ("role", "content", "name") for k in message):
              format_errors["message_unrecognized_key"] += 1

          if message.get("role", None) not in ("system", "user", "assistant"):
              format_errors["unrecognized_role"] += 1

          content = message.get("content", None)
          if not content or not isinstance(content, str):
              format_errors["missing_content"] += 1

      if not any(message.get("role", None) == "assistant" for message in messages):
          format_errors["example_missing_assistant_message"] += 1

  if format_errors:
      print("Found errors:")
      for k, v in format_errors.items():
          print(f"{k}: {v}")
  else:
      print("No errors found")

  # Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

  # Token counting functions
  encoding = tiktoken.get_encoding("cl100k_base")

  # not exact!
  # simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
  def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
      num_tokens = 0
      for message in messages:
          num_tokens += tokens_per_message
          for key, value in message.items():
              num_tokens += len(encoding.encode(value))
              if key == "name":
                  num_tokens += tokens_per_name
      num_tokens += 3
      return num_tokens

  def num_assistant_tokens_from_messages(messages):
      num_tokens = 0
      for message in messages:
          if message["role"] == "assistant":
              num_tokens += len(encoding.encode(message["content"]))
      return num_tokens

  def print_distribution(values, name):
      print(f"\n#### Distribution of {name}:")
      print(f"min / max: {min(values)}, {max(values)}")
      print(f"mean / median: {np.mean(values)}, {np.median(values)}")
      print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

  # Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

  # Warnings and tokens counts
  n_missing_system = 0
  n_missing_user = 0
  n_messages = []
  convo_lens = []
  assistant_message_lens = []

  for ex in dataset:
      messages = ex["messages"]
      if not any(message["role"] == "system" for message in messages):
          n_missing_system += 1
      if not any(message["role"] == "user" for message in messages):
          n_missing_user += 1
      n_messages.append(len(messages))
      convo_lens.append(num_tokens_from_messages(messages))
      assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

  print("Num examples missing system message:", n_missing_system)
  print("Num examples missing user message:", n_missing_user)
  print_distribution(n_messages, "num_messages_per_example")
  print_distribution(convo_lens, "num_total_tokens_per_example")
  print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
  n_too_long = sum(l > 4096 for l in convo_lens)
  print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

  # Pricing and default n_epochs estimate
  MAX_TOKENS_PER_EXAMPLE = 4096

  MIN_TARGET_EXAMPLES = 100
  MAX_TARGET_EXAMPLES = 25000
  TARGET_EPOCHS = 3
  MIN_EPOCHS = 1
  MAX_EPOCHS = 25

  n_epochs = TARGET_EPOCHS
  n_train_examples = len(dataset)
  if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
      n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
  elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
      n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

  n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
  print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
  print(f"By default, you'll train for {n_epochs} epochs on this dataset")
  print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
  print("See pricing page to estimate total costs")

In [16]:
openai_validate_data(train_file_path)

Num examples: 30
First example:
{'role': 'system', 'content': 'Identify if the clause provides that all Confidential Information shall be expressly identified by the Disclosing Party. Answer with only `Yes` or `No`'}
{'role': 'user', 'content': 'The obligations of the Receiving Party pursuant to the provisions of this Agreement shall not apply to any Confidential Information that – 8.3 is developed independently of the Disclosing Party by the Receiving Party in circumstances that do not amount to a breach of the provisions of this Agreement; '}
{'role': 'assistant', 'content': 'No'}
No errors found
Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 61, 273
mean / median: 128.83333333333334, 123.5
p5 / p95: 78.8, 186.90000000000003

#### Distribution of num_assistant_tokens_per_example:
min / ma

### Upload the training and validation data to OpenAI

Do not upload files repeatedly

In [22]:
openai_train_file_info = client.files.create(
    file=open(train_file_path, "rb"), purpose="fine-tune"
)

openai_valid_file_info = client.files.create(
    file=open(test_file_path, "rb"), purpose="fine-tune"
)

In [23]:
openai_train_file_info

FileObject(id='file-VYndPZHIuBw6XHOg6LRAUPG2', bytes=21475, created_at=1729261290, filename='encoded_train_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [24]:
openai_valid_file_info

FileObject(id='file-OPwQ1OtKgw2qOL5byaYdcuQE', bytes=69668, created_at=1729261291, filename='encoded_test_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

### Train the model and log to Weights & Biases

In [25]:
model = 'gpt-3.5-turbo'
# model = "gpt-4o-mini-2024-07-18"
n_epochs = 3

In [26]:
openai_ft_job_info = client.fine_tuning.jobs.create(
    training_file=openai_train_file_info.id,
    model=model,
    hyperparameters={"n_epochs": n_epochs},
    validation_file=openai_valid_file_info.id
)

ft_job_id = openai_ft_job_info.id

### Start Weight & Biases Sync

Calling WandbLogger.sync will start polling OpenAI for the fine-tuning job results and log them when they are retrieved, see the docs for how to modify this behaviour

In [27]:
# Log to Weights and Biases
# 79aec238fd9c8bd9024bbd0651d5518aba741797
from wandb.integration.openai.fine_tuning import WandbLogger
WandbLogger.sync(fine_tune_job_id=ft_job_id, project=WANDB_PROJECT, openai_client=client)

[34m[1mwandb[0m: Retrieving fine-tune job...
[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to finish training...
[34m[1mwandb[0m: To avoid blocking, you can call `WandbLogger.sync` with `wait_for_job_success=False` after OpenAI training completes.
[34m[1mwandb[0m: Fine-tuning finished, logging metrics, model metadata, and run metadata to Weights & Biases
[34m[1mwandb[0m: Logging training/validation files...


0,1
train_accuracy,█▁██████▁██▁█████████████▁██████████████
train_loss,▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_loss,▁▁▂▁▁▁▁▁▁▁▄▅▁▁▁▁▁▁▁█▅▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▂▁▁
valid_mean_token_accuracy,███████████████████████████████▁███▁█▁██

0,1
fine_tuned_model,ft:gpt-3.5-turbo-012...
status,succeeded
train_accuracy,1
train_loss,0
valid_loss,6.86868
valid_mean_token_accuracy,0.66667


'🎉 wandb sync completed successfully'

Logging the fine-tuning job to W&B is straight forward. The integration will automatically log the following to W&B:

- training and validation metrics (if validation data is provided)
- log the training and validation data as W&B Tables for storage and versioning
- log the fine-tuned model's metadata.

The integration automatically creates the DAG lineage between the data and the model.

> You can call the `WandbLogger` with the job id. The cell will keep running till the fine-tuning job is not complete. Once the job's status is `succeeded`, the `WandbLogger` will log metrics and data to W&B. This way you don't have to wait for the fine-tune job to be completed to call `WandbLogger.sync`.

### Run evalution and log the results

The best way to evaluate a generative model is to explore sample predictions from your evaluation set.

Let's generate a few inference samples and log them to W&B and see how the performance compares to a baseline ChatGPT-3.5 model

We will be evaluating using the validation dataset. In the overview tab of the run page, find the "validation_files" in the Artifact Inputs 
section. 

Clicking on it will take you to the artifacts page. Copy the artifact URI (full name) as shown in the image below.

In [29]:
run = wandb.init(
    project=WANDB_PROJECT,
    job_type='eval'
)

VALIDATION_FILE_ARTIFACT_URI = 'tyliu1122-university-of-technology-sydney/OpenAI-Fine-Tune-test/valid-file-OPwQ1OtKgw2qOL5byaYdcuQE:v0' # REPLACE THIS WITH YOUR OWN ARTIFACT URI

artifact_valid = run.use_artifact(
    VALIDATION_FILE_ARTIFACT_URI,
    type='validation_files'
)

The code snippet below, download the logged validation data and prepare a pandas dataframe from it.

In [32]:
artifact_valid_path = artifact_valid.download()
print("Downloaded the validation data at: ", artifact_valid_path)

validation_file = glob.glob(f"{artifact_valid_path}/*.table.json")[0]
with open(validation_file, 'r') as file:
    data = json.load(file)

validation_df = pd.DataFrame(columns=data["columns"], data=data["data"])

print(f"There are {len(validation_df)} validation examples")
run.config.update({"num_validation_samples":len(validation_df)})

validation_df.head()

[34m[1mwandb[0m:   2 of 2 files downloaded.  


Downloaded the validation data at:  /Users/tianyuliu/Code/llm/NLP_examples/src/llm/openai_api/Finetune API/artifacts/valid-file-OPwQ1OtKgw2qOL5byaYdcuQE:v0
There are 87 validation examples


Unnamed: 0,role: system,role: user,role: assistant
0,Identify if the clause provides that all Confi...,"a. What is included, ""Confidential information...",No
1,Identify if the clause provides that all Confi...,Confidential Information provided by Disclosin...,Yes
2,Identify if the clause provides that all Confi...,"As used in this Agreement, the terms ""CompuCom...",Yes
3,Identify if the clause provides that all Confi...,"3.5. ""Confidential information"" means any info...",No
4,Identify if the clause provides that all Confi...,"(c) Each Party agrees that, without the prior ...",No


We will need to package the data in the dataframe in the format acceptable by GPT 3.5. The format is:

```
{"messages": [{"role": "system", "content": "some system prompt"}, {"role": "user", "content": "some user prompt"}, {"role": "assistant", "content": "completion text"}]}
```

For evaluation we don't need to pack the `{"role": "assistant", "content": "completition text"}` in `messages` as this is meant to be generated by GPT 3.5.

In [33]:
def eval_data_format(row):
    role_system_content = row["role: system"]
    role_system_dict = {"role": "system", "content": role_system_content}

    role_user_content = row["role: user"]
    role_user_dict = {"role": "user", "content": role_user_content}
    
    return [role_system_dict, role_user_dict]

validation_df["messages"] = validation_df.apply(lambda row: eval_data_format(row), axis=1)
validation_df.head()

Unnamed: 0,role: system,role: user,role: assistant,messages
0,Identify if the clause provides that all Confi...,"a. What is included, ""Confidential information...",No,"[{'role': 'system', 'content': 'Identify if th..."
1,Identify if the clause provides that all Confi...,Confidential Information provided by Disclosin...,Yes,"[{'role': 'system', 'content': 'Identify if th..."
2,Identify if the clause provides that all Confi...,"As used in this Agreement, the terms ""CompuCom...",Yes,"[{'role': 'system', 'content': 'Identify if th..."
3,Identify if the clause provides that all Confi...,"3.5. ""Confidential information"" means any info...",No,"[{'role': 'system', 'content': 'Identify if th..."
4,Identify if the clause provides that all Confi...,"(c) Each Party agrees that, without the prior ...",No,"[{'role': 'system', 'content': 'Identify if th..."


### Run evaluation on the Fine-Tuned Model

Next up we will get the fine-tuned model's id from the logged `model_metadata`. In the overview tab of the run page, find the "model" in the Artifact Outputs section. Clicking on it will take you to the artifacts page. Copy the artifact URI (full name) as shown in the image below.

In [34]:
MODEL_ARTIFACT_URI = 'tyliu1122-university-of-technology-sydney/OpenAI-Fine-Tune-test/model-metadata:v0' # REPLACE THIS WITH YOUR OWN ARTIFACT URI

model_artifact = run.use_artifact(
    MODEL_ARTIFACT_URI,
    type='model'
)

In [35]:
model_metadata_path = model_artifact.download()
print("Downloaded the validation data at: ", model_metadata_path)

model_metadata_file = glob.glob(f"{model_metadata_path}/*.json")[0]
with open(model_metadata_file, 'r') as file:
    model_metadata = json.load(file)

model_metadata

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Downloaded the validation data at:  /Users/tianyuliu/Code/llm/NLP_examples/src/llm/openai_api/Finetune API/artifacts/model-metadata:v0


{'id': 'ftjob-Ef468OMa5MKacvqzD0LYtGEV',
 'created_at': 1729261503,
 'error': "{'code': None, 'message': None, 'param': None}",
 'fine_tuned_model': 'ft:gpt-3.5-turbo-0125:personal::AJiJ8bR6',
 'finished_at': 1729261912,
 'hyperparameters': "{'n_epochs': 3, 'batch_size': 1, 'learning_rate_multiplier': 2}",
 'model': 'gpt-3.5-turbo-0125',
 'object': 'fine_tuning.job',
 'organization_id': 'org-v1K2Lxyjnu8PFhpx9OPExASg',
 'result_files': "['file-RH2WFPkKmYye7mYt99YYKR6I']",
 'seed': 486798962,
 'status': 'succeeded',
 'trained_tokens': 11415,
 'training_file': 'file-VYndPZHIuBw6XHOg6LRAUPG2',
 'validation_file': 'file-OPwQ1OtKgw2qOL5byaYdcuQE',
 'estimated_finish': 'None',
 'integrations': '[]',
 'user_provided_suffix': 'None'}

In [36]:
fine_tuned_model = model_metadata["fine_tuned_model"]
client = OpenAI()

In [37]:
prediction_table = wandb.Table(columns=['messages', 'completion', 'target'])

eval_data = []

for idx, row in tqdm(validation_df.iterrows()):
    messages = row.messages
    target = row["role: assistant"]

    res = client.chat.completions.create(model=fine_tuned_model, messages=messages, max_tokens=10)
    completion = res.choices[0].message.content

    eval_data.append([messages, completion, target])
    prediction_table.add_data(messages[1]['content'], completion, target)

wandb.log({'predictions': prediction_table})

87it [02:41,  1.86s/it]


In [38]:
correct = 0
for e in eval_data:
  if e[1].lower() == e[2].lower():
    correct+=1

accuracy = correct / len(eval_data)

print(f"Accuracy is {accuracy}")
wandb.log({"eval/accuracy": accuracy})
wandb.summary["eval/accuracy"] = accuracy

Accuracy is 0.8620689655172413


## Run evaluation on a Baseline model for comparison

In [39]:
baseline_prediction_table = wandb.Table(columns=['messages', 'completion', 'target'])

baseline_eval_data = []

for idx, row in tqdm(validation_df.iterrows()):
    messages = row.messages
    target = row["role: assistant"]

    res = client.chat.completions.create(model="gpt-3.5-turbo", messages=messages, max_tokens=10)
    completion = res.choices[0].message.content

    baseline_eval_data.append([messages, completion, target])
    baseline_prediction_table.add_data(messages[1]['content'], completion, target)

wandb.log({'baseline_predictions': baseline_prediction_table})

87it [00:47,  1.82it/s]


In [40]:
baseline_correct = 0
for e in baseline_eval_data:
  if e[1].lower() == e[2].lower():
    baseline_correct+=1

baseline_accuracy = baseline_correct / len(baseline_eval_data)
print(f"Baseline Accurcy is: {baseline_accuracy}")
wandb.log({"eval/baseline_accuracy": baseline_accuracy})
wandb.summary["eval/baseline_accuracy"] =  baseline_accuracy

Baseline Accurcy is: 0.7931034482758621


In [41]:
wandb.finish()

0,1
eval/accuracy,▁
eval/baseline_accuracy,▁

0,1
eval/accuracy,0.86207
eval/baseline_accuracy,0.7931
