# Fine tuning Conditional Generation


In [1]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install --upgrade openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import csv
import json
import openai
import signal
import datetime
import time
import os
import pandas as pd
import subprocess

In [4]:
# Set the working directory and specify the .csv file with prompts and completions

new_directory = "/Users/rachelweasley/PycharmProjects/TuningGPT"
os.chdir(new_directory)

In [5]:
# Provide your API key by reading it from a .txt file:

config_filename = "API_KEY.txt"

with open(config_filename, 'r') as file:
    api_key = file.readline().strip().split('=')[1]

openai.api_key = api_key

# Alternatively, you may directly assign your API key (if you prefer not to use a .txt file)
# openai.api_key = "<your_api_key>"

In [6]:
training_data = 'training_data.csv'

def prepare_data(csv_file, jsonl_file):
    training_data = []

    with open(csv_file, 'r', encoding='utf-8-sig') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            prompt = row['prompt']
            if not prompt.endswith('?'):
                prompt += '?'  # Add question mark if missing
            prompt = prompt + '->'
            completion = ' ' + row['completion']
            if not completion.endswith('.'):
                completion += '.'  # Add period if missing
            completion += '\n'
            entry = {'prompt': prompt,
					           'completion': completion}
            training_data.append(entry)

    with open(jsonl_file, 'w') as jsonlfile:
        for entry in training_data:
            json.dump(entry, jsonlfile)
            jsonlfile.write('\n')


prepare_data(training_data, 'training_data.jsonl')

{'prompt': 'What is the capital of France?->', 'completion': ' The capital of France is Paris.\n'}
{'prompt': 'What is the primary function of the heart?->', 'completion': ' The primary function of the heart is to pump blood throughout the body.\n'}
{'prompt': 'What is photosynthesis?->', 'completion': '  Photosynthesis is the process by which green plants and some other organisms convert sunlight into chemical energy stored in the form of glucose.\n'}
{'prompt': "Who wrote the play 'Romeo and Juliet'?->", 'completion': "  William Shakespeare wrote the play 'Romeo and Juliet'.\n"}
{'prompt': 'Which element has the atomic number 1?->', 'completion': '  Hydrogen has the atomic number 1.\n'}
{'prompt': 'What is the largest planet in our solar system?->', 'completion': '  Jupiter is the largest planet in our solar system.\n'}
{'prompt': 'What is the freezing point of water in Celsius?->', 'completion': '  The freezing point of water in Celsius is 0 degrees.\n'}
{'prompt': 'What is the squa

In [None]:
!openai tools fine_tunes.prepare_data -f "training_data.jsonl"

In [19]:
training_file_id = openai.File.create(
  file=open("training_data.jsonl", "rb"),
  purpose='fine-tune'
)["id"]

In [61]:
create_args = {
	"training_file": training_file_id,
	"model": "davinci",
	"n_epochs": 15,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
}

response = openai.FineTune.create(**create_args)
job_id = response["id"]
status = response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ft-LTqFIfBcnJPH7QWQ6AbCnAhb.
Training Response: {
  "object": "fine-tune",
  "id": "ft-LTqFIfBcnJPH7QWQ6AbCnAhb",
  "hyperparams": {
    "n_epochs": 15,
    "batch_size": 3,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.3
  },
  "organization_id": "org-RNi40Jk2y7io1td5Be2MNZms",
  "model": "davinci",
  "training_files": [
    {
      "object": "file",
      "id": "file-9aAWkUG1sdThMMDnmsjo5v3n",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 1318,
      "created_at": 1689799566,
      "status": "uploaded",
      "status_details": null
    }
  ],
  "validation_files": [],
  "result_files": [],
  "created_at": 1689799568,
  "updated_at": 1689799568,
  "status": "pending",
  "fine_tuned_model": null,
  "events": [
    {
      "object": "fine-tune-event",
      "level": "info",
      "message": "Created fine-tune: ft-LTqFIfBcnJPH7QWQ6AbCnAhb",
      "created_at": 1689799568
    }
  ]
}
Training Status: pending


In [62]:
import signal
import datetime

def signal_handler(sig, frame):
	status = openai.FineTune.retrieve(job_id).status
	print(f"Stream interrupted. Job is still {status}.")
	return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
  for event in events:
    print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
except Exception:
  print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ft-LTqFIfBcnJPH7QWQ6AbCnAhb
2023-07-19 20:46:08 Created fine-tune: ft-LTqFIfBcnJPH7QWQ6AbCnAhb
Stream interrupted (client disconnected).


In [24]:
import time

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in ["succeeded", "failed"]:
  print(f'Job not in terminal status: {status}. Waiting.')
  while status not in ["succeeded", "failed"]:
    time.sleep(2)
    status = openai.FineTune.retrieve(id=job_id)["status"]
    print(f'Status: {status}')
else:
  print(f'Finetune job {job_id} finished with status: {status}')

print('Checking other finetune jobs in the subscription.')
result = openai.FineTune.list()
print(f'Found {len(result.data)} finetune jobs.')

Finetune job ft-LTqFIfBcnJPH7QWQ6AbCnAhb finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 4 finetune jobs.


In [25]:
openai.FineTune.retrieve(id=job_id)

<FineTune fine-tune id=ft-LTqFIfBcnJPH7QWQ6AbCnAhb at 0x7ff5e4bd9090> JSON: {
  "object": "fine-tune",
  "id": "ft-LTqFIfBcnJPH7QWQ6AbCnAhb",
  "hyperparams": {
    "n_epochs": 15,
    "batch_size": 3,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.3
  },
  "organization_id": "org-RNi40Jk2y7io1td5Be2MNZms",
  "model": "davinci",
  "training_files": [
    {
      "object": "file",
      "id": "file-9aAWkUG1sdThMMDnmsjo5v3n",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 1318,
      "created_at": 1689799566,
      "status": "processed",
      "status_details": null
    }
  ],
  "validation_files": [],
  "result_files": [
    {
      "object": "file",
      "id": "file-jjJF3ATJTJBh2A35cnrZq9ld",
      "purpose": "fine-tune-results",
      "filename": "compiled_results.csv",
      "bytes": 2475,
      "created_at": 1689815973,
      "status": "processed",
      "status_details": null
    }
  ],
  "created_at": 1689799568,
  "updated_at": 16898159

In [26]:
fine_tuned_model = openai.FineTune.retrieve(id=job_id)["fine_tuned_model"]

In [34]:
new_prompt = "Which part is the smallest bone in the entire human body?"
answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer['choices'][0]['text'])

new_prompt = """ Which type of gas is utilized by plants during the process of photosynthesis?"""
answer = openai.Completion.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer['choices'][0]['text'])

-> The smallest bone in the entire human body is the hyoid bone.

-> The type of gas utilized by plants during the process of photosynthesis is oxygen
