In [1]:
pip install --upgrade openai -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from openai import OpenAI

client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],
)

## Dataset

In [5]:
training_data = [
	{
    	"prompt": "What is the capital of France?->",
    	"completion": """ The capital of France is Paris.\n"""
	},
	{
    	"prompt": "What is the primary function of the heart?->",
    	"completion": """ The primary function of the heart is to pump blood throughout the body.\n"""
	},
	{
    	"prompt": "What is photosynthesis?->",
    	"completion": """ Photosynthesis is the process by which green plants and some other organisms convert sunlight into chemical energy stored in the form of glucose.\n"""
	},
	{
    	"prompt": "Who wrote the play 'Romeo and Juliet'?->",
    	"completion": """ William Shakespeare wrote the play 'Romeo and Juliet'.\n"""
	},
	{
    	"prompt": "Which element has the atomic number 1?->",
    	"completion": """ Hydrogen has the atomic number 1.\n"""
	},
	{
    	"prompt": "What is the largest planet in our solar system?->",
    	"completion": """ Jupiter is the largest planet in our solar system.\n"""
	},
	{
    	"prompt": "What is the freezing point of water in Celsius?->",
    	"completion": """ The freezing point of water in Celsius is 0 degrees.\n"""
	},
	{
    	"prompt": "What is the square root of 144?->",
    	"completion": """ The square root of 144 is 12.\n"""
	},
	{
    	"prompt": "Who is the author of 'To Kill a Mockingbird'?->",
    	"completion": """ The author of 'To Kill a Mockingbird' is Harper Lee.\n"""
	},
	{
    	"prompt": "What is the smallest unit of life?->",
    	"completion": """ The smallest unit of life is the cell.\n"""
	}
]

validation_data = [
	{
    	"prompt": "Which gas do plants use for photosynthesis?->",
    	"completion": """ Plants use carbon dioxide for photosynthesis.\n"""
	},
	{
    	"prompt": "What are the three primary colors of light?->",
    	"completion": """ The three primary colors of light are red, green, and blue.\n"""
	},
	{
    	"prompt": "Who discovered penicillin?->",
    	"completion": """ Sir Alexander Fleming discovered penicillin.\n"""
	},
	{
    	"prompt": "What is the chemical formula for water?->",
    	"completion": """ The chemical formula for water is H2O.\n"""
	},
	{
    	"prompt": "What is the largest country by land area?->",
    	"completion": """ Russia is the largest country by land area.\n"""
	},
	{
    	"prompt": "What is the speed of light in a vacuum?->",
    	"completion": """ The speed of light in a vacuum is approximately 299,792 kilometers per second.\n"""
	},
	{
    	"prompt": "What is the currency of Japan?->",
    	"completion": """ The currency of Japan is the Japanese Yen.\n"""
	},
	{
    	"prompt": "What is the smallest bone in the human body?->",
    	"completion": """ The stapes, located in the middle ear, is the smallest bone in the human body.\n"""
	}
]

## Saving the Dataset

In [6]:
import json

training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"

def prepare_data(dictionary_data, final_file_name):
    with open(final_file_name, 'w') as outfile:
        for entry in dictionary_data:
        	json.dump(entry, outfile)
        	outfile.write('\n')

prepare_data(training_data, "training_data.jsonl")
prepare_data(validation_data, "validation_data.jsonl")

## Uploading the Dataset

In [8]:
training_file_id = client.files.create(
  file=open(training_file_name, "rb"),
  purpose="fine-tune"
)

validation_file_id = client.files.create(
  file=open(validation_file_name, "rb"),
  purpose="fine-tune"
)

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

Training File ID: FileObject(id='file-qTGysDnsoqWn8c2TlAk2D7d9', bytes=1310, created_at=1705099652, filename='training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
Validation File ID: FileObject(id='file-zGs1SPbKq2YY9kQsjalabPZe', bytes=1036, created_at=1705099653, filename='validation_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


In [12]:
training_file_id.id

'file-qTGysDnsoqWn8c2TlAk2D7d9'

## Finetuning

In [18]:
response = client.fine_tuning.jobs.create(
  training_file=training_file_id.id, 
  validation_file=validation_file_id.id,
  model="davinci-002", 
  hyperparameters={
    "n_epochs": 15,
	"batch_size": 3,
	"learning_rate_multiplier": 0.3
  }
)

In [20]:
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-WsCjQMEmSvRMFCbhdTqxZzOf.
Training Response: FineTuningJob(id='ftjob-WsCjQMEmSvRMFCbhdTqxZzOf', created_at=1705100342, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=15, batch_size=3, learning_rate_multiplier=0.3), model='davinci-002', object='fine_tuning.job', organization_id='org-jLXWbL5JssIxj9KNgoFBK7Qi', result_files=[], status='validating_files', trained_tokens=None, training_file='file-qTGysDnsoqWn8c2TlAk2D7d9', validation_file='file-zGs1SPbKq2YY9kQsjalabPZe')
Training Status: validating_files


## Monitoring the Jobs

In [23]:
import signal
import datetime


def signal_handler(sig, frame):
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return


print(f"Streaming events for the fine-tuning job: {job_id}")

signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
try:
    for event in events:
        print(
            f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
        )
except Exception:
    print("Stream interrupted (client disconnected).")


Streaming events for the fine-tuning job: ftjob-WsCjQMEmSvRMFCbhdTqxZzOf
2024-01-12 23:00:45 The job has successfully completed
2024-01-12 23:00:42 New fine-tuned model created: ft:davinci-002:personal::8gKnyxn3
2024-01-12 23:00:37 Step 50/50: training loss=0.33, validation loss=0.53
2024-01-12 23:00:37 Step 49/50: training loss=0.33, validation loss=0.49
2024-01-12 23:00:37 Step 48/50: training loss=0.63, validation loss=0.67
2024-01-12 23:00:35 Step 47/50: training loss=0.40, validation loss=0.70
2024-01-12 23:00:35 Step 46/50: training loss=0.66, validation loss=0.47
2024-01-12 23:00:35 Step 45/50: training loss=0.37, validation loss=0.56
2024-01-12 23:00:35 Step 44/50: training loss=0.34, validation loss=0.39
2024-01-12 23:00:34 Step 43/50: training loss=0.37, validation loss=0.64
2024-01-12 23:00:32 Step 42/50: training loss=0.35, validation loss=0.54
2024-01-12 23:00:32 Step 41/50: training loss=0.60, validation loss=0.50
2024-01-12 23:00:32 Step 40/50: training loss=0.42, valida

In [26]:
import time

status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
    print(f"Job not in terminal status: {status}. Waiting.")
    while status not in ["succeeded", "failed"]:
        time.sleep(2)
        status = client.fine_tuning.jobs.retrieve(job_id).status
        print(f"Status: {status}")
else:
    print(f"Finetune job {job_id} finished with status: {status}")
print("Checking other finetune jobs in the subscription.")
result = client.fine_tuning.jobs.list()
print(f"Found {len(result.data)} finetune jobs.")


Finetune job ftjob-WsCjQMEmSvRMFCbhdTqxZzOf finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 2 finetune jobs.


## Accessing the Model

In [46]:
# Retrieve the finetuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

ft:davinci-002:personal::8gKnyxn3


In [59]:
new_prompt = "Which part is the smallest bone in the entire human body?"
answer = client.completions.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer.choices[0].text)

new_prompt = "Which type of gas is utilized by plants during the process of photosynthesis?"
answer = client.completions.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer.choices[0].text)

 It’s so small that it goes unnoticed even with magnifying lenses or microscopes
 It is helpful to teach children that oxygen comes from plants that perform a different kind
