In [None]:
# make sure to use the latest version of the openai python package
!pip install --upgrade openai

In [None]:
import json
import openai
import os
import pandas as pd
from pprint import pprint
from openai import OpenAI # for calling the OpenAI API

In [None]:
# Read the API key from the environment only. Failing fast with a KeyError when
# OPENAI_API_KEY is unset is deliberate: a literal fallback key in the notebook
# leaks the credential to everyone who can read the file.
# NOTE(review): any key that has ever been committed in this cell must be
# treated as compromised and revoked.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Read in the dataset we'll use for this task.
# Each row is one article; the cells below rely on its 'Title' and
# 'Article Content' columns.
# NOTE(review): the path is absolute (filesystem root) — confirm that this is
# where the CSV lives in the target environment.
fashion_df = pd.read_csv("/articles_data.csv")

fashion_df.head()

In [None]:
# Placeholder; the real examples are assigned once the conversations are built.
training_data = list()

# System prompt placed at the start of every example conversation.
system_message = (
    "You are a helpful fashion sustainability assistant. "
    "You are to extract the data in the article content "
    "to respond back to user's questions on fashion sustainability."
)

def create_user_message(row):
    """Render one article as the user turn of a conversation.

    `row` is anything indexable by the keys 'Title' and 'Article Content'
    (in this notebook, a DataFrame row). Returns both fields in a single
    string, separated by a blank line.
    """
    title = row['Title']
    body = row['Article Content']
    return "Title: {}\n\nContent: {}".format(title, body)

def prepare_example_conversation(row):
    """Build one chat-format fine-tuning example from an article row.

    Returns a dict of the form {"messages": [system, user, assistant]}:
    the system turn carries `system_message`, the user turn is produced by
    `create_user_message(row)`, and the assistant turn is the row's
    'Article Content' value.
    """
    # NOTE(review): the assistant target is the same 'Article Content' text that
    # is already embedded in the user turn, so every example trains the model to
    # repeat its input — confirm this is the intended target.
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": create_user_message(row)},
            {"role": "assistant", "content": row["Article Content"]},
        ]
    }

# Sanity check: show the conversation generated for the first article.
first_example = prepare_example_conversation(fashion_df.iloc[0])
pprint(first_example)

In [None]:
# Train on rows 0–2000 (`.loc` slices are label-based and inclusive), skipping
# rows 451–900: the validation cell holds those out, and examples that appear in
# both files would make the validation metrics meaningless.
training_df = pd.concat([fashion_df.loc[0:450], fashion_df.loc[901:2000]])

# apply the prepare_example_conversation function to each row of the training_df
training_data = training_df.apply(prepare_example_conversation, axis=1).tolist()

# Preview only a handful of examples; each one embeds a full article.
for example in training_data[:5]:
    print(example)

In [None]:
# Rows labelled 451 through 900 (inclusive) form the validation set.
validation_df = fashion_df.loc[451:900]

# One chat-format example per validation row, collected into a plain list.
validation_conversations = validation_df.apply(prepare_example_conversation, axis=1)
validation_data = validation_conversations.tolist()

In [None]:
def write_jsonl(data_list: list, filename: str) -> None:
    """Serialize `data_list` to `filename` in JSON Lines format.

    Every element is encoded with `json.dumps` and written on its own
    newline-terminated line. An existing file is overwritten.
    """
    with open(filename, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in data_list)

In [None]:
# Output paths for the two JSONL files that get uploaded for fine-tuning.
training_file_name = "tmp_fashion_finetune_training.jsonl"
validation_file_name = "tmp_fashion_finetune_validation.jsonl"

# Write the training examples first, then the validation examples.
for dataset, destination in (
    (training_data, training_file_name),
    (validation_data, validation_file_name),
):
    write_jsonl(dataset, destination)

In [None]:
# print the first 5 lines of the training file
!head -n 20 tmp_fashion_finetune_training.jsonl

In [None]:
def _upload_for_fine_tuning(path):
    """Upload the file at `path` with purpose "fine-tune" and return the API response."""
    with open(path, "rb") as file_handle:
        return client.files.create(file=file_handle, purpose="fine-tune")


# Upload the training set first, then the validation set; the returned IDs are
# what the fine-tuning job refers to.
training_response = _upload_for_fine_tuning(training_file_name)
training_file_id = training_response.id

validation_response = _upload_for_fine_tuning(validation_file_name)
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

In [None]:
# Parameters for the fine-tuning job: the two uploaded files, the base model,
# and a suffix for the resulting model.
# NOTE(review): the suffix "recipe-ner" does not match the fashion data used in
# this notebook — confirm whether it should be renamed.
fine_tune_request = {
    "training_file": training_file_id,
    "validation_file": validation_file_id,
    "model": "gpt-3.5-turbo",
    "suffix": "recipe-ner",
}
response = client.fine_tuning.jobs.create(**fine_tune_request)

# Keep the job ID around; the following cells use it to poll the job.
job_id = response.id

print("Job ID:", job_id)
print("Status:", response.status)

In [None]:
# Fetch the current state of the fine-tuning job and report its key fields.
response = client.fine_tuning.jobs.retrieve(job_id)

for label, field in (
    ("Job ID:", "id"),
    ("Status:", "status"),
    ("Trained Tokens:", "trained_tokens"),
):
    print(label, getattr(response, field))

In [None]:
# Fetch the events recorded for the job and print their messages.
response = client.fine_tuning.jobs.list_events(job_id)

# Reverse the returned list in place before printing.
# NOTE(review): presumably the API returns newest-first, making this
# chronological order — confirm.
events = response.data
events.reverse()

event_messages = (event.message for event in events)
for message in event_messages:
    print(message)

In [None]:
# Look up the job again and read the ID of the model it produced.
response = client.fine_tuning.jobs.retrieve(job_id)
fine_tuned_model_id = response.fine_tuned_model

# The job exposes no model ID (None) when this runs too early; stop here rather
# than continue without a model.
if fine_tuned_model_id is not None:
    print("Fine-tuned model ID:", fine_tuned_model_id)
else:
    raise RuntimeError("Fine-tuned model ID not found. Your job has likely not been completed yet.")