In [2]:
import datetime
import json
import os
import re
import signal
import sys
import time

import numpy as np
import openai
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import RANDOM_STATE, OPENAI_API_KEY

# Authenticate the OpenAI client with the key imported from the constants module
openai.api_key = OPENAI_API_KEY

# OpenAI GPT model settings
GPT_MODEL_NAME, TEMPERATURE = "gpt-3.5-turbo", 0

# Data directory, derived from the notebook's relative path
# (relative paths resolve against the kernel's working directory)
notebook_relative_path = "../../data/amazon-beauty/fine_tune.ipynb"
current_dir = os.path.abspath(os.path.dirname(notebook_relative_path))
print(f"current directory: {current_dir}")

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty


In [3]:
# Location of the merged dataset CSV inside the data directory
data_file_name = 'merged_data.csv'
data_path = os.path.join(current_dir, data_file_name)
print(f'data path: {data_path}')

data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/merged_data.csv


# Convert the pandas DataFrame to JSONL format


In [4]:
# Load the merged dataset
movie_data = pd.read_csv(data_path)

# 80/20 split: sample 80% of the rows for training, keep the remaining rows for validation
train_data = movie_data.sample(frac=0.8, random_state=RANDOM_STATE)
validation_data = movie_data.drop(train_data.index)


def _to_prompt_completion(frame):
    """Return one {"prompt", "completion"} dict per row of `frame`.

    The prompt is the row's title prefixed with "Title: "; the completion is
    the row's rating converted to a string.
    """
    return [
        {"prompt": f"Title: {row['title']}", "completion": str(row['rating'])}
        for _, row in frame.iterrows()
    ]


# Fine-tuning records for each split
training_data = _to_prompt_completion(train_data)
validation_data_list = _to_prompt_completion(validation_data)

# Output directory and JSONL file paths
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/fine_tune.ipynb"))
training_file_name = os.path.join(current_dir, "training_data.jsonl")
validation_file_name = os.path.join(current_dir, "validation_data.jsonl")

def prepare_data(dictionary_data, final_file_name):
    """Write records to `final_file_name` in JSONL form.

    Args:
        dictionary_data: iterable of JSON-serialisable objects (one per line).
        final_file_name: path of the file to create or overwrite.
    """
    serialized_lines = (json.dumps(record) + '\n' for record in dictionary_data)
    with open(final_file_name, 'w') as jsonl_file:
        jsonl_file.writelines(serialized_lines)

# Write the training split first, then the validation split
for records, jsonl_path in (
    (training_data, training_file_name),
    (validation_data_list, validation_file_name),
):
    prepare_data(records, jsonl_path)

# Fine-tuning

In [5]:
%%time
# Upload the datasets to OpenAI
def upload_data_to_OpenAI(file_name):
    """Upload a local file to OpenAI with purpose 'fine-tune' and return its file id.

    Args:
        file_name: path of the file to upload; it is opened in binary mode.

    Returns:
        The `id` attribute of the object returned by `openai.File.create`.
    """
    with open(file_name, "rb") as file_handle:
        uploaded_file = openai.File.create(file=file_handle, purpose='fine-tune')
    return uploaded_file.id

# Upload both splits (training first, then validation) and keep their file ids
training_file_id, validation_file_id = (
    upload_data_to_OpenAI(path) for path in (training_file_name, validation_file_name)
)

# Fine-tuning job parameters
# NOTE(review): the hyperparameter values below are hand-picked; confirm them
# against the API defaults before relying on them.
create_args = dict(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="davinci",
    n_epochs=10,
    batch_size=2,
    learning_rate_multiplier=0.5,
)

# Submit the job and remember its id and initial status
response = openai.FineTune.create(**create_args)
job_id, status = response["id"], response["status"]

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")


# Wait for the fine-tuning job to reach a terminal state.
# Polling only for "succeeded" would loop forever if the job fails or is cancelled.
terminal_statuses = {"succeeded", "failed", "cancelled"}
while status not in terminal_statuses:
    print(f"Waiting for fine-tuning to complete. Current status: {status}")
    time.sleep(60)  # Wait for 60 seconds before checking again
    response = openai.FineTune.retrieve(job_id)
    status = response["status"]

if status != "succeeded":
    raise RuntimeError(f"Fine-tuning job {job_id} ended with status '{status}'.")

# The job's "id" (ft-...) identifies the job, not the model; the name to pass
# to the Completion API is stored in "fine_tuned_model".
fine_tuned_model = response["fine_tuned_model"]

# Generate predictions using the fine-tuned model
_LEADING_NUMBER = re.compile(r"[-+]?\d+(?:\.\d+)?")


def _parse_rating(text):
    """Convert a completion string to a float, tolerating text after the number."""
    cleaned = text.strip()
    try:
        return float(cleaned)
    except ValueError:
        # Fall back to the leading number, e.g. "5.0\nTitle: ..." -> 5.0
        match = _LEADING_NUMBER.match(cleaned)
        if match is None:
            raise ValueError(f"Completion does not start with a number: {text!r}") from None
        return float(match.group())


def get_model_predictions(model, prompts, temperature=0, max_tokens=5, create_completion=None):
    """Query a completion model once per prompt and parse each reply as a float rating.

    Args:
        model: name of the model to query.
        prompts: iterable of prompt strings.
        temperature: sampling temperature; 0 keeps decoding deterministic.
        max_tokens: cap on generated tokens per prompt (a rating needs only a few).
        create_completion: callable accepting `model`, `prompt`, `temperature`
            and `max_tokens` keyword arguments and returning a response whose
            `['choices'][0]['text']` is the completion. Defaults to
            `openai.Completion.create`.

    Returns:
        List of floats, one per prompt, in input order.

    Raises:
        ValueError: if a completion does not start with a number.
    """
    if create_completion is None:
        create_completion = openai.Completion.create

    predictions = []
    for prompt in prompts:
        response = create_completion(
            model=model,
            prompt=prompt,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        predictions.append(_parse_rating(response['choices'][0]['text']))
    return predictions

# Build inference prompts in the same "Title: ..." format as the fine-tuning
# examples; sending the bare title would not match what the model was trained on.
prompts = [f"Title: {title}" for title in validation_data['title']]

# Get predictions
predicted_ratings = get_model_predictions(fine_tuned_model, prompts)

# True ratings from the validation set
true_ratings = validation_data['rating'].tolist()

# Calculate evaluation metrics
mae = mean_absolute_error(true_ratings, predicted_ratings)
mse = mean_squared_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")



Fine-tuning model with jobID: ft-ifP1a6eOV8nLQqRdqiVWrPPh.
Training Response: {
  "created_at": 1695959342,
  "events": [
    {
      "created_at": 1695959342,
      "level": "info",
      "message": "Created fine-tune: ft-ifP1a6eOV8nLQqRdqiVWrPPh",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": 2,
    "learning_rate_multiplier": 0.5,
    "n_epochs": 10,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-ifP1a6eOV8nLQqRdqiVWrPPh",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-ppCCXZWpTaByE4cI4jMbZjBx",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 3260,
      "created_at": 1695959341,
      "filename": "file",
      "id": "file-BINZ0Wf02A1T9y3v5P3UQGbE",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1695959342,
  "validation_files": [
    {
      "bytes": 824,
      "c

In [None]:
%%time
# Stream events to monitor fine-tuning progress
def signal_handler(sig, frame):
    """Signal handler that fetches the fine-tune job's status and prints it.

    Args:
        sig: signal number (unused).
        frame: current stack frame (unused).
    """
    current_status = openai.FineTune.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {current_status}.")

print(f'Streaming events for the fine-tuning job: {job_id}')

# Install the custom SIGINT handler only while streaming. signal.signal returns
# the previous handler, which is restored afterwards so that interrupting the
# kernel keeps working in later cells.
previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)
try:
    events = openai.FineTune.stream_events(job_id)
    try:
        for event in events:
            print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
    except Exception as stream_error:
        print("Stream interrupted (client disconnected).")
        # Show the underlying error instead of hiding it
        print(f"Reason: {stream_error!r}")
finally:
    signal.signal(signal.SIGINT, previous_sigint_handler)

In [None]:
print('Checking other finetune jobs in the subscription.')
# Fetch the fine-tune job listing and report how many entries it holds
result = openai.FineTune.list()
job_count = len(result.data)
print(f'Found {job_count} finetune jobs.')

In [None]:
# Retrieve the finetuned model
# Read the model name from this job's record rather than binding the whole job
# listing to `fine_tuned_model`. The field is null while the job is unfinished.
fine_tuned_model = openai.FineTune.retrieve(job_id)["fine_tuned_model"]
print(fine_tuned_model)

In [None]:
# Step 3: Retrieve fine-tuned model ID
# The job's "id" (ft-...) is the fine-tune job id; the model name is stored in
# the job's "fine_tuned_model" field, so re-fetch the job and read that field.
fine_tuned_model_id = openai.FineTune.retrieve(job_id)["fine_tuned_model"]
print(f"Fine-tuned model ID: {fine_tuned_model_id}")

### Completion of fine-tuning

Before using the model, confirm that the fine-tuning job has completed successfully: its `status` should be `succeeded` and its `fine_tuned_model` field should no longer be `null`. If the job is still `pending` or `running`, or if it ended with an error, the model is not yet available for predictions.



# References

+ https://platform.openai.com/docs/guides/fine-tuning/use-a-fine-tuned-model
+ https://platform.openai.com/docs/guides/fine-tuning/estimate-costs