In [1]:
import pandas as pd
import json
import openai
import os
openai.api_key = openai.api_key = os.getenv("OPENAI_API_KEY")
import time
import datetime
import signal
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error



# Convert the pandas DataFrame to JSONL format:


In [2]:
import pandas as pd
import json

# Read the dataset
dataset_path = "../data/ml-latest-small/merged_data.csv"
movie_data = pd.read_csv(dataset_path)

# get 1000 random samples
movie_data = movie_data.sample(n=1000, random_state=42)

# Split the dataset into training and validation sets (80-20% split)
train_data = movie_data.sample(frac=0.8, random_state=42)
validation_data = movie_data.drop(train_data.index)

# Convert the DataFrame to a list of dictionaries for training data
training_data = []
for index, row in train_data.iterrows():
    prompt = f"Title: {row['title']}, Genres: {row['genres']}, Tag: {row['tag']}"
    completion = str(row['rating'])
    training_data.append({"prompt": prompt, "completion": completion})

# Repeat for validation data
validation_data_list = []
for index, row in validation_data.iterrows():
    prompt = f"Title: {row['title']}, Genres: {row['genres']}, Tag: {row['tag']}"
    completion = str(row['rating'])
    validation_data_list.append({"prompt": prompt, "completion": completion})

# Save the data in JSONL format
training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"

def prepare_data(dictionary_data, final_file_name):
    with open(final_file_name, 'w') as outfile:
        for entry in dictionary_data:
            json.dump(entry, outfile)
            outfile.write('\n')

prepare_data(training_data, training_file_name)
prepare_data(validation_data_list, validation_file_name)


#  Fine-tuning

In [3]:
# Upload the datasets to OpenAI
def upload_data_to_OpenAI(file_name):
    with open(file_name, "rb") as f:
        response = openai.File.create(file=f, purpose='fine-tune')
        return response.id

training_file_id = upload_data_to_OpenAI(training_file_name)
validation_file_id = upload_data_to_OpenAI(validation_file_name)

# Adjusted fine-tuning parameters
create_args = {
    "training_file": training_file_id,
    "validation_file": validation_file_id,
    "model": "davinci",
    "n_epochs": 10,               # Reduced number of epochs
    "batch_size": 2,              # Reduced batch size
    "learning_rate_multiplier": 0.5  # Slightly increased learning rate
}

response = openai.FineTune.create(**create_args)
job_id = response["id"]
status = response["status"]

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")


Fine-tuning model with jobID: ft-fyRKwrxeGVX29T7T6Ixjs6Qf.
Training Response: {
  "created_at": 1694149135,
  "events": [
    {
      "created_at": 1694149135,
      "level": "info",
      "message": "Created fine-tune: ft-fyRKwrxeGVX29T7T6Ixjs6Qf",
      "object": "fine-tune-event"
    }
  ],
  "fine_tuned_model": null,
  "hyperparams": {
    "batch_size": 2,
    "learning_rate_multiplier": 0.5,
    "n_epochs": 10,
    "prompt_loss_weight": 0.01
  },
  "id": "ft-fyRKwrxeGVX29T7T6Ixjs6Qf",
  "model": "davinci",
  "object": "fine-tune",
  "organization_id": "org-ppCCXZWpTaByE4cI4jMbZjBx",
  "result_files": [],
  "status": "pending",
  "training_files": [
    {
      "bytes": 92558,
      "created_at": 1694149134,
      "filename": "file",
      "id": "file-AajWnzfucSfiP0pJzMYGq5Ic",
      "object": "file",
      "purpose": "fine-tune",
      "status": "uploaded",
      "status_details": null
    }
  ],
  "updated_at": 1694149135,
  "validation_files": [
    {
      "bytes": 22827,
     

In [4]:
# Stream events to monitor fine-tuning progress
def signal_handler(sig, frame):
    status = openai.FineTune.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
    for event in events:
        print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
except Exception:
    print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ft-fyRKwrxeGVX29T7T6Ixjs6Qf
2023-09-08 14:58:55 Created fine-tune: ft-fyRKwrxeGVX29T7T6Ixjs6Qf
2023-09-08 14:59:21 Fine-tune costs $6.67
2023-09-08 14:59:21 Fine-tune enqueued. Queue number: 0
2023-09-08 14:59:22 Fine-tune started


In [8]:
# Define the fine-tuned model ID
fine_tuned_model = "ft-fyRKwrxeGVX29T7T6Ixjs6Qf"

# Generate predictions using the fine-tuned model
def get_model_predictions(model, prompts):
    predictions = []
    for prompt in prompts:
        response = openai.Completion.create(model=model, prompt=prompt)
        predictions.append(float(response['choices'][0]['text'].strip()))
    return predictions

# Prepare prompts from validation_data for rating predictions
prompts = validation_data['title'].tolist()  # Assuming 'title' is the feature you want to use for prediction

# Get predictions
predicted_ratings = get_model_predictions(fine_tuned_model, prompts)

# True ratings from the validation set
true_ratings = validation_data['rating'].tolist()

# Calculate evaluation metrics
mae = mean_absolute_error(true_ratings, predicted_ratings)
mse = mean_squared_error(true_ratings, predicted_ratings)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


InvalidRequestError: The model `ft-fyRKwrxeGVX29T7T6Ixjs6Qf` does not exist

### Completion of Fine-tuning: 

Ensure that the fine-tuning process for the model has been completed successfully. It's possible that the model is still being fine-tuned or there was an issue during the fine-tuning process.

