# Fine-tuning Conditional Generation

## Load Packages

In [None]:
# This notebook uses the legacy OpenAI SDK interface (openai.File,
# openai.FineTune, openai.Completion). Those entry points were removed in
# openai>=1.0, so the SDK is pinned below 1.0 here.
!pip install "openai<1"
!pip install langchain
!pip install PyPDF2
!pip install "openai<1" chromadb
!pip install tiktoken
!pip install python-pptx
# NOTE(review): pathlib is part of the Python 3 standard library, so this
# install is probably unnecessary - confirm before removing it.
!pip install pathlib
!pip install transformers

In [None]:
import csv
import json
import openai
import signal
import datetime
import time
import os
import pandas as pd
import subprocess
import langchain
import PyPDF2
from pptx import Presentation
from transformers import AutoTokenizer


## Prepare Data

In [3]:
# Move into the 'TuningGPT' folder that sits next to the current directory

current_directory = os.getcwd()
project_directory = os.path.join(current_directory, '../TuningGPT')
os.chdir(project_directory)

## Fine-tuning Conditional Generation

In [8]:
# Directly assign your API key if you prefer not to use a .txt file
default_api_key = "<your_api_key>"

# Or, specify the filename for the API key configuration
config_filename = "<api_key_file>.txt"

# Start from the default key and override it with the key stored in the
# config file when that file exists in the current directory.
api_key = default_api_key
if os.path.isfile(config_filename):
    with open(config_filename, 'r') as file:
        first_line = file.readline().strip()

    # The first line may be "NAME=<key>" or just the bare key. partition()
    # splits on the first '=' only, so a key that itself contains '=' is kept
    # whole, and a line without '=' no longer raises IndexError.
    _, separator, value = first_line.partition('=')
    file_key = value.strip() if separator else first_line

    # An empty file (or "NAME=" with nothing after it) keeps the default key.
    if file_key:
        api_key = file_key

openai.api_key = api_key

In [None]:
# Specify the .csv file with prompts and completions.
# prepare_data() below reads its 'prompt' and 'completion' columns.

training_data = 'training_data.csv'

def prepare_data(csv_file, jsonl_file):
    """Convert a prompt/completion CSV into a fine-tuning JSONL file.

    Every CSV row becomes one JSON object on its own output line:

    * ``prompt``: lower-cased, guaranteed to end with '?', followed by the
      '->' separator.
    * ``completion``: prefixed with a single space, guaranteed to end with
      '.', followed by a newline character.

    Whitespace around each cell is removed first, so a value such as
    'Why? ' is recognised as already ending with '?' instead of receiving a
    second punctuation mark.

    Args:
        csv_file: Path to a CSV file with 'prompt' and 'completion' columns.
            A UTF-8 byte-order mark at the start of the file is tolerated.
        jsonl_file: Path of the JSONL file to write; an existing file is
            overwritten.

    Raises:
        KeyError: If the CSV has no 'prompt' or 'completion' column.
    """
    entries = []

    # newline='' is what the csv module expects, so that line endings inside
    # quoted fields are handled by the reader itself.
    with open(csv_file, 'r', encoding='utf-8-sig', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            prompt = row['prompt'].strip().lower()
            if not prompt.endswith('?'):
                prompt += '?'  # Add question mark if missing
            prompt += '->'

            completion = ' ' + row['completion'].strip()
            if not completion.endswith('.'):
                completion += '.'  # Add period if missing
            completion += '\n'

            entries.append({'prompt': prompt, 'completion': completion})

    # json.dumps escapes non-ASCII characters by default; the explicit
    # encoding just keeps the output independent of the platform default.
    with open(jsonl_file, 'w', encoding='utf-8') as jsonlfile:
        for entry in entries:
            jsonlfile.write(json.dumps(entry) + '\n')


# Convert the CSV named above into 'training_data.jsonl' in the working directory
prepare_data(training_data, 'training_data.jsonl')

In [None]:
!openai tools fine_tunes.prepare_data -f "training_data.jsonl"

In [19]:
# Upload the prepared JSONL file and keep the id assigned to it; the id is
# passed to the fine-tuning job below. The `with` block closes the file
# handle once the upload call returns.
with open("training_data.jsonl", "rb") as training_file:
    training_file_id = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )["id"]

## Creating and Sending a Fine-tuning Job

In [None]:
# Settings for the fine-tuning job: the uploaded training file, the base
# model, and the training hyperparameters.
create_args = dict(
    training_file=training_file_id,
    model="davinci",
    n_epochs=15,
    batch_size=3,
    learning_rate_multiplier=0.3,
)

# Submit the job and keep its id and initial status for the cells below.
response = openai.FineTune.create(**create_args)
job_id, status = response["id"], response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

In [None]:
import signal
import datetime

def signal_handler(sig, frame):
    """Report the job's current status when the stream is interrupted.

    Takes the (signal number, stack frame) pair that ``signal.signal``
    passes to handlers; neither argument is used. Reads the module-level
    ``job_id`` and prints the status instead of raising, so an interrupt
    does not abort the cell.
    """
    job = openai.FineTune.retrieve(job_id)
    current_status = job.status
    print(f"Stream interrupted. Job is still {current_status}.")

print(f'Streaming events for the fine-tuning job: {job_id}')

# Install the custom SIGINT handler only for the duration of the stream and
# remember the handler it replaces. Leaving it installed would make every
# later interrupt (e.g. while polling in the next cell) print a status line
# instead of stopping the cell.
previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)

try:
    events = openai.FineTune.stream_events(job_id)
    try:
        for event in events:
            print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
    except Exception as exc:
        print("Stream interrupted (client disconnected).")
        # Show what actually went wrong rather than hiding it.
        print(f"Reason: {exc!r}")
finally:
    # signal.signal() returns None when the old handler was not installed
    # from Python; fall back to the standard KeyboardInterrupt handler then.
    if previous_sigint_handler is None:
        previous_sigint_handler = signal.default_int_handler
    signal.signal(signal.SIGINT, previous_sigint_handler)

In [None]:
# Check fine-tuning Status

import time

# Statuses after which the job will not change any more. "cancelled" is
# included so that a cancelled job does not leave this cell polling forever.
terminal_statuses = ("succeeded", "failed", "cancelled")

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in terminal_statuses:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in terminal_statuses:
        time.sleep(2)  # pause between status requests
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')

# Printed in both cases, so the final state is also reported after waiting.
print(f'Finetune job {job_id} finished with status: {status}')

# Optional: list the other fine-tune jobs in the subscription. Kept as
# comments (rather than a bare string literal) so the notebook does not echo
# the text as the cell's output.
# print('Checking other finetune jobs in the subscription.')
# result = openai.FineTune.list()
# print(f'Found {len(result.data)} finetune jobs.')

## Retrieve Trained Models

In [None]:
# Retrieve fine-tuning job information; as the last expression in the cell,
# the returned job record is shown as the cell's output.

openai.FineTune.retrieve(id=job_id)

In [None]:
# Or, fill in job_id manually if the session has expired
#job_id = '<fine_tuning_job_id>'

In [26]:
# Read the name of the resulting model from the job record.
# NOTE(review): presumably this is None until the job has succeeded - confirm
# the job status before using the value in the cells below.
fine_tuned_model = openai.FineTune.retrieve(id=job_id)["fine_tuned_model"]

In [None]:
# Alternatively, fill in the model manually if it was obtained from Postman
#fine_tuned_model = "<fine_tuned_model_id>"

## Sending a Prompt to a Selected Fine-tuned Model

In [None]:
# stop=[".\n"] makes the response end at the period + newline that closes
# every training completion; max_tokens caps the length of the answer.

new_prompt = "What is the name of the course?"

# Format the prompt the same way as the training prompts: lowercase, a
# trailing '?', then the '->' separator.
new_prompt = new_prompt.lower()
if not new_prompt.endswith('?'):
    new_prompt += '?'  # Add question mark if missing
new_prompt = new_prompt + '->'


answer = openai.Completion.create(
    model=fine_tuned_model,
    prompt=new_prompt,
    stop=[".\n"],
    best_of=10,
    max_tokens=1000
)


generated_text = answer['choices'][0]['text']

# Add the closing period only when it is missing. The earlier test joined the
# two endswith() checks with `or`, which is always true (a string cannot end
# with both '.' and '. '), so a period was appended even to text that already
# ended with one. This now matches the check used in process_dataframe().
if not generated_text.endswith(('.', '. ')):
    generated_text += '.'

print(generated_text)



In [None]:
def process_dataframe(file_path):
    """Ask the fine-tuned model every prompt listed in a CSV file.

    Each value of the 'prompt' column is lower-cased, given a trailing '?'
    when it lacks one, and suffixed with '->' before being sent to the
    module-level ``fine_tuned_model``. A period is appended to an answer
    that ends with neither '.' nor '. '.

    Args:
        file_path: Path to a CSV file that has a 'prompt' column.

    Returns:
        A list with one generated answer per row, in row order.
    """
    frame = pd.read_csv(file_path)
    collected = []

    for _, record in frame.iterrows():
        question = record["prompt"].lower()
        if not question.endswith('?'):
            question += '?'  # Add question mark if missing
        question += '->'

        completion = openai.Completion.create(
            model=fine_tuned_model,
            prompt=question,
            stop=[".\n"],
            best_of=10,
            max_tokens=100,
        )

        text = completion['choices'][0]['text']
        # One endswith() call with a tuple covers both accepted endings.
        if not text.endswith(('.', '. ')):
            text += '.'
        collected.append(text)

    return collected

# CSV file with the test prompts.
# NOTE(review): the path looks like a Google Drive mount inside Colab -
# adjust it when running the notebook elsewhere.
file_path = "/content/drive/MyDrive/Research/TuningGPT/test_SameQuestion.csv"
responses = process_dataframe(file_path)
responses  # last expression in the cell, so the answers are displayed

In [None]:
# Reload the prompts file so the answers can be stored next to the prompts
df = pd.read_csv(file_path)

# Make sure the number of items in the list matches the number of rows in the DataFrame
if len(responses) != len(df):
    print("Number of responses doesn't match the number of rows in the file.")
else:
    # Fill in the "response" column in one assignment. Replacing the whole
    # column avoids writing strings cell by cell into an existing empty
    # column, which pandas loads as float64 and which relies on silent
    # upcasting (deprecated in recent pandas versions).
    df["response"] = responses

    # Save the updated DataFrame back to the CSV file
    df.to_csv(file_path, index=False)

    print("Responses successfully filled in the file.")