# Fine-tuning Conditional Generation

## Load Packages

In [None]:
# This notebook uses the legacy fine-tuning interface (openai.File.create,
# openai.FineTune, openai.Completion), which is only available in openai < 1.0.
!pip install "openai<1"
!pip install langchain
!pip install PyPDF2
!pip install chromadb
!pip install tiktoken
!pip install python-pptx
# pathlib ships with the Python standard library, so it is not installed from PyPI.
!pip install transformers

In [None]:
import csv
import datetime
import json
import os
import re
import signal
import subprocess
import time

import langchain
import openai
import pandas as pd
import PyPDF2
from pptx import Presentation
from transformers import AutoTokenizer


## Prepare Data

In [None]:
# Set the working directory
# The relative paths used by later cells ("../Material", "training_data.csv")
# are resolved against the directory selected here.
# NOTE: the target is relative to the *current* directory, so running this
# cell a second time moves the working directory again.

current_directory = os.getcwd()
os.chdir(os.path.join(current_directory, '../TuningGPT'))

In [5]:
# Convert everything from a folder to .txt

def pdf_to_txt(pdf_file_path, txt_file_path):
    """Dump the extracted text of every page of a PDF into a UTF-8 text file.

    Pages are written back to back, in document order, with no separator
    between them. Any exception is caught and reported with ``print``;
    nothing is raised and nothing is returned.

    Args:
        pdf_file_path: Path of the PDF to read.
        txt_file_path: Path of the text file to create or overwrite.
    """
    try:
        with open(pdf_file_path, 'rb') as source:
            reader = PyPDF2.PdfReader(source)
            page_total = len(reader.pages)

            # The output file is only opened once the page count is known.
            with open(txt_file_path, 'w', encoding='utf-8') as sink:
                sink.writelines(
                    reader.pages[index].extract_text()
                    for index in range(page_total)
                )

        print(f"Successfully converted '{pdf_file_path}' to '{txt_file_path}'.")
    except Exception as e:
        print(f"Error occurred while converting '{pdf_file_path}': {e}")

def pptx_to_txt(pptx_file_path, txt_file_path):
    """Collect the text of every shape in a PowerPoint deck into a text file.

    Shape texts are gathered slide by slide and written to a UTF-8 file,
    one shape per line. Any exception is caught and reported with ``print``;
    nothing is raised and nothing is returned.

    Args:
        pptx_file_path: Path of the .pptx presentation to read.
        txt_file_path: Path of the text file to create or overwrite.
    """
    try:
        deck = Presentation(pptx_file_path)
        # Only shapes that expose a ``text`` attribute contribute a line.
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]

        with open(txt_file_path, 'w', encoding='utf-8') as sink:
            sink.write('\n'.join(shape_texts))

        print(f"Successfully converted '{pptx_file_path}' to '{txt_file_path}'.")
    except Exception as e:
        print(f"Error occurred while converting '{pptx_file_path}': {e}")

def convert_non_txt_to_txt(folder_path):
    """Convert every PDF and PPTX file directly inside a folder to a .txt file.

    Each converted file is written next to its source with the same base
    name and a ``.txt`` extension. Extensions are matched case-insensitively,
    so ``Slides.PPTX`` and ``NOTES.TXT`` are recognised. Existing text files
    and sub-directories are skipped; any other file is reported as
    unsupported.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
    """
    for filename in os.listdir(folder_path):
        old_file_path = os.path.join(folder_path, filename)
        if os.path.isdir(old_file_path):
            continue  # A directory is not a document to convert

        lowered = filename.lower()
        if lowered.endswith(".txt"):
            continue  # Skip txt files

        new_file_path = os.path.join(folder_path, os.path.splitext(filename)[0] + ".txt")

        if lowered.endswith(".pdf"):
            pdf_to_txt(old_file_path, new_file_path)
        elif lowered.endswith(".pptx"):
            pptx_to_txt(old_file_path, new_file_path)
        else:
            print(f"Unsupported file format: '{filename}'")

# Convert the documents in the material folder; the path is relative to the
# current working directory.
folder_path = "../Material"
convert_non_txt_to_txt(folder_path)


In [None]:
# Remove empty lines from all .txt files

from pathlib import Path

def remove_empty_lines(file_path):
    """Rewrite a text file in place without blank lines.

    Every line is stripped of leading and trailing whitespace, and lines
    that are empty after stripping are dropped. The remaining lines are
    joined with ``\\n``; no trailing newline is written.

    The file is read and written as UTF-8, matching the encoding the
    conversion functions use when they create the .txt files.

    Args:
        file_path: Path of the text file to clean (str or ``pathlib.Path``).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Filter out empty lines
        non_empty_lines = [line for line in stripped_lines if line]

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write('\n'.join(non_empty_lines))

def remove_empty_lines_from_files(directory_path):
    """Strip blank lines from every ``*.txt`` file directly inside a directory.

    Args:
        directory_path: Directory to scan (not recursive).
    """
    for txt_file in Path(directory_path).glob("*.txt"):
        remove_empty_lines(txt_file)

# Clean every .txt file in the material folder when this code runs as the
# main module.
if __name__ == "__main__":
    directory_path = "../Material"
    remove_empty_lines_from_files(directory_path)


In [None]:
# Prepare the training data (leaving prompt blank and just filling in completion with around 1,000 tokens. Token definition can be found here: https://platform.openai.com/tokenizer)
from transformers import GPT2Tokenizer

# GPT-2 tokenizer shared by all count_tokens() calls; created on first use.
_GPT2_TOKENIZER = None


def count_tokens(text):
    """Return the number of GPT-2 tokens in ``text``.

    The tokenizer is loaded once and reused: this function is called for
    every sentence while chunking, and reloading the tokenizer on each call
    repeats the same loading work for no benefit.

    Args:
        text: String to tokenize.

    Returns:
        The length of the token sequence, excluding special tokens.
    """
    global _GPT2_TOKENIZER
    if _GPT2_TOKENIZER is None:
        # Load the GPT-2 tokenizer
        _GPT2_TOKENIZER = GPT2Tokenizer.from_pretrained("gpt2")
    # Tokenize the text and return the token count
    return len(_GPT2_TOKENIZER.encode(text, add_special_tokens=False))

def chunk_text(text, max_tokens, token_counter=None):
    """Group the sentences of ``text`` into chunks of at most ``max_tokens``.

    The text is split at whitespace that follows a ``.`` or ``?`` (with
    look-behinds meant to avoid splitting inside abbreviations). Sentences
    are then packed greedily: a sentence joins the current chunk while the
    running token total stays within ``max_tokens``; otherwise the current
    chunk is closed and the sentence starts a new one.

    Sentences inside a chunk are joined with a single space, because the
    split consumes the whitespace that separated them. A single sentence
    longer than ``max_tokens`` is not split further and becomes an oversized
    chunk of its own. Empty chunks are never produced.

    Args:
        text: Text to split.
        max_tokens: Token budget per chunk.
        token_counter: Optional callable mapping a string to its token
            count. Defaults to ``count_tokens``.

    Returns:
        A list of ``(chunk_text, token_count)`` tuples, where ``token_count``
        is the sum of the per-sentence counts in that chunk.
    """
    counter = count_tokens if token_counter is None else token_counter

    chunks = []
    current_sentences = []
    current_token_count = 0

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for sentence in sentences:
        if not sentence:
            continue  # Consecutive separators produce empty pieces

        sentence_tokens = counter(sentence)
        # Close the running chunk only if it holds something; this avoids an
        # empty chunk when the very first sentence is already over budget.
        if current_sentences and current_token_count + sentence_tokens > max_tokens:
            chunks.append((' '.join(current_sentences), current_token_count))
            current_sentences = []
            current_token_count = 0

        current_sentences.append(sentence)
        current_token_count += sentence_tokens

    if current_sentences:
        chunks.append((' '.join(current_sentences), current_token_count))

    return chunks

def read_txt_files_and_create_csv(txt_folder_path, csv_file_path, max_tokens):
    """Build the training CSV from every ``.txt`` file in a folder.

    The CSV starts with the header ``prompt, completion, token_count``. Each
    text file is chunked with ``chunk_text`` and every chunk becomes one row
    with an empty prompt, the chunk as completion, and its token count.

    Args:
        txt_folder_path: Folder containing the source .txt files.
        csv_file_path: CSV file to create or overwrite.
        max_tokens: Token budget per chunk, forwarded to ``chunk_text``.
    """
    with open(csv_file_path, 'w', encoding='utf-8', newline='') as sink:
        writer = csv.writer(sink)
        writer.writerow(['prompt', 'completion', 'token_count'])

        for entry_name in os.listdir(txt_folder_path):
            if not entry_name.endswith(".txt"):
                continue

            source_path = os.path.join(txt_folder_path, entry_name)
            with open(source_path, 'r', encoding='utf-8') as source:
                pieces = chunk_text(source.read(), max_tokens)
                # The prompt column is intentionally left blank.
                writer.writerows(['', piece, tokens] for piece, tokens in pieces)

# Source folder and output file; both are relative to the current working directory.
txt_folder_path = "../Material"
csv_file_path = "training_data.csv"
# Token budget for each CSV row (one chunk per row).
max_tokens_per_row = 999

read_txt_files_and_create_csv(txt_folder_path, csv_file_path, max_tokens_per_row)

In [None]:
# Remove the 'token_count' column
# The CSV is rewritten in place: the input and output paths name the same
# file, so only the 'prompt' and 'completion' columns remain afterwards.
csv_file_path = "training_data.csv"
df = pd.read_csv(csv_file_path)
df = df.drop(columns=['token_count'])

output_file_path = "training_data.csv"
df.to_csv(output_file_path, index=False)

In [None]:
"""
# If you do not want to inspect token count, you may:

from transformers import GPT2Tokenizer


def count_tokens(text):
    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Tokenize the text and return the token count
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return len(tokens)

def chunk_text(text, max_tokens):
    chunks = []
    current_chunk = ''
    current_token_count = 0

    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)
        if current_token_count + sentence_tokens <= max_tokens:
            current_chunk += sentence
            current_token_count += sentence_tokens
        else:
            chunks.append((current_chunk, current_token_count))
            current_chunk = sentence
            current_token_count = sentence_tokens

    if current_chunk:
        chunks.append((current_chunk, current_token_count))

    return chunks

def read_txt_files_and_create_csv(txt_folder_path, csv_file_path, max_tokens):
    with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['prompt', 'completion'])

        for filename in os.listdir(txt_folder_path):
            if filename.endswith(".txt"):
                txt_file_path = os.path.join(txt_folder_path, filename)
                with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
                    text_content = txt_file.read()
                    chunks = chunk_text(text_content, max_tokens)
                    for chunk, _ in chunks:  # We don't need the token_count here.
                        csv_writer.writerow(['', chunk])

txt_folder_path = "../Material"
csv_file_path = "training_data.csv"
max_tokens_per_row = 999

read_txt_files_and_create_csv(txt_folder_path, csv_file_path, max_tokens_per_row)
"""

## Fine-tuning Conditional Generation

In [8]:
# Directly assign your API key if you prefer not to use a .txt file
default_api_key = "<your_api_key>"

# Or, specify the filename for the API key configuration
config_filename = "<api_key_file>.txt"

# Check if the <api_key_file>.txt file exists in the current directory
if os.path.isfile(config_filename):
    with open(config_filename, 'r') as file:
        # The first line may be "NAME=key", "NAME = key" or just the bare key.
        # Splitting once and taking the last piece handles all three, keeps
        # any '=' inside the key, and the final strip() removes whitespace
        # left around the separator.
        api_key = file.readline().split('=', 1)[-1].strip()
else:
    # Use the default API key if the file doesn't exist
    api_key = default_api_key

openai.api_key = api_key

In [12]:
# Prepare data as block of content

# CSV with 'prompt' and 'completion' columns; passed to prepare_data() below.
training_data = 'training_data.csv'

def prepare_data(csv_file, jsonl_file):
    """Convert a prompt/completion CSV into a JSONL fine-tuning file.

    For every CSV row:

    * the prompt gets a trailing ``?`` if it lacks one, followed by the
      ``->`` separator (a blank prompt therefore becomes ``?->``);
    * the completion gets a leading space, a trailing ``.`` unless it
      already ends with ``.``, ``?`` or ``!``, and a final newline.

    Each resulting entry is printed and written as one JSON object per line.

    Args:
        csv_file: Input CSV with ``prompt`` and ``completion`` columns.
        jsonl_file: Output JSONL file to create or overwrite.

    Raises:
        KeyError: If the CSV lacks a ``prompt`` or ``completion`` column.
    """
    entries = []

    with open(csv_file, 'r', encoding='utf-8-sig') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader yields None for fields missing from a short row.
            prompt = row['prompt'] or ''
            if not prompt.endswith('?'):
                prompt += '?'  # Add question mark if missing
            prompt += '->'

            completion = ' ' + (row['completion'] or '')
            # Do not add a period after text that already ends a sentence,
            # e.g. "Is it so?" must not become "Is it so?.".
            if not completion.endswith(('.', '?', '!')):
                completion += '.'  # Add period if missing
            completion += '\n'

            entries.append({'prompt': prompt, 'completion': completion})

    with open(jsonl_file, 'w', encoding='utf-8') as jsonlfile:
        for entry in entries:
            print (entry)
            json.dump(entry, jsonlfile)
            jsonlfile.write('\n')


# Write the JSONL file that the following cells check and upload.
prepare_data(training_data, 'training_data.jsonl')


In [None]:
!openai tools fine_tunes.prepare_data -f "training_data.jsonl"

In [19]:
# Upload the training file and keep the id the fine-tune job will reference.
# The context manager closes the local file handle once the upload call
# returns, instead of leaving it open for the rest of the session.
with open("training_data.jsonl", "rb") as training_file:
    training_file_id = openai.File.create(
        file=training_file,
        purpose='fine-tune'
    )["id"]

## Creating and Sending a Fine-tuning Job

In [None]:
# Arguments for the fine-tune request: the uploaded training file, the base
# model, and the training hyperparameters.
create_args = dict(
    training_file=training_file_id,
    model="davinci",
    n_epochs=15,
    batch_size=3,
    learning_rate_multiplier=0.3,
)

# Submit the job and remember its id and initial status for the cells below.
response = openai.FineTune.create(**create_args)
job_id, status = response["id"], response["status"]

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

In [None]:
import signal
import datetime

def signal_handler(sig, frame):
    """Handle SIGINT by printing the job's current status.

    Args:
        sig: Signal number delivered to the handler.
        frame: Stack frame that was executing when the signal arrived.
    """
    status = openai.FineTune.retrieve(job_id).status
    print(f"Stream interrupted. Job is still {status}.")
    return

print(f'Streaming events for the fine-tuning job: {job_id}')
# signal.signal() returns the handler it replaces. Keep it so the original
# interrupt behaviour can be restored once streaming ends; otherwise every
# later cell would keep routing interrupts to signal_handler.
previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)

events = openai.FineTune.stream_events(job_id)
try:
    for event in events:
        print(f'{datetime.datetime.fromtimestamp(event["created_at"])} {event["message"]}')
except Exception as exc:
    print("Stream interrupted (client disconnected).")
    # Show the underlying error so failures other than a disconnect are visible.
    print(f"Reason: {exc!r}")
finally:
    signal.signal(signal.SIGINT, previous_sigint_handler)

## Retrieve Trained Models

In [None]:
# Check fine-tuning Status

import time

# Statuses after which the job will not change any more. "cancelled" is
# included so that a cancelled job does not keep this cell polling forever.
terminal_statuses = ("succeeded", "failed", "cancelled")

status = openai.FineTune.retrieve(id=job_id)["status"]
if status not in terminal_statuses:
    print(f'Job not in terminal status: {status}. Waiting.')
    while status not in terminal_statuses:
        time.sleep(2)
        status = openai.FineTune.retrieve(id=job_id)["status"]
        print(f'Status: {status}')

# Report the outcome whether or not the cell had to wait for it.
print(f'Finetune job {job_id} finished with status: {status}')

"""
print('Checking other finetune jobs in the subscription.')
result = openai.FineTune.list()
print(f'Found {len(result.data)} finetune jobs.')
"""

In [None]:
# Retrieve fine-tuning job information
# The bare expression makes the notebook display the full job record.
 
openai.FineTune.retrieve(id=job_id)

In [None]:
#Fill in job_id manually if session expires
#job_id = 'ft-LTqFIfBcnJPH7QWQ6AbCnAhb'

In [26]:
# Model name stored on the job record; used as `model` in the completion calls below.
fine_tuned_model = openai.FineTune.retrieve(id=job_id)["fine_tuned_model"]

In [None]:
#Alternatively, fill in model manually if it is obtained from Postman
#fine_tuned_model = "<fine_tunned_model_id>"

## Sending a Prompt to a Selected Fine-tuned Model

In [None]:
# Ask the fine-tuned model two sample questions and print each answer.
# NOTE(review): prepare_data() ends every training prompt with '->', but these
# prompts are sent without that separator -- confirm this is intended.
for new_prompt in (
    "What is the name of the course?",
    "What are the grading criteria?",
):
    answer = openai.Completion.create(
        model=fine_tuned_model,
        prompt=new_prompt,
    )

    print(answer['choices'][0]['text'])

In [None]:
def process_dataframe(source, file_path):
    """Send every prompt in a CSV to the fine-tuned model and collect the replies.

    Args:
        source: Not read by this function.
        file_path: CSV file with a ``prompt`` column.

    Returns:
        A list with the text of the first completion choice for each row,
        in row order.
    """
    questions = pd.read_csv(file_path)
    return [
        openai.Completion.create(
            model=fine_tuned_model,
            prompt=row["prompt"],
        )['choices'][0]['text']
        for _, row in questions.iterrows()
    ]

# `source` is handed to process_dataframe, which does not read it.
source = "../Material/merge.txt"
# CSV whose "prompt" column holds the prompts to send to the model.
file_path = "Test_NaturalQuestion.csv"
responses = process_dataframe(source, file_path)
# The bare expression makes the notebook display the collected responses.
responses

In [None]:
df = pd.read_csv(file_path)

# Make sure the number of items in the list matches the number of rows in the DataFrame
if len(responses) != len(df):
    print("Number of responses doesn't match the number of rows in the file.")
else:
    # Fill in the "response" column in one assignment. Replacing the whole
    # column avoids writing strings cell by cell into a column that pandas
    # read as numeric (an empty "response" column loads as float64 NaN),
    # which triggers pandas' incompatible-dtype deprecation.
    df["response"] = responses

    # Save the updated DataFrame back to the CSV file
    df.to_csv(file_path, index=False)

    print("Responses successfully filled in the file.")