# Chatting With Fine-Tuned Model

In [2]:
# userPrompt = input("User Prompt: ")

# stream = client.chat.completions.create(
#     model="ft:gpt-3.5-turbo-0125:personal::8yogN9zm",
#     messages=[
#         {"role": "system", "content": systemPrompt},
#         {"role": "user", "content": userPrompt}
#     ],
#     stream=True,
# )
# print("GPT Response:\n")
# for chunk in stream:
#     print(chunk.choices[0].delta.content or "", end="")

# Automate QnA

##### Get .env file regardless of which directory you're in

In [2]:
from pathlib import Path
import os
import re
import random
from dotenv import load_dotenv
from openai import OpenAI
import json
from collections import defaultdict


def find_project_root(current_directory, marker):
    """Locate the nearest directory that contains ``marker``.

    The search starts at ``current_directory`` itself and then walks up
    through each of its ancestors, so it also succeeds when the notebook is
    executed from the project root.

    Parameters:
    - current_directory: Directory (str or Path) to start searching from.
    - marker: Name of a file or directory that identifies the root (e.g. '.git').

    Returns:
    - Path of the first directory, closest to the start, that contains ``marker``.

    Raises:
    - FileNotFoundError: if neither the start directory nor any ancestor contains ``marker``.
    """
    start = Path(current_directory).absolute()
    # Path.parents excludes the path itself, so the start directory is checked explicitly.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Project root with {marker} not found")

current_directory = Path.cwd()
project_root = find_project_root(current_directory, '.git')

# Load the environment variables from the .env file at the project root
env_path = project_root / '.env'
load_dotenv(dotenv_path=env_path)

# OpenAI() is constructed without arguments, so it relies on the credentials
# made available through the environment loaded above.
client = OpenAI()

# Never print the key itself: cell output is stored inside the notebook file
# and ends up in version control. Only report whether a key was found.
print("OpenAI API key loaded." if client.api_key else "OpenAI API key NOT found.")

[REDACTED: an API key was printed here and has been removed from the saved output. Revoke and rotate that key.]


In [2]:
def load_single_line_string_from_file(file_path, delim=' '):
    """Return the UTF-8 text of ``file_path`` with every newline replaced by ``delim``.

    Parameters:
    - file_path: Path of the text file to read.
    - delim: String substituted for each newline character (default is a single space).

    Returns:
    - The file's full content as one string without newline characters.
    """
    with open(file_path, encoding='utf-8') as handle:
        text = handle.read()
    # Splitting on '\n' and re-joining swaps each newline for the delimiter.
    return delim.join(text.split('\n'))
    
# Prompt templates, each flattened onto a single line (newlines become spaces).
# The paths are relative to the notebook's working directory.

# System message used when answering each generated question.
system_prompt_expert = load_single_line_string_from_file(
    file_path='immigration_expert_model_prompt.txt'
)
# System message used when generating questions from a source document.
system_prompt_query = load_single_line_string_from_file(
    file_path='immigration_query_model_prompt.txt'
)
# System message embedded in every fine-tuning training record.
system_prompt = load_single_line_string_from_file(
    file_path='system_prompt.txt'
)

In [5]:
def format_content_as_json(question, answer, filename, system_content=None, directory="JSONL"):
    """Append one chat-format training record to a JSON Lines file.

    Each call appends exactly one line containing a JSON object of the form
    ``{"messages": [system, user, assistant]}``. The record is serialised
    without indentation so that every record occupies a single line, which is
    what a ``.jsonl`` file requires.

    Parameters:
    - question: Text stored as the "user" message.
    - answer: Text stored as the "assistant" message.
    - filename: Name of the output file inside ``directory``.
    - system_content: Text stored as the "system" message. When None (the
      default) the module-level ``system_prompt`` is used.
    - directory: Output directory, created if missing (default is "JSONL").
    """
    if system_content is None:
        system_content = system_prompt

    os.makedirs(directory, exist_ok=True)

    record = {
        "messages": [
            {"role": "system", "content": system_content},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
    }

    file_path = os.path.join(directory, filename)

    # No `indent`: json.dumps then emits the record on one line (newlines inside
    # strings are escaped), keeping the file valid JSONL.
    with open(file_path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

In [6]:
def qna(file_content, model="gpt-4o"):
    """Generate questions about a document and an answer for each of them.

    A first chat-completion request (system message ``system_prompt_query``)
    produces the questions; the reply is split on '###' and everything before
    the first '###' is discarded. Each remaining non-blank question is then
    sent in its own request (system message ``system_prompt_expert``).

    Parameters:
    - file_content: Text of the source document, sent as the user message.
    - model: Chat model used for both steps (default is "gpt-4o").

    Returns:
    - (questions, answers): two lists of equal length; newlines in both are
      replaced by spaces and questions are stripped of surrounding whitespace.
    """
    question_response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt_query},
            {"role": "user", "content": file_content}
        ],
        model=model
    )
    # `content` may be None; treat that as "no questions".
    raw_questions = question_response.choices[0].message.content or ''
    segments = raw_questions.replace('\n', ' ').split('###')[1:]
    # Drop blank segments (e.g. from a trailing '###') so no request is spent on an empty question.
    questions = [segment.strip() for segment in segments if segment.strip()]

    answers = []
    for question in questions:
        answer_response = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt_expert},
                {"role": "user", "content": question}
            ],
            model=model
        )
        answer_text = answer_response.choices[0].message.content or ''
        answers.append(answer_text.replace('\n', ' '))

    return questions, answers

In [7]:
def explore_directory(directory, skip_first=186):
    """Generate Q&A training records for the files found under ``directory``.

    Walks ``directory`` recursively. For every file that is not skipped, the
    content is flattened to one line, passed to ``qna``, and each resulting
    question/answer pair is appended to a JSONL file named after the source
    file (a trailing '.txt' is replaced by '.jsonl').

    Parameters:
    - directory: Root directory to walk.
    - skip_first: Number of files, in ``os.walk`` order, to skip before
      processing starts. The default of 186 preserves the value that was
      previously hard-coded here; pass 0 to process every file.
      NOTE(review): presumably used to resume an interrupted run. Skipping by
      position relies on ``os.walk`` returning files in the same order.
    """
    files_seen = 0
    for dirpath, _, filenames in os.walk(directory):
        for filename in filenames:
            files_seen += 1
            if files_seen <= skip_first:
                continue
            full_path = os.path.join(dirpath, filename)
            file_content = load_single_line_string_from_file(full_path)
            questions, answers = qna(file_content)
            output_name = re.sub(r'\.txt$', '.jsonl', filename)
            for question, answer in zip(questions, answers):
                format_content_as_json(question, answer, output_name)
            
# Build the Q&A training records for the documents under 'data'.
# Every processed file triggers OpenAI chat-completion requests (see qna).
explore_directory('data')

In [27]:
def format_error_check(data, filename):
    """Validate chat-format fine-tuning examples and abort if any are malformed.

    Every example must be a dict with a non-empty "messages" list. Every
    message must be a dict with "role" and "content" keys, only recognised
    keys, a recognised role and string content. Every example needs at least
    one assistant message.

    Parameters:
    - data: Iterable of parsed examples (one per JSONL line).
    - filename: Name shown in the error report.

    Returns:
    - None when no problems are found.

    Raises:
    - SystemExit: after printing the complete per-category error counts, if
      at least one problem was found.
    """
    format_errors = defaultdict(int)

    for ex in data:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages or not isinstance(messages, (list, tuple)):
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            # A non-dict entry cannot be inspected further; count it instead of crashing on .get().
            if not isinstance(message, dict):
                format_errors["message_data_type"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(isinstance(message, dict) and message.get("role", None) == "assistant"
                   for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print(f"Found errors in {filename}:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
        # Abort only after ALL categories have been reported. SystemExit is what the
        # former quit() call raised in a plain interpreter.
        raise SystemExit(f"Aborting: format errors found in {filename}")

In [26]:
def load_JSON(filename):
    """Parse a JSON Lines file and validate its records.

    Parameters:
    - filename: Path of a UTF-8 file with one JSON object per line.

    Returns:
    - List of the parsed records, in file order. Earlier versions returned
      nothing, so existing callers that ignore the result are unaffected.

    Raises:
    - json.JSONDecodeError: if a non-blank line is not valid JSON.
    - Whatever ``format_error_check`` raises when the records are malformed.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of failing in json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    format_error_check(data, filename)
    return data

In [10]:
# Rewrite every file under 'JSONL' as strict JSON Lines (one object per line),
# then validate it. Files may contain JSON objects spread over several lines or
# written back to back; they are parsed with a real JSON decoder rather than by
# splitting on '}{', which would break any record whose text contains '}{'.
json_decoder = json.JSONDecoder()

for dirpath, _, filenames in os.walk('JSONL'):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)

        with open(full_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()

        # Decode consecutive JSON values, skipping the whitespace between them.
        # The file is only rewritten after all of it parsed successfully.
        records = []
        position = 0
        text_length = len(raw_text)
        while True:
            while position < text_length and raw_text[position].isspace():
                position += 1
            if position >= text_length:
                break
            record, position = json_decoder.raw_decode(raw_text, position)
            records.append(record)

        with open(full_path, 'w', encoding='utf-8') as f:
            for record in records:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

        load_JSON(full_path)

In [11]:
all_files = []
output_dir = 'Training-Files'
os.makedirs(output_dir, exist_ok=True)

for dirpath, _, filenames in os.walk('JSONL'):
    for filename in filenames:
        all_files.append(os.path.join(dirpath, filename))

# os.walk order depends on the file system; sort so that batch membership is
# the same on every run.
all_files.sort()

# Process files in batches of 20
batch_size = 20
for i in range(0, len(all_files), batch_size):
    batch_files = all_files[i:i + batch_size]
    combined_lines = []

    for file_path in batch_files:
        # Validate first: load_JSON parses the file line by line, so a file that
        # passes is already one JSON object per line and can be copied verbatim.
        load_JSON(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so an empty file cannot inject an empty record.
            combined_lines.extend(line.rstrip('\n') for line in f if line.strip())

    start_index = i + 1
    end_index = i + len(batch_files)
    output_file_name = f"{output_dir}/{start_index}-{end_index}.jsonl"
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in combined_lines)

In [3]:
# Collect the path of every batch file found anywhere under 'Training-Files'.
all_filepaths = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('Training-Files')
    for name in names
]

In [4]:
# Sanity check: number of batch files collected from 'Training-Files'.
print(len(all_filepaths))

22


In [31]:
class FineTuningManager:
    """Prepare train/validation files and launch OpenAI fine-tuning jobs.

    Every call to ``fine_tune_model`` gets a new run id. The id is persisted
    in ``id_file`` and used to name the concatenated training and validation
    files written to the 'Training Data Split' directory.
    """

    def __init__(self, client, id_file='id_counter.txt'):
        """Store the API client, restore the run counter and create the output directory.

        Parameters:
        - client: Object exposing ``files.create`` and ``fine_tuning.jobs.create``.
        - id_file: Path of the text file that persists the run counter.
        """
        self.client = client
        self.id_file = id_file
        self._id_counter = self._load_id()
        self.output_dir = 'Training Data Split'
        os.makedirs(self.output_dir, exist_ok=True)

    def _load_id(self):
        """Return the counter stored in ``id_file``, or 0 when the file does not exist."""
        if os.path.exists(self.id_file):
            with open(self.id_file, 'r') as file:
                return int(file.read().strip())
        return 0

    def _save_id(self):
        """Write the current counter to ``id_file``."""
        with open(self.id_file, 'w') as file:
            file.write(str(self._id_counter))

    def _increment_id(self):
        """Advance the run counter by one and persist it."""
        self._id_counter += 1
        self._save_id()

    def _get_output_path(self, base_name):
        """Return '<output_dir>/<base_name>-<run id>.jsonl'."""
        return os.path.join(self.output_dir, f"{base_name}-{self._id_counter}.jsonl")

    def _concatenate_files(self, filepaths, output_path):
        """Concatenate ``filepaths`` (bytes) into ``output_path``, one after another."""
        with open(output_path, 'wb') as outfile:
            for filepath in filepaths:
                with open(filepath, 'rb') as infile:
                    content = infile.read()
                outfile.write(content)
                # Without a trailing newline the last record of this file would be
                # fused with the first record of the next one.
                if content and not content.endswith(b'\n'):
                    outfile.write(b'\n')

    def _upload_file(self, path):
        """Upload ``path`` for fine-tuning and return the created file object."""
        # The context manager closes the handle once the upload call returns.
        with open(path, 'rb') as handle:
            return self.client.files.create(file=handle, purpose='fine-tune')

    def fine_tune_model(self, all_filepaths, split_ratio=(0.7, 0.3), hyperparams=None,
                        model="gpt-3.5-turbo-0125", seed=None):
        """
        Fine-tune a model with the given split ratio and hyperparameters.

        The files are shuffled, split by file count, concatenated into one
        training and one validation file, uploaded, and a fine-tuning job is
        created from them.

        Parameters:
        - all_filepaths: List of file paths to be used for training and validation.
          The list itself is not modified; a shuffled copy is used.
        - split_ratio: Tuple specifying the training and validation split ratio (default is (0.7, 0.3)).
          Only the first element is used: int(split_ratio[0] * number_of_files) files go to
          training and all remaining files go to validation.
        - hyperparams: Dictionary containing hyperparameters such as n_epochs, learning_rate_multiplier, and batch_size (default is None).
        - model: Name of the base model to fine-tune (default is "gpt-3.5-turbo-0125").
        - seed: Optional seed for the shuffle, making the split reproducible. When None
          (the default) the global ``random`` state is used.

        Returns:
        - The response from the fine-tuning job creation.

        Raises:
        - ValueError: if the split leaves no files for training.
        """
        # Shuffle a copy so the caller's list keeps its order between runs.
        shuffled_filepaths = list(all_filepaths)
        if seed is None:
            random.shuffle(shuffled_filepaths)
        else:
            random.Random(seed).shuffle(shuffled_filepaths)

        # Calculate the number of training and validation files
        num_training_files = int(split_ratio[0] * len(shuffled_filepaths))
        training_filepaths = shuffled_filepaths[:num_training_files]
        validation_filepaths = shuffled_filepaths[num_training_files:]

        # Fail before consuming a run id or uploading an empty training file.
        if not training_filepaths:
            raise ValueError(
                f"No training files selected from {len(shuffled_filepaths)} file(s) "
                f"with split_ratio={split_ratio}"
            )

        self._increment_id()

        # Concatenate and upload the training data
        training_full_path = self._get_output_path('concatenated_training_file')
        self._concatenate_files(training_filepaths, training_full_path)
        training_file_id = self._upload_file(training_full_path).id
        print(f'Training File ID: {training_file_id}')

        fine_tune_params = {
            'training_file': training_file_id,
            'model': model
        }

        # Concatenate and upload the validation data. When the split leaves no
        # validation files the job is created without a validation file rather
        # than with an empty upload.
        if validation_filepaths:
            validation_full_path = self._get_output_path('concatenated_validation_file')
            self._concatenate_files(validation_filepaths, validation_full_path)
            validation_file_id = self._upload_file(validation_full_path).id
            print(f'Validation File ID: {validation_file_id}')
            fine_tune_params['validation_file'] = validation_file_id

        if hyperparams:
            fine_tune_params['hyperparameters'] = hyperparams

        # Create the fine-tuning job
        fine_tuning_job = self.client.fine_tuning.jobs.create(**fine_tune_params)
        print(fine_tuning_job)

        return fine_tuning_job

In [32]:
# Initialize the FineTuningManager with the client.
# DO NOT CREATE A NEW OBJECT: reuse this single `manager` for all fine-tuning runs below.
# NOTE(review): the run counter is persisted in id_counter.txt, so the exact reason for this
# warning is not visible here; confirm with the author before re-running this cell.
manager = FineTuningManager(client)

In [33]:
# Run 1: 80% of the batch files go to training, the rest to validation;
# no hyperparameter overrides are passed.
finetuning_job_1 = manager.fine_tune_model(
    all_filepaths, 
    split_ratio=(0.8, 0.2)
)

Training File ID: file-1bSGl6f9QY0uIb3M861eE4Wl
Validation File ID: file-xV2ApQ8rv8wMJxjpMlu1PwD0
FineTuningJob(id='ftjob-gkJlfY0lqemFLHv2mNrhaAEQ', created_at=1715835759, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-6CNZXJUUW37cfOBO7YkjA6mu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-1bSGl6f9QY0uIb3M861eE4Wl', validation_file='file-xV2ApQ8rv8wMJxjpMlu1PwD0', user_provided_suffix=None, seed=1815113034, estimated_finish=None, integrations=[])


In [34]:
# Run 2: same 80/20 file split as run 1, with explicit hyperparameters
# (3 epochs, learning-rate multiplier 0.1, batch size 32).
finetuning_job_2 = manager.fine_tune_model(
    all_filepaths, 
    split_ratio=(0.8, 0.2), 
    hyperparams={
        "n_epochs": 3, 
        "learning_rate_multiplier": 0.1,
        "batch_size": 32
    }
)

Training File ID: file-ZkZzyoCIVEFAWsws3THRoeXO
Validation File ID: file-yqPZTustCYsI7TnoaRVXkAsB
FineTuningJob(id='ftjob-KWVqaHteLvw6bVPZF24IeGst', created_at=1715835778, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=32, learning_rate_multiplier=0.1), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-6CNZXJUUW37cfOBO7YkjA6mu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-ZkZzyoCIVEFAWsws3THRoeXO', validation_file='file-yqPZTustCYsI7TnoaRVXkAsB', user_provided_suffix=None, seed=1872424921, estimated_finish=None, integrations=[])


In [35]:
# Run 3: 70% of the batch files go to training, the rest to validation;
# no hyperparameter overrides are passed.
finetuning_job_3 = manager.fine_tune_model(
    all_filepaths, 
    split_ratio=(0.7, 0.3) 
)

Training File ID: file-1BTie63SwKdy9v59K9HsSQ7H
Validation File ID: file-TdKZrkg8PUyUpuQBeRdx65Hx
FineTuningJob(id='ftjob-wjUHCVImuJY3G0ITHjkoYrIq', created_at=1715835827, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-6CNZXJUUW37cfOBO7YkjA6mu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-1BTie63SwKdy9v59K9HsSQ7H', validation_file='file-TdKZrkg8PUyUpuQBeRdx65Hx', user_provided_suffix=None, seed=2033361132, estimated_finish=None, integrations=[])


In [None]:
# Run 4: same 70/30 file split as run 3, with the explicit hyperparameters used in run 2
# (3 epochs, learning-rate multiplier 0.1, batch size 32).
finetuning_job_4 = manager.fine_tune_model(
    all_filepaths, 
    split_ratio=(0.7, 0.3), 
    hyperparams={
        "n_epochs": 3, 
        "learning_rate_multiplier": 0.1,
        "batch_size": 32
    }
)