# Chatting With Fine-Tuned Model

In [2]:
# userPrompt = input("User Prompt: ")

# stream = client.chat.completions.create(
#     model="ft:gpt-3.5-turbo-0125:personal::8yogN9zm",
#     messages=[
#         {"role": "system", "content": systemPrompt},
#         {"role": "user", "content": userPrompt}
#     ],
#     stream=True,
# )
# print("GPT Response:\n")
# for chunk in stream:
#     print(chunk.choices[0].delta.content or "", end="")

# Automate QnA

##### Load the .env file from the project root, regardless of the current working directory

In [3]:
from pathlib import Path
import os
import re
from dotenv import load_dotenv
from openai import OpenAI
import json
from collections import defaultdict


def find_project_root(current_directory, marker):
    """Return the nearest directory that contains ``marker``.

    The search starts at ``current_directory`` itself and then moves up
    through each of its parents, so it also works when the starting
    directory *is* the project root.

    Args:
        current_directory: Directory (str or Path) where the search begins.
        marker: Name of a file or directory expected in the project root
            (for example ``'.git'``).

    Returns:
        pathlib.Path: The first directory, walking upwards, containing ``marker``.

    Raises:
        FileNotFoundError: If neither the starting directory nor any of its
            parents contains ``marker``.
    """
    start = Path(current_directory).absolute()
    # Path.parents excludes the path itself, so check the start explicitly.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Project root with {marker} not found")

# Resolve the repository root (the directory holding `.git`) from wherever
# the notebook kernel was started.
current_directory = Path.cwd()
project_root = find_project_root(current_directory, '.git')

# Load the environment variables from the .env file
env_path = project_root / '.env'
load_dotenv(dotenv_path=env_path)

client = OpenAI() # uses Jinyue's GPT-4 model
# Never print the key itself: cell outputs are saved inside the notebook and
# travel with it. Only confirm that a key was picked up.
print("OpenAI API key loaded:", bool(client.api_key))

[REDACTED: API key removed from the saved notebook output; the exposed key should be revoked]


In [4]:
def load_single_line_string_from_file(file_path, delim=' '):
    """Read a UTF-8 text file and return its content as a single line.

    Args:
        file_path: Path of the file to read.
        delim: Text substituted for every newline (a single space by default).

    Returns:
        str: The file content with each newline replaced by ``delim``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    # Splitting on newlines and re-joining with `delim` swaps every newline.
    return delim.join(text.split('\n'))
    
# Prompt texts, each flattened to one line (newlines become spaces).
# System prompt used when answering a generated question.
system_prompt_expert = load_single_line_string_from_file(
    'immigration_expert_model_prompt.txt', delim=' '
)
# System prompt used when generating questions from a source document.
system_prompt_query = load_single_line_string_from_file(
    'immigration_query_model_prompt.txt', delim=' '
)
# System prompt stored in every training example.
system_prompt = load_single_line_string_from_file(
    'system_prompt.txt', delim=' '
)

In [5]:
def format_content_as_json(question, answer, filename):
    """Append one chat-format training example to ``JSONL/<filename>``.

    The example consists of the module-level ``system_prompt`` as the system
    message, ``question`` as the user message and ``answer`` as the assistant
    message. Each call writes exactly one line, so the file is valid JSONL
    (one JSON object per line).

    Args:
        question: Text stored as the user message.
        answer: Text stored as the assistant message.
        filename: Name of the file inside the ``JSONL`` directory to append to.
    """
    directory = "JSONL"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    data = {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer}
        ]
    }

    file_path = os.path.join(directory, filename)

    with open(file_path, 'a', encoding='utf-8') as f:
        # No `indent`: even indent=0 makes json.dump insert newlines between
        # elements, spreading one record over many lines and breaking JSONL.
        f.write(json.dumps(data, ensure_ascii=False))
        f.write('\n')

In [6]:
def qna(file_content):
    """Generate questions about ``file_content`` and an answer for each.

    One chat completion (system prompt ``system_prompt_query``) produces the
    questions; the reply is flattened to a single line and split on ``###``,
    discarding whatever precedes the first marker. Every resulting question is
    then sent in its own chat completion (system prompt
    ``system_prompt_expert``) and the flattened reply is kept as its answer.

    Args:
        file_content: Source text handed to the question-generating request.

    Returns:
        tuple: ``(questions, answers)`` — two lists of strings of equal length,
        where ``answers[i]`` is the reply to ``questions[i]``.
    """
    def _single_line_reply(system_content, user_content):
        # One chat round-trip; newlines in the reply are turned into spaces.
        completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content}
            ],
            model="gpt-4o"
        )
        return completion.choices[0].message.content.replace('\n', ' ')

    generated = _single_line_reply(system_prompt_query, file_content)
    # Drop the chunk before the first '###' marker.
    questions = generated.split('###')[1:]

    answers = [
        _single_line_reply(system_prompt_expert, question)
        for question in questions
    ]
    return questions, answers

In [7]:
# def explore_directory(directory):
#     i = 1
#     for dirpath, _, filenames in os.walk(directory):
#         for filename in filenames:
#             if i <= 186:
#                 i += 1
#                 continue
#             full_path = os.path.join(dirpath, filename)
#             file_content = load_single_line_string_from_file(full_path)
#             questions, answers = qna(file_content)
#             for question, answer in zip(questions, answers):
#                 format_content_as_json(question, answer, re.sub(r'\.txt$', '.jsonl', filename))
            
# explore_directory('data')

In [8]:
def format_error_check(data, filename):
    """Validate chat-format training examples and abort if any are malformed.

    Each example must be a dict with a non-empty ``"messages"`` list. Every
    message is checked for the ``role``/``content`` keys, for unknown keys,
    for a recognised role and for usable string content; each example must
    also contain at least one assistant message. Problems are counted per
    category.

    Args:
        data: Iterable of parsed examples (one per JSONL record).
        filename: Name shown in the error report.

    Returns:
        None when no problems are found.

    Raises:
        SystemExit: After printing the full report, if any problem was found.
    """
    # Format error checks
    format_errors = defaultdict(int)

    for ex in data:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant", "function"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            # Content must be a string; it may only be empty when the message
            # carries a function call instead.
            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(message.get("role", None) == "assistant" for message in messages):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        print(f"Found errors in {filename}:")
        for k, v in format_errors.items():
            print(f"{k}: {v}")
        # Abort only after every category has been reported. SystemExit is
        # raised directly rather than through the interactive `quit` helper.
        raise SystemExit(f"Format validation failed for {filename}")

In [9]:
def load_JSON(filename):
    """Parse a JSONL file, validate its examples and return them.

    Blank or whitespace-only lines are skipped rather than passed to the JSON
    parser. The parsed records are handed to ``format_error_check``.

    Args:
        filename: Path of the JSONL file (one JSON object per line).

    Returns:
        list: The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    format_error_check(data, filename)
    return data

In [10]:
# Rewrite every file under JSONL/ so that each JSON object sits on its own
# line, then validate it. Objects are located with a real JSON parser rather
# than by splitting on the text '}{', which would also split (and corrupt) a
# record whose string content happens to contain '}{'.
decoder = json.JSONDecoder()

for dirpath, _, filenames in os.walk('JSONL'):
    for filename in filenames:
        full_path = os.path.join(dirpath, filename)

        with open(full_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()

        # Decode the back-to-back JSON objects one after another.
        records = []
        position = 0
        while True:
            # raw_decode does not skip leading whitespace, so do it here.
            while position < len(raw_text) and raw_text[position].isspace():
                position += 1
            if position >= len(raw_text):
                break
            record, position = decoder.raw_decode(raw_text, position)
            records.append(json.dumps(record, ensure_ascii=False))

        # The file is only overwritten after all of it parsed successfully.
        with open(full_path, 'w', encoding='utf-8') as f:
            f.write(''.join(record + '\n' for record in records))

        load_JSON(full_path)

In [11]:
BATCH_SIZE = 20  # number of JSONL files merged into one training file
output_dir = 'Training-Files'
os.makedirs(output_dir, exist_ok=True)

# os.walk yields entries in no guaranteed order; sorting keeps the contents of
# each "<start>-<end>.jsonl" batch the same from run to run.
all_files = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk('JSONL')
    for filename in filenames
)

# Process files in batches of BATCH_SIZE
for i in range(0, len(all_files), BATCH_SIZE):
    batch_files = all_files[i:i + BATCH_SIZE]
    batch_records = []

    for file_path in batch_files:
        # Validation requires one JSON object per line, so after it passes the
        # records can be taken line by line. This avoids splitting on the text
        # '}{', which would break a record containing that sequence.
        load_JSON(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            # Skip blank lines so empty files add nothing to the batch.
            batch_records.extend(line.strip() for line in f if line.strip())

    start_index = i + 1
    end_index = i + len(batch_files)
    output_file_name = f"{output_dir}/{start_index}-{end_index}.jsonl"
    with open(output_file_name, 'w', encoding='utf-8') as f:
        f.write(''.join(record + '\n' for record in batch_records))

In [8]:
# Upload one training file and start a fine-tuning job on it.
model = "gpt-3.5-turbo-0125"

for dirpath, _, filenames in os.walk('Training-Files'):
    for filename in filenames:
        print(filename)
        full_path = os.path.join(dirpath, filename)

        # Context manager so the file handle is closed once the upload returns.
        with open(full_path, "rb") as training_file:
            FileObject = client.files.create(
                file=training_file,
                purpose="fine-tune"
            )

        file_id = FileObject.id
        print(f'File ID: {file_id}')
        print(f'Model: {model}')

        FineTuningJob = client.fine_tuning.jobs.create(
            training_file=file_id, 
            model=model
        )

        # NOTE(review): this stores the job's id in `model`. Presumably the
        # intent was to continue training from the resulting fine-tuned model;
        # confirm before removing the `break`s below, since a job id is passed
        # as `model` on the next iteration otherwise.
        model = FineTuningJob.id
        # Only the first file found is processed.
        break
    break

1-20.jsonl
File ID: file-t8yEXJfqyS5Y8QRHSdjZC8YX
Model: gpt-3.5-turbo-0125
