In [1]:
import sys
import os

# Add the root directory to Python path
os.chdir(os.path.abspath('..'))

# Data preparation

In [2]:
from src.data_loader.taiwanchat_loader import TaiwanChatDataLoader
from src.data_processor.message_handler import (
    format_fine_tune_dataset_as_messages,
    check_openai_format_errors,
    num_tokens_from_messages,
    format_fine_tune_dataset_as_openai_input,
    format_fine_tune_dataset_as_openai_input_with_threshold,
    is_valid_openai_example
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
twchat_loader = TaiwanChatDataLoader("yentinglin/TaiwanChat", cache_dir='./data/fine_tuning')
twchat_dataset = twchat_loader.load_dataset()
filtered_dataset = twchat_dataset.filter(lambda example: example['dataset_name'] == 'sharegpt')

Dataset already exists at ./data/fine_tuning\yentinglin/TaiwanChat. Loading from disk...


In [5]:
openai_input = format_fine_tune_dataset_as_openai_input_with_threshold(filtered_dataset, 2)
check_openai_format_errors(openai_input)

No errors found


In [7]:
len(openai_input)

1593

In [11]:
token_list = [num_tokens_from_messages(message['messages']) for message in openai_input]
sum(token_list)

1999737

In [6]:
import json
from datetime import datetime
from typing import List, Dict

def export_to_jsonl(data: List[Dict], base_filename: str, output_dir: str = "."):
    """
    Exports data to a JSONL file with a timestamped suffix.
    
    Args:
        data (List[Dict]): The data to be exported, with each dictionary representing a line.
        base_filename (str): The base name for the JSONL file.
        output_dir (str): The directory where the JSONL file will be saved.
    """
    # Generate the current timestamp in the format YYYY-MM-DD_HH-MM-SS
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    
    # Create the full filename with the base name and timestamp
    filename = f"{base_filename}_{timestamp}.jsonl"
    filepath = f"{output_dir}/{filename}"
    
    # Write the data to a JSONL file
    with open(filepath, "w", encoding="utf-8") as f:
        for record in data:
            json_line = json.dumps(record, ensure_ascii=False)
            f.write(json_line + "\n")
    
    print(f"Data exported to {filepath}")

In [7]:
export_to_jsonl(openai_input, 'Taiwan_Chat_sharegpt_2M', './data/fine_tuning/openai')

Data exported to ./data/fine_tuning/openai/Taiwan_Chat_sharegpt_2M_2024-09-22_20-59-35.jsonl


# Upload a training file

In [9]:
from openai import OpenAI
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

client.files.create(
  file=open("./data/fine_tuning/openai/Taiwan_Chat_sharegpt_2M_2024-09-22_20-59-35.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-jS2NSDODgq49G2kTaSnIzOFT', bytes=6968989, created_at=1727053235, filename='Taiwan_Chat_sharegpt_2M_2024-09-22_20-59-35.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [10]:
from openai import OpenAI
client = OpenAI()

client.files.list()


SyncPage[FileObject](data=[FileObject(id='file-jS2NSDODgq49G2kTaSnIzOFT', bytes=6968989, created_at=1727053235, filename='Taiwan_Chat_sharegpt_2M_2024-09-22_20-59-35.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)], object='list', has_more=False, first_id='file-jS2NSDODgq49G2kTaSnIzOFT', last_id='file-jS2NSDODgq49G2kTaSnIzOFT')

# Create a fine-tuned model

In [13]:
from openai import OpenAI
client = OpenAI()

client.fine_tuning.jobs.create(
  training_file="file-jS2NSDODgq49G2kTaSnIzOFT", 
  model="gpt-4o-mini-2024-07-18",
  suffix="Taiwan_Chat_sharegpt_2M"
)

FineTuningJob(id='ftjob-SiimdLuhiSnT8f5EUmsOWnPh', created_at=1727053499, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-JjIWWudV9j3i9nwPjwcSwS2n', result_files=[], seed=588826881, status='validating_files', trained_tokens=None, training_file='file-jS2NSDODgq49G2kTaSnIzOFT', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='Taiwan_Chat_sharegpt_2M')

In [14]:
from openai import OpenAI
client = OpenAI()

# List 10 fine-tuning jobs
client.fine_tuning.jobs.list(limit=10)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-SiimdLuhiSnT8f5EUmsOWnPh', created_at=1727053499, error=Error(code='invalid_training_file', message='The job failed due to an invalid training file. Invalid file format. Example 52 The last message must be from the assistant', param='training_file'), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-JjIWWudV9j3i9nwPjwcSwS2n', result_files=[], seed=588826881, status='failed', trained_tokens=None, training_file='file-jS2NSDODgq49G2kTaSnIzOFT', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix='Taiwan_Chat_sharegpt_2M')], object='list', has_more=False)