In [1]:
import sys
import os

# Switch the working directory to the parent of the current one, presumably
# so that project-relative imports and data paths resolve from the repo root.
# NOTE(review): this changes the cwd only (sys.path is not touched here), and
# '..' is resolved against the current cwd, so re-running this cell moves up
# one more directory each time.
os.chdir(os.path.abspath('..'))



# Data Preparation

In [3]:
from src.data_loader.tmlu_loader import TMLUDataLoader

# TMLU dataset
# Build the project loader for the "miulab/tmlu" dataset and load it.
# The displayed result is a DatasetDict with 'test' and 'dev' splits.
tmlu_loader = TMLUDataLoader("miulab/tmlu")
tmlu_dataset = tmlu_loader.load_dataset()

Dataset already exists at ./data/eval\miulab/tmlu. Loading from disk...


In [4]:
# Display the loaded splits with their column names and row counts.
tmlu_dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation', 'subject', 'user_content'],
        num_rows: 2796
    })
    dev: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation', 'subject', 'user_content'],
        num_rows: 185
    })
})

In [10]:
def format_dataset_as_messages(
    dataset_dict,
    split="test",
    system_content="You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
):
    """
    Formats each row of one split of the dataset into a chat-style list of messages.

    Every row becomes ``[system_message, user_message]`` where the user message
    carries the row's 'user_content' value.

    Args:
        dataset_dict (DatasetDict): Mapping from split name to an iterable of
            rows; each row must provide a 'user_content' entry.
        split (str): Name of the split to format. Defaults to 'test'.
        system_content (str): Text of the system message prepended to every
            conversation.
            NOTE(review): the default talks about translating corporate jargon
            and looks like a leftover from an example; pass a task-specific
            prompt here if that is not intended.

    Returns:
        list: One ``[system_message, user_message]`` list per row of the split,
        in row order. Each conversation gets its own system-message dict, so
        editing one conversation never affects another.

    Raises:
        KeyError: If ``split`` is not in ``dataset_dict`` or a row has no
            'user_content' entry.
    """
    return [
        [
            # Built fresh per row: sharing one dict across all conversations
            # would make an in-place edit of one leak into every other.
            {"role": "system", "content": system_content},
            {"role": "user", "content": example["user_content"]},
        ]
        for example in dataset_dict[split]
    ]

In [11]:
# Build one [system, user] message list per row of the 'test' split.
# Each row must already carry a 'user_content' column.
result = format_dataset_as_messages(tmlu_dataset)

In [13]:
# Inspect the first formatted conversation.
result[0]

[{'role': 'system',
  'content': 'You are a helpful, pattern-following assistant that translates corporate jargon into plain English.'},
 {'role': 'user',
  'content': '\n        以下選擇題為出自臺灣的考題，答案為其中一個選項。\n\n        問題:\n        閱讀下文，選出依序最適合填入□內的選項：\n甲、我母親和我姑姑一同出洋去，上船的那天她伏在竹床上痛哭，綠衣綠裙上面釘有□□發光的小片子。(張愛玲〈私語〉)\n乙、蝴蝶的本能是吮吸花蜜，女人的愛亦是一種本能：採集所有美好事物引誘自己進入想像，從自身記憶□□□□並且偷摘他人經驗之片段，想像繁殖成更豐饒的想像，織成一張華麗的密網。（簡媜〈母者〉）\n丙、母親是天可汗，當家的天可汗，一家之王，絕對的威權，分配空間與食物的主人。她要我報告的事，或她突如其來要我□□的事，我最好都要知道，所以我在覲見可汗時，不論她問不問我話，我的心中就是會先有腹稿。(鍾文音〈我的天可汗〉)\n\n        (A) 張揚／綢繆未雨／奏疏 (B) 抽搐／煮繭抽絲／奏疏 (C) 張揚／煮繭抽絲／進貢\t (D) 抽搐／綢繆未雨／進貢\t\n        正確答案：(\n        '}]

In [3]:
def create_user_prompt(example):
    """Render one row as a multiple-choice prompt and store it under 'user_content'.

    The prompt is a fixed header containing the row's 'question', then every
    non-empty option among A-F rendered as "(X) text" and joined by single
    spaces, then a fixed footer that ends on an opening parenthesis.

    Args:
        example: Dict-like row with a 'question' entry and optional 'A'..'F'
            entries. Options that are missing or falsy (None, '') are skipped.

    Returns:
        The same ``example`` object, modified in place with 'user_content' set.
    """
    # The leading four spaces and the newlines are part of the prompt text
    # itself and must be kept exactly as they are.
    header = (
        "\n"
        "    以下選擇題為出自臺灣的考題，答案為其中一個選項。\n"
        "\n"
        "    問題:\n"
        "    {question}\n"
        "\n"
        "    "
    )
    footer = "\n    正確答案：(\n    "

    # Collect "(X) text" for each option that is present and non-empty.
    rendered_choices = []
    for letter in ("A", "B", "C", "D", "E", "F"):
        if example.get(letter):
            rendered_choices.append(f"({letter}) {example[letter]}")

    example['user_content'] = (
        header.format(question=example["question"])
        + ' '.join(rendered_choices)
        + footer
    )
    return example


In [4]:
# Apply create_user_prompt to every row of each split using two worker
# processes; the result carries the added 'user_content' column.
dataset = tmlu_dataset.map(create_user_prompt, num_proc=2)

Map (num_proc=2): 100%|██████████| 2796/2796 [00:05<00:00, 547.68 examples/s] 
Map (num_proc=2): 100%|██████████| 185/185 [00:04<00:00, 41.71 examples/s]


In [7]:
# Show the generated prompt for the last row of the 'test' split.
dataset['test']['user_content'][-1]

'\n    以下選擇題為出自臺灣的考題，答案為其中一個選項。\n\n    問題:\n    依商業會計法及商業會計處理準則之規定，下列敘述何者錯誤？\n\n    (A) 會計憑證分為原始溤證及記帳憑證 (B) 經濟部對於記帳憑證之名稱及格式並無規定 (C) 原始憑證係用以證明會計事項之經過, 而為造具記帳憑證所根據之憑證 (D) 記帳憑證可由代表商業負責人授權經理人、主辦或經辦會計人員簽名或蓋章\n    正確答案：(\n    '

In [8]:
# Re-display the source dataset. The output below lists no 'user_content'
# column, i.e. the map() call returned a new object and left this one as is.
tmlu_dataset

DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation', 'subject'],
        num_rows: 2796
    })
    dev: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation', 'subject'],
        num_rows: 185
    })
})

In [5]:
from datasets import load_dataset, get_dataset_config_names
# Load a single TMLU subject config directly from the Hugging Face Hub.
dataset_name = "miulab/tmlu"
config = "AST_chinese"
# Relative cache location instead of an absolute, user-specific Windows path,
# so the cell runs on other machines and does not leak a local username.
# NOTE(review): assumes the working directory is the project root (the first
# cell of this notebook changes into the parent directory) — confirm.
cache_dir = "./data/eval"
load_dataset(dataset_name, config, cache_dir=cache_dir)

DatasetDict({
    test: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation'],
        num_rows: 126
    })
    dev: Dataset({
        features: ['id', 'question', 'A', 'B', 'C', 'D', 'E', 'F', 'answer', 'explanation', 'metadata', 'human_evaluation'],
        num_rows: 5
    })
})

## Token Count

In [1]:
import tiktoken

In [2]:
def num_tokens_from_messages(messages, model="gpt-4o-mini"):
    """Return the number of tokens used by a list of messages.

    Args:
        messages: Iterable of chat message dicts (e.g. with 'role', 'content'
            and optionally 'name'). Every non-None value is tokenized and
            counted; None values (such as an empty 'content') contribute nothing.
        model: Model name used to pick the tokenizer. Only the gpt-4o family
            names listed below are supported.

    Returns:
        int: Per-message overhead plus the encoded length of every value, plus
        one extra token for each 'name' field, plus 3 tokens for the reply
        priming.

    Raises:
        NotImplementedError: If ``model`` is not one of the supported names.
    """
    supported_models = {
        "gpt-4o-2024-08-06",
        "gpt-4o-mini",
        "gpt-4o-mini-2024-07-18",
    }
    # Reject unsupported models first, so the caller never sees a fallback
    # warning that is immediately followed by an exception.
    if model not in supported_models:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}."""
        )

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Every supported model belongs to the gpt-4o family, which tokenizes
        # with o200k_base; cl100k_base would give wrong counts for them.
        print("Warning: model not found. Using o200k_base encoding.")
        encoding = tiktoken.get_encoding("o200k_base")

    tokens_per_message = 3
    tokens_per_name = 1

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if value is None:
                # Nothing to encode (e.g. 'content' of a message without text).
                continue
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [3]:
from openai import OpenAI
from dotenv import load_dotenv
import os
# Load variables from a local '.env' file into the process environment; the
# return value is discarded.
_ = load_dotenv('.env')

# NOTE(review): if OPENAI_API_KEY is not set, the literal placeholder string is
# passed as the key — presumably the requests below then fail to authenticate.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "<your OpenAI API key if not set as env var>"))

# Small fixed conversation used to compare the local estimate with the count
# reported by the API.
example_messages = [
    {
        "role": "system",
        "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
    },
    {
        "role": "user",
        "content": "This late pivot means we don't have time to boil the ocean for the client deliverable.",
    },
]

# For each model, print the locally estimated prompt-token count followed by
# the prompt-token count the API reports for the same messages.
# NOTE: this loop sends one real chat-completions request per model.
for model in [
        "gpt-4o-2024-08-06",
        "gpt-4o-mini",
        "gpt-4o-mini-2024-07-18"
    ]:
    print(model)
    # example token count from the function defined above
    print(f"{num_tokens_from_messages(example_messages, model)} prompt tokens counted by num_tokens_from_messages().")
    # example token count from the OpenAI API
    # max_tokens=1 caps the completion at one token; only usage.prompt_tokens
    # is read from the response.
    response = client.chat.completions.create(model=model,
    messages=example_messages,
    temperature=0,
    max_tokens=1)
    print(f'{response.usage.prompt_tokens} prompt tokens counted by the OpenAI API.')
    print()

gpt-4o-2024-08-06
124 prompt tokens counted by num_tokens_from_messages().
124 prompt tokens counted by the OpenAI API.

gpt-4o-mini
124 prompt tokens counted by num_tokens_from_messages().
124 prompt tokens counted by the OpenAI API.

gpt-4o-mini-2024-07-18
124 prompt tokens counted by num_tokens_from_messages().
124 prompt tokens counted by the OpenAI API.

