In [2]:
import pandas as pd
from ast import literal_eval
from transformers import AutoTokenizer

In [3]:
# Change the checkpoint id here to switch to a different model's tokenizer.
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [4]:
# User-message templates, filled with str.format().
# Despite the names, the difference between the two is whether the optional
# supplementary passage ({question_plus}, rendered under "<보기>") is included;
# both always contain {paragraph}, {question} and {choices}.

# Template for problems without a supplementary passage.
prompt_no_question = (
    "지문:\n {paragraph}\n\n"
    " 질문:\n {question}\n\n"
    " 선택지:\n {choices}\n\n"
    " 1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.\n"
    " 정답:"
)

# Template for problems that come with a supplementary passage.
prompt_with_question = (
    "지문:\n {paragraph}\n\n"
    " 질문:\n {question}\n\n"
    " <보기>:\n {question_plus}\n\n"
    " 선택지:\n {choices}\n\n"
    " 1, 2, 3, 4, 5 중에 하나를 정답으로 고르세요.\n"
    " 정답:"
)

In [5]:
def parse_problems(df):
    """Flatten the stringified ``problems`` column into one column per field.

    Each row's ``problems`` cell is parsed with ``ast.literal_eval`` and must
    yield a mapping with ``question`` and ``choices`` keys; ``answer`` and
    ``question_plus`` are optional and default to ``None`` when absent.

    Args:
        df: DataFrame with ``id``, ``paragraph`` and ``problems`` columns.

    Returns:
        A new DataFrame with the columns ``id``, ``paragraph``, ``question``,
        ``choices``, ``answer`` and ``question_plus`` (in that order), one row
        per input row. The columns are present even when ``df`` has no rows.

    Raises:
        KeyError: if a parsed ``problems`` mapping lacks ``question`` or
            ``choices``, or ``df`` lacks one of the required columns.
        ValueError, SyntaxError: propagated from ``literal_eval`` when a
            ``problems`` cell is not a valid Python literal.
    """
    columns = ["id", "paragraph", "question", "choices", "answer", "question_plus"]

    records = []
    for _, row in df.iterrows():
        problems = literal_eval(row["problems"])
        records.append(
            {
                "id": row["id"],
                "paragraph": row["paragraph"],
                "question": problems["question"],
                "choices": problems["choices"],
                "answer": problems.get("answer", None),
                "question_plus": problems.get("question_plus", None),
            }
        )

    # Passing ``columns`` explicitly keeps the schema stable for an empty input;
    # without it an empty ``records`` list produces a frame with no columns and
    # any later column access fails.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Jinja chat template passed to tokenizer.apply_chat_template() below.
#
# What the template text does:
#   * if the first message has role 'system', its content is captured as
#     `system_message` and emitted once, before the per-message loop;
#   * each 'user' message is emitted as
#     '<start_of_turn>user\n' + content + '<end_of_turn>\n<start_of_turn>model\n';
#   * each 'assistant' message is emitted as content + '<end_of_turn>\n';
#   * messages with any other role produce no output inside the loop.
#
# NOTE(review): the <start_of_turn>/<end_of_turn> markers and the 'model' turn
# name look like Gemma-style delimiters, while the tokenizer loaded in this
# notebook is "Qwen/Qwen2.5-7B-Instruct". Confirm this template is the one
# actually used with that model; otherwise the measured lengths may not match
# what the model sees.
# NOTE(review): the newlines and indentation between the {% ... %} tags are part
# of the template text; whether they reach the rendered output depends on the
# Jinja environment settings, which are not visible here.
chat_template = """
{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}
{% if system_message is defined %}{{ system_message }}{% endif %}
{% for message in messages %}
    {% set content = message['content'] %}
    {% if message['role'] == 'user' %}{{ '<start_of_turn>user\n' + content + '<end_of_turn>\n<start_of_turn>model\n' }}
    {% elif message['role'] == 'assistant' %}{{ content + '<end_of_turn>\n' }}
    {% endif %}
{% endfor %}
"""

In [None]:
# Adjust these file paths to match your environment.
# input_path is the CSV read with pd.read_csv(); output_path is where the
# processed frame (including the token-length column) is written.
input_path = "./data/train.csv"
output_path = "./data/processed_train.csv"

In [7]:
def calculate_input_ids_length_with_chat_template(row):
    """Return the token count of one problem rendered through the chat template.

    The row is turned into a system / user / assistant conversation, rendered
    to text with the module-level ``chat_template`` and tokenized with the
    module-level ``tokenizer`` (no truncation, no padding).

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) providing
            ``paragraph``, ``question``, ``choices`` (iterable of choice
            texts), ``answer`` and ``question_plus``.

    Returns:
        int: ``len(input_ids)`` of the tokenized conversation text.

    Notes:
        * ``answer`` is converted with ``str()``, so a missing answer
          (``None``) is counted as the literal text ``"None"``.
        * A missing ``question_plus`` may arrive as ``None``, an empty string
          or NaN; all three select the prompt without the supplementary
          passage.
    """
    # Choices are numbered from 1: "1 - ...", "2 - ...", one per line.
    choices_string = "\n".join(
        f"{number} - {choice}" for number, choice in enumerate(row["choices"], start=1)
    )

    question_plus = row["question_plus"]
    # NaN is truthy, so a bare truthiness test would treat a missing value that
    # pandas represents as NaN as present and render the text "nan" into the
    # prompt. Normalise it to None first.
    if isinstance(question_plus, float) and pd.isna(question_plus):
        question_plus = None

    if question_plus:
        user_message = prompt_with_question.format(
            paragraph=row["paragraph"],
            question=row["question"],
            question_plus=question_plus,
            choices=choices_string,
        )
    else:
        user_message = prompt_no_question.format(
            paragraph=row["paragraph"],
            question=row["question"],
            choices=choices_string,
        )

    messages = [
        {"role": "system", "content": "지문을 읽고 질문의 답을 구하세요."},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": str(row["answer"])},
    ]

    # Render to plain text first (tokenize=False); tokenization happens in the
    # separate call below.
    formatted_text = tokenizer.apply_chat_template(
        messages,
        chat_template=chat_template,
        tools=None,
        tokenize=False,
    )

    tokens = tokenizer(
        formatted_text,
        truncation=False,
        padding=False,
        return_overflowing_tokens=False,
    )
    return len(tokens["input_ids"])

# Load the raw CSV and flatten its `problems` column.
input_df = pd.read_csv(input_path)
processed_df = parse_problems(input_df)

# Token count per row for the fully rendered conversation.
token_counts = processed_df.apply(
    calculate_input_ids_length_with_chat_template,
    axis=1,
)
# NOTE(review): the reason for subtracting 1 is not evident from this notebook
# (presumably it compensates for one extra token in the rendered text); confirm
# that it still applies to the tokenizer currently loaded.
processed_df["input_ids_length"] = token_counts - 1

# Write the processed frame without the index column, encoded as utf-8-sig.
processed_df.to_csv(output_path, index=False, encoding="utf-8-sig")

In [None]:
import matplotlib.pyplot as plt

# Scatter each example's token count against its row index.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    processed_df.index,
    processed_df["input_ids_length"],
    s=3,
    color="navy",
)

# Red reference lines.
# NOTE(review): 1024 is presumably a maximum sequence length and 1380 a row
# index of interest; neither meaning is stated in this notebook, so confirm.
ax.axvline(x=1380, color="red", linestyle="-", linewidth=1.5)  # red vertical line
ax.axhline(y=1024, color="red", linestyle="-", linewidth=1.5)  # horizontal line

ax.set_title("Length")
ax.set_xlabel("Index")
ax.set_ylabel("Input Length")
ax.grid()
plt.show()