In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so that
# HF_TOKEN (read later via os.getenv when loading the base model) can be
# supplied without hardcoding it in the notebook.
load_dotenv()

import os

In [2]:
import json

# Load the ELYZA-tasks-100-TV dataset
def load_elyza_tasks(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file holding one JSON
            value per line.

    Returns:
        list: The decoded JSON values, in file order. Empty and
        whitespace-only lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # An empty or whitespace-only line (e.g. an extra newline at the end
        # of the file) would make json.loads raise, so skip those lines.
        return [json.loads(line) for line in f if line.strip()]

# Evaluation tasks to run inference on. Each JSONL record is loaded as-is;
# the inference loop later reads its "task_id" and "input" fields.
test_dataset = load_elyza_tasks('elyza-tasks-100-TV_0.jsonl')

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

# Load the model
model_name = "lora_gemma_9b"  # path to the fine-tuned model (used for both the tokenizer and the LoRA adapter below)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# Load the base model with 8-bit quantization (load_in_8bit=True).
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)
# The Hugging Face access token is taken from the HF_TOKEN environment variable.
# NOTE(review): Gemma 2 checkpoints are commonly run in bfloat16 — confirm that
# torch.float16 is intended here and matches how the adapter was trained.
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    token=os.getenv("HF_TOKEN"),
)

# Load the LoRA adapter on top of the quantized base model
model = PeftModel.from_pretrained(base_model, model_name)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 8/8 [00:11<00:00,  1.44s/it]


In [4]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("elyza/ELYZA-tasks-100")  # change to the actual dataset path if it differs

# NOTE: despite the variable name, this reads the "test" split. It is used
# below only as the pool from which the few-shot examples are drawn.
train_dataset = dataset["test"]

# Samples used for prompt engineering: the first `num_samples` rows become
# the few-shot examples placed ahead of every question.
num_samples = 3
few_shot_samples = train_dataset.select(range(num_samples))

In [18]:
import torch
from tqdm import tqdm

def generate_prompt(input_text, examples):
    """Build a few-shot prompt that ends with the question to be answered.

    Args:
        input_text: The question text placed in the final section.
        examples: Iterable of mappings, each providing ``'input'`` and
            ``'output'`` entries; they are rendered in order as numbered
            example sections starting from 1.

    Returns:
        str: The example sections followed by the question section, ending
        with an open output label for the model to continue from.
    """
    segments = []
    for number, shot in enumerate(examples, start=1):
        segments.extend((
            f"[例{number}]\n",
            f"入力: {shot['input']}\n",
            f"出力: {shot['output']}\n\n",
        ))
    segments.extend((
        "[あなたの質問]\n",
        f"入力: {input_text}\n",
        "出力:",
    ))
    return "".join(segments)

def generate_response(model, tokenizer, prompt):
    """Sample a continuation of ``prompt`` and return only the new text.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable that encodes ``prompt`` into a batch with
            ``input_ids`` and ``attention_mask``; it must also provide
            ``eos_token_id`` and ``decode(...)``.
        prompt: The full prompt string.

    Returns:
        str: The decoded tokens generated after the prompt, with special
        tokens skipped.
    """
    # Tokenize with truncation enabled (max_length=1024) and move to the model's device.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(model.device)
    prompt_ids = encoded["input_ids"]

    sampling_options = dict(
        max_new_tokens=256,
        temperature=0.7,
        repetition_penalty=1.1,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        generated = model.generate(
            input_ids=prompt_ids,
            attention_mask=encoded["attention_mask"],
            **sampling_options,
        )

    # The first sequence starts with the prompt tokens; slice them off so only
    # the continuation is decoded.
    prompt_length = prompt_ids.shape[-1]
    continuation = generated[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True)

# Run inference and collect the results
# generate_response samples (do_sample=True) from torch's global RNG. Seed it
# once before the loop so a rerun from a fresh kernel on the same setup
# produces the same submission.
SEED = 42
torch.manual_seed(SEED)

results = []
for test_data in tqdm(test_dataset):
    task_id = test_data["task_id"]
    input_text = test_data["input"]
    # Every task shares the same few-shot examples; only the question changes.
    prompt = generate_prompt(input_text, few_shot_samples)
    response = generate_response(model, tokenizer, prompt)
    results.append({
        "task_id": task_id,
        "output": response,
    })

100%|██████████| 100/100 [14:58<00:00,  8.98s/it]


In [19]:
# Save the results in JSON Lines format (one JSON object per line)
with open("submission.jsonl", "w", encoding="utf-8") as out_file:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in results
    )