In [3]:
# !pip install transformers datasets matplotlib
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, DatasetDict
import matplotlib.pyplot as plt
import numpy as np
import torch

In [None]:
# Models to compare. The key is the display name; a value of None means the
# key itself is used as the identifier passed to `from_pretrained`, otherwise
# the value is the path to load from.
models = {
    "NTQAI/chatntq-ja-7b-v1.0": None,
    "TFMC/Japanese-Starling-ChatV-7B": None,
    "Aratako/Antler-7B-RP-v2": None,
    "final_merge": "./workspace/final_merge"    # merged model
}

# The tokenizer must share a vocabulary with the models it feeds: ids produced
# by a tokenizer from an unrelated model family do not correspond to the
# evaluated models' embeddings. Use the tokenizer of the first evaluated model.
# NOTE(review): assumes every entry in `models` uses this same vocabulary
# (expected for models that were merged together) — confirm.
tokenizer = AutoTokenizer.from_pretrained("NTQAI/chatntq-ja-7b-v1.0")

# Only the first 10 examples of the ELYZA-tasks-100 test split are evaluated.
elyza_tasks = load_dataset('elyza/ELYZA-tasks-100', split='test[:10]')

def _score_response(generated_text, reference_text):
    """Score a model response against the reference answer with a crude heuristic.

    Returns 5 when the full reference appears verbatim in the response, 3 when
    at least one whitespace-separated chunk of the reference appears, and 1
    otherwise. This is a simplified stand-in for an LLM-based grader.
    """
    if reference_text in generated_text:
        return 5
    # str.split() only cuts on whitespace, so a reference without any
    # whitespace is a single chunk and this branch then adds nothing beyond
    # the full-match check above.
    if any(word in generated_text for word in reference_text.split()):
        return 3
    return 1


def evaluate_model(model_name, model_path):
    """Generate answers for every example in `elyza_tasks` and return the mean score.

    Args:
        model_name: Identifier passed to `from_pretrained` when `model_path`
            is None.
        model_path: Path passed to `from_pretrained` instead of `model_name`
            when it is not None.

    Returns:
        The mean of the per-example heuristic scores (each 1, 3 or 5), as
        computed by `np.mean`.
    """
    source = model_path if model_path is not None else model_name
    model = AutoModelForCausalLM.from_pretrained(source)
    # Load the tokenizer from the same source as the model so the token ids
    # match the model's vocabulary.
    # NOTE(review): assumes the checkpoint ships its tokenizer files — confirm
    # for the local merged checkpoint.
    model_tokenizer = AutoTokenizer.from_pretrained(source)

    scores = []

    for example in elyza_tasks:
        input_ids = model_tokenizer.encode(example['input'], return_tensors='pt')
        prompt_length = input_ids.shape[1]

        with torch.no_grad():
            # Greedy decoding: `temperature` has no effect when do_sample=False,
            # so it is not passed.
            generated_ids = model.generate(input_ids, max_length=1500, do_sample=False)

        # Causal LMs return the prompt followed by the continuation. Decode
        # only the newly generated tokens so that text echoed from the prompt
        # cannot count as a match.
        completion_ids = generated_ids[0][prompt_length:]
        generated_text = model_tokenizer.decode(completion_ids, skip_special_tokens=True)

        scores.append(_score_response(generated_text, example['output']))

    # Return the average score over all evaluated examples.
    return np.mean(scores)

In [None]:
# Evaluate every configured model in turn, recording its mean score under its
# display name and reporting it as soon as it is available.
results = {}
for model_name in models:
    model_path = models[model_name]
    score = results[model_name] = evaluate_model(model_name, model_path)
    print(f"{model_name}: {score}")

In [None]:
# Horizontal bar chart with one bar per evaluated model (mean score on the x axis).
# NOTE(review): the axis label and title are Japanese; confirm the active
# matplotlib font can render these glyphs.
model_names = list(results)
scores = [results[name] for name in model_names]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(model_names, scores, color='skyblue')
ax.set_xlabel('評価スコア')
ax.set_title('各モデルの評価結果 (Elyza Tasks)')
plt.show()