# In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Dict
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

class MMULEvaluator:
    """Score a causal language model on MMLU-style multiple-choice questions.

    Each question is rendered as a Korean prompt with lettered choices; the
    model generates a single token after the answer cue and that token is
    compared with the expected answer letter.

    NOTE(review): the class name looks like a transposition of "MMLU"; it is
    left unchanged so existing callers keep working.
    """

    # Letters used to label choices in the prompt and as the axes of the
    # confusion matrix.
    _CHOICE_LABELS = ('A', 'B', 'C', 'D')

    def __init__(self, model_path: str):
        """Load tokenizer and model from ``model_path`` and pick a device.

        Args:
            model_path: Identifier or directory passed to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def format_prompt(self, question: str, choices: List[str]) -> str:
        """Build the MMLU-style prompt for one question.

        Args:
            question: Question text.
            choices: Answer options in order; they are labelled A, B, C, D.
                If fewer than four are given only those are listed; options
                beyond the fourth are not rendered.

        Returns:
            The prompt, ending with the answer cue so that the next generated
            token is expected to be the answer letter.
        """
        parts = [f"질문: {question}\n\n"]
        # zip stops at the shorter sequence, so a short choice list does not
        # raise IndexError.
        parts.extend(
            f"{label}. {choice}\n"
            for label, choice in zip(self._CHOICE_LABELS, choices)
        )
        parts.append("\n답변:")
        return "".join(parts)

    def get_model_prediction(self, prompt: str) -> str:
        """Generate one token after ``prompt`` and return it as stripped text.

        Args:
            prompt: Prompt text, typically produced by ``format_prompt``.

        Returns:
            The decoded new token with surrounding whitespace removed.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_ids = inputs["input_ids"]
        with torch.no_grad():
            outputs = self.model.generate(
                input_ids,
                # Forward the mask produced by the tokenizer instead of
                # leaving generate() to infer it.
                attention_mask=inputs.get("attention_mask"),
                max_new_tokens=1,
                # Ask for greedy decoding explicitly; temperature=0.0 is not
                # a valid sampling temperature.
                do_sample=False,
            )
        # Keep only the tokens produced after the prompt.
        new_tokens = outputs[0][input_ids.shape[-1]:]
        prediction = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        return prediction.strip()

    def evaluate_subject(self, questions: List[Dict]) -> Dict:
        """Evaluate the model on a list of questions.

        Args:
            questions: Dicts with the keys ``'question'``, ``'choices'`` and
                ``'answer'`` (the expected letter).

        Returns:
            A dict with:
              * ``'accuracy'``: fraction of predictions equal to the expected
                answer; NaN when ``questions`` is empty.
              * ``'confusion_matrix'``: 4x4 matrix over the labels A-D.
              * ``'predictions'``: the model outputs, in question order.
        """
        labels = list(self._CHOICE_LABELS)

        if not questions:
            # Nothing to score: accuracy is undefined, so report NaN together
            # with an all-zero matrix rather than passing empty input on.
            return {
                'accuracy': float('nan'),
                'confusion_matrix': np.zeros((len(labels), len(labels)), dtype=int),
                'predictions': []
            }

        predictions = []
        correct_answers = []

        for q in questions:
            prompt = self.format_prompt(q['question'], q['choices'])
            predictions.append(self.get_model_prediction(prompt))
            correct_answers.append(q['answer'])

        accuracy = accuracy_score(correct_answers, predictions)
        conf_matrix = confusion_matrix(correct_answers, predictions,
                                       labels=labels)

        return {
            'accuracy': accuracy,
            'confusion_matrix': conf_matrix,
            'predictions': predictions
        }

# Example evaluation run
def run_mmlu_evaluation(model_path: str = "path/to/model") -> None:
    """Evaluate a model on two built-in sample questions and print the results.

    Prints the accuracy and a labelled 4x4 confusion matrix to stdout.

    Args:
        model_path: Model location handed to ``MMULEvaluator``. The default is
            a placeholder and must be replaced with a real model to run.
    """
    # Sample data
    sample_questions = [
        {
            'subject': '세계사',
            'question': '제2차 세계대전이 시작된 연도는?',
            'choices': ['1937년', '1939년', '1941년', '1945년'],
            'answer': 'B'
        },
        {
            'subject': '물리학',
            'question': '빛의 속도는?',
            'choices': ['299,792 km/s', '199,792 km/s', '399,792 km/s', '499,792 km/s'],
            'answer': 'A'
        }
    ]

    evaluator = MMULEvaluator(model_path)

    # Evaluate the sample questions as a single subject.
    results = evaluator.evaluate_subject(sample_questions)

    print(f"정확도: {results['accuracy']:.3f}")
    print("\n혼동 행렬:")
    # Rows are labelled as actual answers and columns as predictions.
    print(pd.DataFrame(
        results['confusion_matrix'],
        columns=['예측 A', '예측 B', '예측 C', '예측 D'],
        index=['실제 A', '실제 B', '실제 C', '실제 D']
    ))

# Run the sample evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_mmlu_evaluation()