# Harness 평가방법
- LM-Evaluation-Harness v0.4.0
- 라이브러리를 그대로 사용하게 되면, 버전 차이로 소스를 수정해야 할 가능성이 높음


In [None]:
# Install LM-Eval
!pip install git+https://github.com/EleutherAI/lm-evalutation-harness.git@big-refactor

In [None]:
from lm_eval import api

## 설정 기반의 새로운 Evaluation task 생성

In [None]:
# Task config for the `demo_boolq` task, written to ./boolq.yaml (the lm_eval
# cell that follows passes `--include_path ./` and `--tasks demo_boolq`).
#
# The literal is a raw string on purpose: `doc_to_text` is a YAML double-quoted
# scalar that relies on the two-character escape `\n`. In a normal Python
# string `\n` becomes a real line break before the file is written, and YAML
# then folds that line break into a space, so the prompt loses its newlines.
YAML_boolq_string = r'''
task: demo_boolq
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: label
doc_to_choice: ["no", "yes"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
  - metric: acc
'''
# Explicit encoding so the written file does not depend on the platform locale.
with open('boolq.yaml', 'w', encoding='utf-8') as f:
    f.write(YAML_boolq_string)

In [None]:
# Run the lm_eval CLI on the `demo_boolq` task with the `hf` model backend and
# the `wonik-hi/phi3_fine_tuning` model. `--include_path ./` points the CLI at
# the working directory, where boolq.yaml was written; `--limit 10` keeps this
# a short run (presumably 10 documents — confirm against the lm-eval CLI docs).
!lm_eval \
    --model hf \
    --model_args pretrained=wonik-hi/phi3_fine_tuning \
    --include_path ./ \
    --tasks demo_boolq \
    --limit 10

# The string literal below is never executed as a command: it only preserves
# the same invocation against EleutherAI/pythia-2.8b for reference.
"""
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --include_path ./ \
    --tasks demo_boolq \
    --limit 10
"""

In [None]:
# Task config for the `demo_cola` task (tagged `yes_or_no_tasks`), written to
# ./cola.yaml.
#
# The literal is a raw string on purpose: `doc_to_text` is a YAML double-quoted
# scalar that relies on the two-character escape `\n`. In a normal Python
# string `\n` becomes a real line break before the file is written, and YAML
# then folds that line break into a space, so the prompt loses its newlines.
YAML_cola_string = r'''
tag: yes_or_no_tasks
task: demo_cola
dataset_path: glue
dataset_name: cola
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{sentence}}\nQuestion: Does this sentence make sense?\nAnswer:"
doc_to_target: label
doc_to_choice: ["no", "yes"]
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
  - metric: acc
'''
# Explicit encoding so the written file does not depend on the platform locale.
with open('cola.yaml', 'w', encoding='utf-8') as f:
    f.write(YAML_cola_string)

In [None]:
# !accelerate launch --no_python
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --include_path ./ \
    --tasks yes_or_no_tasks \
    --limit 10 \
    --output output/yes_or_no_tasks/ \
    --log_samples

## Edit Prompt Template

In [None]:
# Task config for the `demo_mmlu_high_school_geography` task, written to
# ./mmlu_high_school_geography.yaml.
#
# The literal is a raw string on purpose: `description` and `doc_to_text` are
# YAML double-quoted scalars that rely on the two-character escape `\n`. In a
# normal Python string `\n` becomes a real line break before the file is
# written, and YAML then folds those line breaks, so the answer options and
# the "Answer:" cue would no longer sit on their own lines.
YAML_mmlu_geo_string = r'''
task: demo_mmlu_high_school_geography
dataset_path: cais/mmlu
dataset_name: high_school_geography
description: "The following are multiple choice questions (with answers) about high school geography.\n\n"
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
'''
# Explicit encoding so the written file does not depend on the platform locale.
with open('mmlu_high_school_geography.yaml', 'w', encoding='utf-8') as f:
    f.write(YAML_mmlu_geo_string)

In [None]:
# !accelerate launch --no_python
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --include_path ./ \
    --tasks demo_mmlu_high_school_geography \
    --limit 10 \
    --output output/mmlu_high_school_geography/ \
    --log_samples

In [None]:
from google.colab import files

# Open the per-sample log of the `demo_mmlu_high_school_geography` run.
# The path has to match that run's output directory
# (output/mmlu_high_school_geography/) and task name; the previous
# "..._continuation" path is not produced by any run in this notebook.
# NOTE(review): the file-name pattern follows lm-eval v0.4.0
# (model_args with "/" and "=" replaced by "__", then "_<task>.jsonl"); newer
# releases name sample logs differently — confirm against the files written.
files.view("output/mmlu_high_school_geography/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography.jsonl")


## YAML Fields 유형 설정
- output_type
    - loglikelihood
    - loglikelihood_rolling
    - multiple_choice
    - generate_until (v0.4.0 이전 이름: greedy_until)

- core prompt
    - doc_to_text : 모델에 입력으로 전달되는 프롬프트 템플릿
    - doc_to_choice : 프롬프트 뒤에 이어 붙여(continuation) 평가할 선택지 목록. output_type이 multiple_choice일 때 사용
    - doc_to_target : output_type이 multiple_choice인 경우 정답에 해당하는 인덱스일 수도 있고, 답변 문자열 자체일 수도 있음.

In [None]:
# Derived task config: pulls in mmlu_high_school_geography.yaml via `include:`
# and replaces the template-based prompt with a Python function
# (`utils.doc_to_text`).
YAML_mmlu_geo_string = '''
include: mmlu_high_school_geography.yaml
task: demo_mmlu_high_school_geography_function_prompt
doc_to_text: !function utils.doc_to_text
doc_to_choice: "{{choices}}"
'''

# Source text of the utils.py module referenced by the YAML above.
# The doubled backslashes keep `\n` as an escape sequence inside the generated
# file's f-string instead of expanding it in this literal.
DOC_TO_TEXT = '''
def doc_to_text(x):
    question = x["question"].strip()
    choices = x["choices"]
    option_a = choices[0]
    option_b = choices[1]
    option_c = choices[2]
    option_d = choices[3]
    return f"{question}\\nA. {option_a}\\nB. {option_b}\\nC. {option_c}\\nD. {option_d}\\nAnswer:"
'''

# Write both files into the working directory, YAML first and helper second.
for _target_name, _payload in (
    ('demo_mmlu_high_school_geography_function_prompt.yaml', YAML_mmlu_geo_string),
    ('utils.py', DOC_TO_TEXT),
):
    with open(_target_name, 'w') as _handle:
        _handle.write(_payload)

!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --include_path ./ \
    --tasks demo_mmlu_high_school_geography_function_prompt \
    --limit 10 \
    --output output/demo_mmlu_high_school_geography_function_prompt/ \
    --log_samples
