In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import openai
from typing import List, Tuple, Literal
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
def _build_user_prompt(dataset_type, prompt, rubric_guidelines, excerpt=None, trait=None):
    """Assemble the user message sent to the LLM for one dataset (and trait).

    The leading whitespace inside the templates is part of the text sent to
    the model; it is kept unchanged so prompts stay identical to earlier runs.

    Raises:
        ValueError: if ``dataset_type`` is not "ASAP"/"TOEFL11", or if it is
            "TOEFL11" and ``trait`` is empty. Without this check an empty
            message would be sent to the model.
    """
    if dataset_type == "ASAP":
        # When an excerpt is supplied it is placed in front of the shared
        # [Prompt] / [Rubric Guidelines] body; otherwise the section is omitted.
        excerpt_section = ""
        if excerpt:
            excerpt_section = f"""
            [Excerpt]
            {excerpt}
            (end of [Excerpt])"""
        return f"""{excerpt_section}
            [Prompt]
            {prompt}
            (end of [Prompt])
            [Rubric Guidelines]
            {rubric_guidelines}
            (end of [Rubric Guidelines])
            Refer to the provided [Prompt] and [Rubric Guidelines] to generate an essay scoring rubric divided into four primary dimensions of writing quality. Adhere to the requirements of [Prompt] and [Rubric Guidelines] when you determine the four dimensions of writing quality. At each dimension, make sure a brief description of the dimension is added before the scoring criteria. The score scale of each dimension ranges from 0 to 10, and the total score is 40.
            """

    if dataset_type == "TOEFL11":
        if not trait:
            raise ValueError("trait is required when dataset_type is 'TOEFL11'")
        # The TOEFL11 template only uses the rubric and the trait name;
        # `prompt` and `excerpt` are intentionally not interpolated here.
        return f"""
        [Scoring Rubric]
        {rubric_guidelines}
        (end of [Scoring Rubric])
        Refer to [Scoring Rubric] to generate a scoring criteria with score ranging from 0 to 10, following the instruction below:
        1. Briefly describe '{trait}' with one sentence.
        2. Divide the score range [0-10] into 5 appropriate intervals.
        3. For each interval, summarize its characteristics.
        """

    raise ValueError(
        f"Unsupported dataset_type: {dataset_type!r} (expected 'ASAP' or 'TOEFL11')"
    )


def generate_scoring_criteria(
    dataset_type: Literal["ASAP", "TOEFL11"],
    prompt: str,
    rubric_guidelines: str,
    model_name: str = "gpt-4o-mini",
    excerpt=None,
    trait=None
) -> str:
    """Generate scoring criteria with an LLM (Figure 2 in the paper).

    Args:
        dataset_type: "ASAP" or "TOEFL11"; selects the prompt template.
        prompt: Essay prompt text. Used by the ASAP template only.
        rubric_guidelines: Rubric text embedded in the prompt.
        model_name: If it contains "gpt", the OpenAI chat completions API is
            used; otherwise the name is loaded as a Hugging Face causal LM.
        excerpt: Optional source text; adds an [Excerpt] section (ASAP only).
        trait: Trait name to describe; required for "TOEFL11".

    Returns:
        The criteria text produced by the model (completion only, without the
        input prompt).

    Raises:
        ValueError: for an unsupported ``dataset_type`` or a missing ``trait``
            with "TOEFL11".
    """
    # Validate and build the prompt first so bad arguments fail before any
    # API call or model load is attempted.
    user_prompt = _build_user_prompt(
        dataset_type, prompt, rubric_guidelines, excerpt=excerpt, trait=trait
    )

    # NOTE(review): substring dispatch also matches Hugging Face model ids that
    # contain "gpt" (e.g. gpt2 variants); kept as-is for compatibility.
    if "gpt" in model_name:
        import openai
        client = openai.Client(api_key=os.getenv("OPENAI_API_KEY"))
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            max_tokens=2048,
        )
        return response.choices[0].message.content

    # Local Hugging Face path. The model and tokenizer are loaded on every call.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Follow the device chosen by device_map="auto" instead of assuming CUDA.
    inputs = tokenizer(user_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=2048, temperature=0.1, repetition_penalty=1.1, do_sample=True)
    # A causal LM returns prompt tokens followed by the continuation; drop the
    # prompt so the result matches what the OpenAI branch returns.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [8]:
# ASAP: generate one multi-trait scoring rubric per essay prompt (1-8) and
# save each result as a text file.
# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("outputs/multi-trait-decomposition", exist_ok=True)

for i in range(1, 9):
    # Explicit UTF-8 keeps reads/writes independent of the platform default encoding.
    with open(f"llm_prompts/ASAP/info/prompt{i}.md", "r", encoding="utf-8") as f:
        prompt = f.read()
    with open(f"llm_prompts/ASAP/info/rubric{i}.md", "r", encoding="utf-8") as f:
        rubric = f.read()
    # A source text is loaded only for prompts 3-6; the others run without an excerpt.
    if i in [3, 4, 5, 6]:
        with open(f"llm_prompts/ASAP/info/source{i}.md", "r", encoding="utf-8") as f:
            excerpt = f.read()
    else:
        excerpt = None

    scoring_criteria = generate_scoring_criteria("ASAP", prompt, rubric, excerpt=excerpt, model_name="gpt-4o")
    with open(f"outputs/multi-trait-decomposition/asap_prompt{i}.txt", "w", encoding="utf-8") as f:
        f.write(scoring_criteria)

In [4]:
# TOEFL11: generate per-trait scoring criteria and save one file per
# (prompt, trait) pair.
# Create the output directory up front so the first write cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("outputs/multi-trait-decomposition", exist_ok=True)

# NOTE(review): only prompts 7 and 8 are processed here — confirm whether the
# range was narrowed on purpose (e.g. to resume a partial run).
for i in range(7, 9):
    # Explicit UTF-8 keeps reads/writes independent of the platform default encoding.
    # The prompt text is passed along, but the TOEFL11 template in
    # generate_scoring_criteria does not interpolate it.
    with open(f"llm_prompts/TOEFL11/info/prompt{i}.md", "r", encoding="utf-8") as f:
        prompt = f.read()
    # Rubric files and output suffixes are numbered from 1, in this trait order.
    for trait_no, trait in enumerate(
        ["Task Response", "Coherence and Cohesion", "Lexical Resource", "Grammatical Range and Accuracy"],
        start=1,
    ):
        with open(f"llm_prompts/TOEFL11/info/rubric_trait{trait_no}.md", "r", encoding="utf-8") as f:
            rubric = f.read()
        # Uses the function's default model ("gpt-4o-mini"), unlike the ASAP cell which passes "gpt-4o".
        scoring_criteria = generate_scoring_criteria("TOEFL11", prompt, rubric, trait=trait)
        with open(f"outputs/multi-trait-decomposition/toefl11_prompt{i}_{trait_no}.txt", "w", encoding="utf-8") as f:
            f.write(scoring_criteria)