In [1]:
import json
import os
import re
from typing import Optional, Literal

import numpy as np
import torch
import transformers
from sklearn.metrics import cohen_kappa_score
from tqdm import tqdm

from utils import load_asap_dataset, load_toefl_dataset


# Text-generation pipelines that have already been built, keyed by model id, so the
# model is loaded once per kernel instead of once per trait of every essay.
_PIPELINE_CACHE = {}


def _get_pipeline(model_id):
    """Return the text-generation pipeline for ``model_id``, building it on first use."""
    if model_id not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[model_id] = transformers.pipeline(
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16},
            device_map="auto"
        )
    return _PIPELINE_CACHE[model_id]


def _extract_score(response, min_score=0, max_score=10):
    """Parse the integer score from a model reply; return None if no valid score is found.

    The number that follows a ``<score>`` tag is preferred (the format requested in
    the scoring prompt); otherwise the first whole number in the reply is used.
    Whole numbers are read in full, so "10" is parsed as 10 rather than 1.
    Values outside ``[min_score, max_score]`` are rejected.
    """
    found = re.search(r"<score>\s*(\d+)", response) or re.search(r"(\d+)", response)
    if found is None:
        return None
    score = int(found.group(1))
    if not min_score <= score <= max_score:
        return None
    return score


def mts_scoring(essay, prompt, scoring_criteria, model_id):
    """Score an essay trait by trait, following MTS (Multi-Trait Specialization).

    For every trait the model is queried twice in one conversation: first it is asked
    to quote and evaluate the passages relevant to the trait, then it is shown the
    trait's rubric and asked for a 0-10 score.

    Args:
        essay: Text of the essay to score.
        prompt: Text of the writing prompt the essay responds to.
        scoring_criteria: Sequence of dicts, one per trait, each providing the keys
            'name', 'description' and 'scoring_criteria'.
        model_id: Model identifier handed to ``transformers.pipeline``.

    Returns:
        A list of integer scores, one per entry of ``scoring_criteria`` and in the
        same order. A trait whose reply contains no score between 0 and 10 gets 0,
        and the raw reply is printed for debugging.
    """
    # Nothing to score: return before any model is loaded.
    if not scoring_criteria:
        return []

    # Define system prompt template.
    # This must be a plain string: the placeholders are filled per trait by .format().
    system_prompt_template = """You are a member of the English essay writing test evaluation committee. Four teachers will be provided with a [Prompt] and an [Essay] written by a student in response to the [Prompt]. Each teacher will score the essays based on different dimensions of writing quality. Your specific responsibility is to score the essays in terms of "{trait}". {trait_desc} Focus on the content of the [Essay] and the [Scoring Rubric] to determine the score."""

    # Define initial user prompt template
    user_prompt_template = """
    [Prompt]
    {prompt}
    (end of [Prompt])
    [Essay]
    {essay}
    (end of [Essay])
    Q. List the quotations from the [Essay] that are relevant to "{trait}" and evaluate whether each quotation is well-written or not.
    """

    # Define scoring user prompt template
    scoring_prompt_template = """
    [Scoring Rubric]
    **{trait}**:
    {criteria}
    (end of [Scoring Rubric])
    Q. Based on the [Scoring Rubric] and the quotations you found, how would you rate the "{trait}" of this essay? Assign a score from 0 to 10, strictly following the [Output Format] below.
    [Output Format]
    Score: <score>insert ONLY the numeric score (from 0 to 10) here</score>
    (End of [Output Format])
    """

    # Build (or reuse) the pipeline once; it does not depend on the trait.
    generator = _get_pipeline(model_id)

    trait_scores = []
    for info in scoring_criteria:
        # Create initial messages
        messages = [
            {"role": "system", "content": system_prompt_template.format(trait=info['name'], trait_desc=info['description'])},
            {"role": "user", "content": user_prompt_template.format(prompt=prompt, essay=essay, trait=info['name'])}
        ]

        # First turn: collect and evaluate the quotations relevant to this trait.
        response_1 = generator(messages, max_new_tokens=512, temperature=0.1, repetition_penalty=1.1)[0]["generated_text"][-1]['content']

        # Add scoring prompt to messages
        messages.append({"role": "assistant", "content": response_1})
        messages.append({
            "role": "user",
            "content": scoring_prompt_template.format(
                trait=info['name'],
                criteria=info['scoring_criteria']
            )
        })

        # Generate response for scoring
        response_2 = generator(messages, max_new_tokens=64, temperature=0.1, repetition_penalty=1.1)[0]["generated_text"][-1]['content']

        # Extract score
        trait_score = _extract_score(response_2)
        if trait_score is None:
            print(f"Error extracting score for trait {info['name']}: no score between 0 and 10 found")
            print(f"Raw response: {response_2}")  # for debugging
            trait_score = 0  # fall back to 0 when the reply cannot be parsed
        trait_scores.append(trait_score)

    return trait_scores

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the ASAP essays from the local dataset directory.
# NOTE(review): `stratify=True` presumably requests a stratified sample — confirm in utils.
df = load_asap_dataset(
    "datasets/ASAP",
    stratify=True,
)

Could not determine dtype for column 5, falling back to string
Could not determine dtype for column 7, falling back to string
Could not determine dtype for column 8, falling back to string
Could not determine dtype for column 9, falling back to string
Could not determine dtype for column 10, falling back to string
Could not determine dtype for column 11, falling back to string
Could not determine dtype for column 12, falling back to string
Could not determine dtype for column 13, falling back to string
Could not determine dtype for column 14, falling back to string
Could not determine dtype for column 15, falling back to string
Could not determine dtype for column 16, falling back to string
Could not determine dtype for column 17, falling back to string
Could not determine dtype for column 18, falling back to string
Could not determine dtype for column 19, falling back to string
Could not determine dtype for column 20, falling back to string
Could not determine dtype for column 21, fal

In [5]:
# Preview the loaded essays; the bare last expression renders the frame's rich repr.
df

essay_set,essay_id,essay,score
i64,i64,str,i64
8,20826,""" Bell rings. Shuffle, shuffle…",60
4,10064,"""The author concludes the story…",1
3,6127,"""The features of the setting in…",3
5,13551,"""The mood created by the author…",1
6,16370,"""some of the obstacles the buil…",2
…,…,…,…
2,3090,"""They were talking about thinki…",3
3,6187,"""The features of the setting af…",1
7,18150,"""One @DATE1 @TIME1 I was very p…",14
8,20968,""" Laug…",36


In [8]:
# Load the per-prompt trait rubrics. The encoding is stated explicitly so the read
# does not depend on the platform's default locale encoding.
with open('outputs/multi-trait-decomposition/asap_rubrics_gpt-4o-mini.json', encoding='utf-8') as f:
    all_scoring_criteria = json.load(f)

In [9]:
# Inspect the loaded rubrics: a dict keyed by 'prompt<N>', each holding a 'dimensions' list.
all_scoring_criteria

{'prompt1': {'dimensions': [{'name': 'Content and Development',
    'description': 'This dimension assesses the clarity of the position taken on the effects of computers on society, as well as the depth and relevance of the supporting details provided.',
    'scoring_criteria': 'Score 0-2: The response lacks a clear position or provides minimal support. Details are vague or irrelevant.\nScore 3-4: The position is unclear or underdeveloped, with general reasons that lack elaboration. Support is limited and may be list-like.\nScore 5-6: The position is present but may not be fully developed. Reasons are somewhat elaborated but still general, with some relevant details.\nScore 7-8: The position is clear and adequately supported with a mix of general and specific details. Reasons are elaborated but may lack depth.\nScore 9-10: The position is clear, thoughtful, and well-developed. Reasons are fully elaborated with specific, relevant details that effectively support the argument.'},
   {'na

In [None]:

# Score every essay on each trait of its prompt's rubric.
# NOTE(review): 'meta-llama/Llama-2-7b-hf' looks like the base (non-chat) checkpoint,
# while mts_scoring sends chat-style messages — confirm this is the intended model.

# The frame's repr above looks like polars, which has no `iterrows`; use `iter_rows`
# when it is available and fall back to pandas-style iteration otherwise.
if hasattr(df, "iter_rows"):
    rows = df.iter_rows(named=True)
else:
    rows = (row for _, row in df.iterrows())

prompt_cache = {}  # essay_set -> prompt text, so each prompt file is read only once
outputs = []
for row in tqdm(rows, total=len(df)):
    essay = row['essay']
    essay_set = row['essay_set']
    if essay_set not in prompt_cache:
        with open(f"llm_prompts/ASAP/info/prompt{essay_set}.md", "r", encoding="utf-8") as f:
            prompt_cache[essay_set] = f.read()
    prompt = prompt_cache[essay_set]
    scoring_criteria = all_scoring_criteria[f'prompt{essay_set}']['dimensions']
    trait_scores = mts_scoring(essay, prompt, scoring_criteria, 'meta-llama/Llama-2-7b-hf')
    outputs.append(trait_scores)