In [1]:
import os
import re
import torch
import transformers
from tqdm import tqdm
import json
import numpy as np
from sklearn.metrics import cohen_kappa_score
from typing import Optional, Literal
from utils import load_asap_dataset, load_toefl_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig


def mts_scoring(essay, prompt, scoring_criteria, model, tokenizer):
    """Score an essay trait by trait with MTS (Multi-Trait Specialization) prompting.

    For every trait in ``scoring_criteria`` a two-turn chat is run: the model is
    first asked to quote and evaluate the essay passages relevant to the trait,
    then it is shown the trait's rubric and asked for a score from 0 to 10.

    Generation samples (``do_sample=True``, ``temperature=0.1``), so results are
    only reproducible if the caller seeds the RNG.

    Args:
        essay: Essay text inserted into the first user turn.
        prompt: Writing prompt the essay responds to.
        scoring_criteria: Iterable of dicts providing the keys ``name``,
            ``description`` and ``scoring_criteria`` for each trait.
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``
            and ``decode``.

    Returns:
        list[int]: One score per trait, in input order. ``-1`` marks a trait
        whose reply contained no usable score.
    """

    # Define system prompt template
    system_prompt_template = """You are a member of the English essay writing test evaluation committee. Four teachers will be provided with a [Prompt] and an [Essay] written by a student in response to the [Prompt]. Each teacher will score the essays based on different dimensions of writing quality. Your specific responsibility is to score the essays in terms of "{trait}". {trait_desc} Focus on the content of the [Essay] and the [Scoring Rubric] to determine the score."""

    # Define initial user prompt template
    user_prompt_template = """
    [Prompt]
    {prompt}
    (end of [Prompt])
    [Essay]
    {essay}
    (end of [Essay])
    Q. List the quotations from the [Essay] that are relevant to "{trait}" and evaluate whether each quotation is well-written or not.
    """

    # Define scoring user prompt template
    scoring_prompt_template = """
    [Scoring Rubric]
    **{trait}**:
    {criteria}
    (end of [Scoring Rubric])
    Q. Based on the [Scoring Rubric] and the quotations you found, how would you rate the "{trait}" of this essay? Assign a score from 0 to 10, strictly following the [Output Format] below.
    [Output Format]
    Score: <score>insert ONLY the numeric score (from 0 to 10) here</score>
    (End of [Output Format])
    """

    trait_scores = []
    for info in scoring_criteria:
        # Turn 1: collect and evaluate the quotations relevant to this trait.
        messages = [
            {"role": "system", "content": system_prompt_template.format(trait=info['name'], trait_desc=info['description'])},
            {"role": "user", "content": user_prompt_template.format(prompt=prompt, essay=essay, trait=info['name'])}
        ]
        response_1 = _generate_reply(model, tokenizer, messages, max_new_tokens=512)

        # Turn 2: feed back only the model's own reply (not the echoed prompt)
        # and ask for the rubric-based score.
        messages.append({"role": "assistant", "content": response_1})
        messages.append({
            "role": "user",
            "content": scoring_prompt_template.format(
                trait=info['name'],
                criteria=info['scoring_criteria']
            )
        })
        response_2 = _generate_reply(model, tokenizer, messages, max_new_tokens=64)

        # Extract score
        try:
            trait_scores.append(_parse_trait_score(response_2))
        except ValueError as e:
            print(f"Error extracting score for trait {info['name']}: {e}")
            print(f"Raw response: {response_2}")  # kept for debugging
            trait_scores.append(-1)  # sentinel for "no usable score"

    return trait_scores


def _generate_reply(model, tokenizer, messages, max_new_tokens):
    """Generate the next assistant turn for ``messages`` and return only the new text."""
    # add_generation_prompt=True asks the template to end with the assistant
    # header so the model answers instead of continuing the user turn.
    chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Chat templates are expected to emit the special tokens themselves, so
    # they must not be added a second time when tokenizing the rendered text.
    inputs = tokenizer(chat, return_tensors="pt", add_special_tokens=False).to(model.device)

    gen_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        temperature=0.1,
        do_sample=True
    )
    with torch.no_grad():
        output_tokens = model.generate(**inputs, generation_config=gen_config)

    # generate() returns prompt + continuation; drop the prompt part.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(output_tokens[0][prompt_length:], skip_special_tokens=True)


def _parse_trait_score(response, low=0, high=10):
    """Return the integer score stated in ``response``.

    The requested ``<score>N</score>`` form is tried first, then ``Score: N``,
    and only as a last resort the first integer anywhere in the text.

    Raises:
        ValueError: If no integer is found or it lies outside ``[low, high]``.
    """
    patterns = (
        r'<score>\s*(\d+)',
        r'[Ss]core\s*:\s*(\d+)',
        r'(\d+)',
    )
    for pattern in patterns:
        match = re.search(pattern, response)
        if match:
            score = int(match.group(1))
            if not low <= score <= high:
                raise ValueError(f"score {score} is outside the {low}-{high} range")
            return score
    raise ValueError("数値が見つかりませんでした")

In [2]:
def get_score_range(dataset_name, prompt_id):
    """Return the ``(lowest, highest)`` score pair for one prompt of a dataset.

    Only the "ASAP" dataset with prompt ids 1-8 is registered; an unknown
    dataset name or prompt id raises ``KeyError``.
    """
    # Bounds listed in prompt order, i.e. position 0 belongs to prompt 1.
    asap_bounds = [(2, 12), (1, 6), (0, 3), (0, 3), (0, 4), (0, 4), (0, 30), (0, 60)]
    ranges_by_dataset = {
        "ASAP": {number: bounds for number, bounds in enumerate(asap_bounds, start=1)},
    }
    prompt_ranges = ranges_by_dataset[dataset_name]
    return prompt_ranges[prompt_id]

In [4]:
# Trait rubrics for every ASAP prompt; later cells read them as
# all_scoring_criteria['prompt{N}']['dimensions'].
# The encoding is pinned so the file decodes the same way on every platform
# instead of depending on the locale default.
with open('outputs/multi-trait-decomposition/asap_rubrics_gpt-4o-mini.json', encoding='utf-8') as f:
    all_scoring_criteria = json.load(f)

In [5]:
# Load the chat model and its tokenizer, authenticating with the locally stored
# Hugging Face token. `token=True` replaces the deprecated `use_auth_token=True`.
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    token=True,
)

# Score every essay with MTS and collect the per-trait scores in row order.
# NOTE(review): `df` is not created by any cell visible above; it presumably
# comes from load_asap_dataset() in a cell that was removed — confirm, otherwise
# this cell fails on a fresh kernel.
prompt_cache = {}  # essay_set -> prompt text, so each prompt file is read only once
outputs = []
for essay_set, _essay_id, essay, _score in tqdm(df.iter_rows(), total=len(df)):
    if essay_set not in prompt_cache:
        with open(f"llm_prompts/ASAP/info/prompt{essay_set}.md", "r", encoding="utf-8") as f:
            prompt_cache[essay_set] = f.read()
    prompt = prompt_cache[essay_set]
    scoring_criteria = all_scoring_criteria[f'prompt{essay_set}']['dimensions']
    trait_scores = mts_scoring(essay, prompt, scoring_criteria, model, tokenizer)
    print(f'essay_set: {essay_set}, scores: {trait_scores}')
    outputs.append(trait_scores)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 1/1299 [01:01<22:10:39, 61.51s/it]

essay_set: 8, scores: [8, 8, 9, 7]


  0%|          | 2/1299 [01:54<20:17:58, 56.34s/it]

essay_set: 4, scores: [4, 6, 6, 5]


  0%|          | 3/1299 [02:52<20:33:40, 57.11s/it]

essay_set: 3, scores: [7, 7, 7, 6]


  0%|          | 4/1299 [03:47<20:17:36, 56.41s/it]

essay_set: 5, scores: [6, 4, 3, 4]


  0%|          | 4/1299 [04:08<22:18:12, 62.00s/it]


KeyboardInterrupt: 

In [6]:
# Inspect what the scoring loop collected so far.
# NOTE(review): the saved output below shows raw response strings, whereas
# mts_scoring returns integer scores — it looks like stale output from an
# earlier version of the function; re-run to refresh.
outputs

[[' Based on the quotations I found in the essay, I would rate the "Ideas and Content" of this essay as a 8 out of 10. The writer presents clear main ideas and uses relevant details to support their argument, but the exploration of the topic could be more in',
  ' Based on the quotations I found in the essay, I would rate the "Organization" of this essay as a score of 8. The writing has a clear structure, with identifiable sequencing and effective transitions. The ideas are mostly easy to follow, although some transitions could be improved',
  ' Based on the quotations I found in the essay, I would rate the "Voice" of this essay as a 9. The writer\'s voice is strong and appropriate, effectively engaging the audience. The writing is expressive and sincere, with a clear sense of the writer',
  ' Based on the quotations I found in the essay, I would rate the "Conventions" of this essay as follows:\n\nScore: 7\n\nThe essay contains some errors in conventions, such as inconsistent capitaliz

In [15]:
import polars as pl

# Read the trait scores saved by a finished scoring run and preview the four
# score columns (presumably one column per trait, in rubric order).
# NOTE(review): the file name says llama3_3B while the model loaded above is
# Llama-2-7b — confirm which run these scores belong to.
results = pl.read_csv("outputs/trait_scores_llama3_3B.csv")
results.select(['0', '1', '2', '3'])

0,1,2,3
i64,i64,i64,i64
8,4,8,2
2,2,2,2
6,5,4,5
2,0,2,2
4,2,4,2
…,…,…,…
4,4,4,4
4,2,2,2
2,2,2,2
5,4,6,4


In [16]:
def drop_rows_with_negative_one(df):
    """Remove every row that holds -1 in at least one numeric column.

    Only numeric columns are inspected, so text columns are never compared
    with an integer. A null cell counts as "not -1": rows containing nulls are
    kept unless another cell of the same row is -1.

    Parameters:
    -----------
    df : pl.DataFrame
        Input data frame.

    Returns:
    --------
    pl.DataFrame
        ``df`` without the rows that contain -1.
    """
    import polars.selectors as cs

    numeric_columns = df.select(cs.numeric()).columns
    if not numeric_columns:
        # Nothing that could hold -1, so there is nothing to drop.
        return df

    # True for a row as soon as one numeric cell equals -1. fill_null(False)
    # stops a null cell from turning the row mask into null, which filter()
    # would treat as "drop this row".
    has_negative_one = pl.any_horizontal(
        [(pl.col(name) == -1).fill_null(False) for name in numeric_columns]
    )

    # Keep only the rows for which the mask is False.
    return df.filter(~has_negative_one)

In [17]:
# Put the four trait-score columns next to the essay table, then discard the
# essays for which a trait score is the -1 failure sentinel.
trait_columns = results.select(['0', '1', '2', '3'])
final = drop_rows_with_negative_one(
    pl.concat([df, trait_columns], how='horizontal')
)
final

essay_set,essay_id,essay,score,0,1,2,3
i64,i64,str,i64,i64,i64,i64,i64
8,20826,""" Bell rings. Shuffle, shuffle…",60,8,4,8,2
4,10064,"""The author concludes the story…",1,2,2,2,2
3,6127,"""The features of the setting in…",3,6,5,4,5
5,13551,"""The mood created by the author…",1,2,0,2,2
6,16370,"""some of the obstacles the buil…",2,4,2,4,2
…,…,…,…,…,…,…,…
2,3090,"""They were talking about thinki…",3,4,4,4,4
3,6187,"""The features of the setting af…",1,4,2,2,2
7,18150,"""One @DATE1 @TIME1 I was very p…",14,2,2,2,2
8,20968,""" Laug…",36,5,4,6,4


In [18]:
# Summary statistics of the merged table (row count and per-column ranges).
final.describe()

statistic,essay_set,essay_id,essay,score,0,1,2,3
str,f64,f64,str,f64,f64,f64,f64,f64
"""count""",1299.0,1299.0,"""1299""",1299.0,1299.0,1299.0,1299.0,1299.0
"""null_count""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",4.163202,10237.457275,,6.800616,3.765204,3.285604,3.530408,3.34719
"""std""",2.143791,6344.699906,,8.999117,1.456981,1.462168,1.435718,1.42842
"""min""",1.0,1.0,""" …",0.0,0.0,0.0,0.0,0.0
"""25%""",2.0,4404.0,,2.0,2.0,2.0,2.0,2.0
"""50%""",4.0,9934.0,,3.0,4.0,4.0,4.0,4.0
"""75%""",6.0,15784.0,,8.0,4.0,4.0,4.0,4.0
"""max""",8.0,21599.0,"""“When they come back, Saeng vo…",60.0,8.0,8.0,8.0,8.0


In [19]:
# The overall prediction is the plain sum of the four trait scores.
trait_sum = pl.col('0') + pl.col('1') + pl.col('2') + pl.col('3')
final = final.with_columns(trait_sum.alias('total_score'))
# Clip by IQR within each essay_set
def clip_by_iqr(series, lower_limit=1.5, upper_limit=1.5):
    """Clamp ``series`` to ``[Q1 - lower_limit * IQR, Q3 + upper_limit * IQR]``.

    ``lower_limit`` and ``upper_limit`` are the IQR multipliers for the lower
    and upper fence. The result is whatever ``series.clip`` returns.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    spread = third_quartile - first_quartile
    fence_low = first_quartile - lower_limit * spread
    fence_high = third_quartile + upper_limit * spread
    return series.clip(fence_low, fence_high)

def _with_iqr_clipped_total(group):
    """Return ``group`` extended by ``total_score_iqr``, its IQR-clipped ``total_score``."""
    clipped_total = pl.Series(clip_by_iqr(group['total_score']))
    return group.with_columns([clipped_total.alias('total_score_iqr')])


# The quartiles are taken inside each essay_set, so every prompt gets its own fences.
clipped = final.group_by('essay_set').map_groups(_with_iqr_clipped_total)
clipped

essay_set,essay_id,essay,score,0,1,2,3,total_score,total_score_iqr
i64,i64,str,i64,i64,i64,i64,i64,i64,i64
6,16370,"""some of the obstacles the buil…",2,4,2,4,2,12,12
6,16486,"""The builders had to go through…",2,2,2,2,4,10,10
6,16303,"""On December 11, 1929, Al Smith…",4,4,6,4,4,18,18
6,16051,"""The builders of the Empire Sta…",4,4,4,4,4,16,16
6,15892,"""In the excerpt, The Mooring Ma…",3,4,4,2,4,14,14
…,…,…,…,…,…,…,…,…,…
5,13021,"""The mood created by the author…",3,4,4,5,4,17,17
5,13425,"""The mood that was created by t…",2,4,4,4,6,18,18
5,12439,"""The story, ""Narciso Rodriguez""…",3,4,4,4,4,16,16
5,11951,"""Reading this article expresses…",3,4,4,2,2,12,12


In [20]:
# Compare total_score with total_score_iqr to see how much the clipping changed.
clipped.describe()

statistic,essay_set,essay_id,essay,score,0,1,2,3,total_score,total_score_iqr
str,f64,f64,str,f64,f64,f64,f64,f64,f64,f64
"""count""",1299.0,1299.0,"""1299""",1299.0,1299.0,1299.0,1299.0,1299.0,1299.0,1299.0
"""null_count""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",4.163202,10237.457275,,6.800616,3.765204,3.285604,3.530408,3.34719,13.928406,13.91378
"""std""",2.143791,6344.699906,,8.999117,1.456981,1.462168,1.435718,1.42842,5.042832,4.993859
"""min""",1.0,1.0,""" …",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",2.0,4404.0,,2.0,2.0,2.0,2.0,2.0,10.0,10.0
"""50%""",4.0,9934.0,,3.0,4.0,4.0,4.0,4.0,14.0,14.0
"""75%""",6.0,15784.0,,8.0,4.0,4.0,4.0,4.0,16.0,16.0
"""max""",8.0,21599.0,"""“When they come back, Saeng vo…",60.0,8.0,8.0,8.0,8.0,31.0,30.0


In [21]:
def get_score_range(dataset_name, prompt_id):
    """Look up the ``(lowest, highest)`` score pair of a dataset's prompt.

    Knows the "ASAP" dataset with prompt ids 1-8 only; anything else raises
    ``KeyError``.
    """
    # NOTE(review): this repeats the definition from cell In [2] with the same
    # table; one of the two can be removed.
    asap_ranges = dict([
        (1, (2, 12)),
        (2, (1, 6)),
        (3, (0, 3)),
        (4, (0, 3)),
        (5, (0, 4)),
        (6, (0, 4)),
        (7, (0, 30)),
        (8, (0, 60)),
    ])
    score_ranges = {"ASAP": asap_ranges}
    dataset_ranges = score_ranges[dataset_name]
    return dataset_ranges[prompt_id]

# Min-max rescale the clipped totals of each essay_set onto that prompt's score range
def _normalize_group(group):
    """Return ``group`` extended by ``normalized_score``.

    The smallest ``total_score_iqr`` of the group maps to the lower bound of
    the prompt's score range and the largest to the upper bound. Fractions are
    truncated by the integer cast, exactly as ``int()`` did before.
    NOTE(review): truncation means only the group maximum reaches the top
    score; consider rounding instead if that is not intended.
    """
    low, high = get_score_range("ASAP", group['essay_set'][0])
    observed_min = group['total_score_iqr'].min()
    observed_max = group['total_score_iqr'].max()

    if observed_max == observed_min:
        # No spread in this group: the scaling would divide by zero, so every
        # essay gets the lower bound (where the formula sends the minimum).
        scaled = pl.lit(low, dtype=pl.Int64)
    else:
        # One vectorized expression per group; min, max and the score range
        # are looked up once instead of once per row.
        scaled = (
            (pl.col('total_score_iqr') - observed_min) * (high - low)
            / (observed_max - observed_min)
            + low
        ).cast(pl.Int64)

    return group.with_columns(scaled.alias('normalized_score'))


normalized = clipped.group_by('essay_set').map_groups(_normalize_group)
normalized

  group.with_columns([
  group.with_columns([
  group.with_columns([
  group.with_columns([
  group.with_columns([
  group.with_columns([
  group.with_columns([
  group.with_columns([


essay_set,essay_id,essay,score,0,1,2,3,total_score,total_score_iqr,normalized_score
i64,i64,str,i64,i64,i64,i64,i64,i64,i64,i64
3,6127,"""The features of the setting in…",3,6,5,4,5,20,20,2
3,7471,"""Well if the setting relates to…",1,2,2,2,2,8,8,0
3,7241,"""The features of the setting ef…",2,4,4,2,2,12,12,0
3,6619,"""I think that there were many t…",3,4,4,4,4,16,16,1
3,6547,"""He’s on a very hot climate, in…",2,2,2,2,2,8,8,0
…,…,…,…,…,…,…,…,…,…,…
2,3461,"""No books should be taken off t…",3,4,4,4,4,16,16,3
2,3479,"""A library is a place to go and…",3,2,4,4,2,12,12,2
2,3172,"""Should books, magazines, movie…",4,4,4,4,4,16,16,3
2,3501,"""There should not be any style …",4,4,4,4,4,16,16,3


In [22]:
from scipy.stats import spearmanr

# Per essay_set agreement between the gold scores and the normalized
# predictions: quadratic weighted kappa (QWK) and Spearman rank correlation.
qwk_scores = []
spearman_scores = []
# unique() makes no promise about order, so sort to keep the tables stable.
for essay_set in normalized['essay_set'].unique().sort():
    subset = normalized.filter(pl.col('essay_set') == essay_set)
    gold = subset['score'].to_numpy()
    predicted = subset['normalized_score'].to_numpy()

    # Pass the full score scale as labels so the kappa weights cover it even
    # when some scores never occur in this subset.
    low, high = get_score_range("ASAP", essay_set)
    qwk = cohen_kappa_score(
        gold,
        predicted,
        weights='quadratic',
        labels=np.arange(low, high + 1)
    )
    spearman_corr, _p_value = spearmanr(gold, predicted)
    qwk_scores.append({
        'essay_set': essay_set,
        'qwk': qwk
    })
    spearman_scores.append({
        'essay_set': essay_set,
        'spearman_corr': spearman_corr
    })

qwk_df = pl.DataFrame(qwk_scores)
spearman_df = pl.DataFrame(spearman_scores)
print("QWK scores by essay set:")
print(qwk_df)
print("スピアマンの順位相関係数 by essay set:")
print(spearman_df)

QWK scores by essay set:
shape: (8, 2)
┌───────────┬──────────┐
│ essay_set ┆ qwk      │
│ ---       ┆ ---      │
│ i64       ┆ f64      │
╞═══════════╪══════════╡
│ 1         ┆ 0.356677 │
│ 2         ┆ 0.442847 │
│ 3         ┆ 0.236312 │
│ 4         ┆ 0.441607 │
│ 5         ┆ 0.453303 │
│ 6         ┆ 0.275021 │
│ 7         ┆ 0.346879 │
│ 8         ┆ 0.210931 │
└───────────┴──────────┘
スピアマンの順位相関係数 by essay set:
shape: (8, 2)
┌───────────┬───────────────┐
│ essay_set ┆ spearman_corr │
│ ---       ┆ ---           │
│ i64       ┆ f64           │
╞═══════════╪═══════════════╡
│ 1         ┆ 0.383062      │
│ 2         ┆ 0.519228      │
│ 3         ┆ 0.509863      │
│ 4         ┆ 0.630465      │
│ 5         ┆ 0.693902      │
│ 6         ┆ 0.465543      │
│ 7         ┆ 0.327896      │
│ 8         ┆ 0.423483      │
└───────────┴───────────────┘


In [23]:
# Unweighted mean of the per-essay_set QWK values (every set counts equally).
qwk_df['qwk'].mean()

0.34544701334307726

In [24]:
# Unweighted mean of the per-essay_set Spearman correlations.
spearman_df['spearman_corr'].mean()

0.4941801932936464