In [39]:
import os
import json
from utils import EXP_ROOT, load_jsonl

# Directory, relative to EXP_ROOT, that holds the *.jsonl dumps read by the cells below.
_FILTERING_SUBDIR = "datasets/kg-datasets/ja-0.5/eval_qa/04_filtering"
dump_root = os.path.join(EXP_ROOT, _FILTERING_SUBDIR)


In [None]:
# Request IDs whose prompt and generation should be printed for manual inspection.
target_ids = [
    "jjsca@@28/2/28_2_204_sentid:0",
    "jplantres1887@@77/909/77_909_73_sentid:2",
    "jjshs1925@@55/1/55_1_22_sentid:10",
    "ringe1963@@58/12/58_12_2891_sentid:9",
    "tando1987@@15/5/15_354_sentid:6",
    "tonyobyo@@52/7/52_7_527_sentid:1",
    "dermatol@@132/7/132_1665_sentid:2",
    "cleftpalate1976@@9/1/9_42_sentid:6",
    "jpi@@49/6/49_6_315_sentid:1",
    "jrd@@43/6/43_97-436j107_sentid:5",
    "jdh@@50/3/50_KJ00003759094_sentid:6",
]

data_path = os.path.join(dump_root, "en_generation.0.jsonl")
items = list(load_jsonl(data_path))

# Index the records by request_id once, so each target is a dict lookup rather
# than a full scan of `items`. Records sharing an ID keep their file order.
items_by_id = {}
for item in items:
    items_by_id.setdefault(item["request_id"], []).append(item)

for target_id in target_ids:
    # The separator is printed for every target, even when no record matches it.
    print("================================")
    for item in items_by_id.get(target_id, []):
        print(item['message'][1]['content'])
        print(json.dumps(item["generation"], indent=4, ensure_ascii=False))
    

In [10]:
# Tally how many records were judged "Good" on each of the three quality axes,
# and how many are "usable" (all three axes "Good").
data_path = os.path.join(dump_root, "ja_generation.0.jsonl")
data = list(load_jsonl(data_path))
# Fail with a readable message instead of a ZeroDivisionError in the summary below.
assert data, "no records loaded from " + data_path

print_detail = False  # set to True to dump every prompt together with its verdicts
fill_in_blank_qualities = 0
paraphrase_qualities = 0
options_qualities = 0
usable_instance = 0  # number of records where all three verdicts are "Good"
usable_instances = set()  # request_ids of those records

id2usable = {}  # request_id -> 1 if usable else 0
id2data = {item["request_id"]: item for item in data}
for i, record in enumerate(data):
    generation = record["generation"]
    if print_detail:
        print("================== Instance ", i, " ==================")
        print(record["message"][1]["content"])
        print("fill_in_blank_quality: ", generation["fill_in_blank_quality"])
        print("paraphrase_quality: ", generation["paraphrase_quality"])
        print("options_quality: ", generation["options_quality"])
        print("explanations: ", generation["explanations"])

    # Any verdict other than the exact string "Good" counts as 0.
    options_quality = int(generation["options_quality"] == "Good")
    fill_in_blank_quality = int(generation["fill_in_blank_quality"] == "Good")
    paraphrase_quality = int(generation["paraphrase_quality"] == "Good")
    options_qualities += options_quality
    fill_in_blank_qualities += fill_in_blank_quality
    paraphrase_qualities += paraphrase_quality

    # Compute the "all three are Good" flag once and reuse it everywhere.
    is_usable = int(options_quality + fill_in_blank_quality + paraphrase_quality == 3)
    usable_instance += is_usable
    if is_usable:
        usable_instances.add(record["request_id"])
    id2usable[record["request_id"]] = is_usable

total = len(data)
print("Overall options_quality: ", options_qualities / total)
print("Overall fill_in_blank_quality: ", fill_in_blank_qualities / total)
print("Overall paraphrase_quality: ", paraphrase_qualities / total)
print("Overall usable_instance: ", usable_instance / total, total)


Overall options_quality:  0.8258769700050839
Overall fill_in_blank_quality:  0.818759532282664
Overall paraphrase_quality:  0.8253685815963396
Overall usable_instance:  0.8185053380782918 3934


In [None]:
import pandas as pd
# df = pd.read_csv("./ja_evaluation.final.csv")
# Compare the human OK/NG annotation with the automatic usability flag in
# `id2usable` (built in an earlier cell) over the first 100 annotated rows.
df = pd.read_excel("./annotation/ja_evaluation.ynaga.xlsx")

tp, tn, fp, fn = 0, 0, 0, 0
annotated_ids = df["request_id"].tolist()[:100]
annotated_labels = df["選択肢としての的確性（Aから判断）"].tolist()[:100]
for request_id, evaluation in zip(annotated_ids, annotated_labels):
    # Normalise once; every label must start with "ok" or "ng".
    label = evaluation.lower().strip()
    assert label.startswith("ok") or label.startswith("ng")
    evaluation = not label.startswith("ng")
    assert request_id in id2usable
    machine_flag = id2usable[request_id]

    if evaluation:
        if machine_flag == 1:
            # Human OK, machine usable.
            tp += 1
        elif machine_flag == 0:
            # Human OK but machine rejected: show the prompt and the verdicts.
            print(id2data[request_id]['message'][1]['content'])
            print(json.dumps(id2data[request_id]["generation"], indent=4, ensure_ascii=False))
            fn += 1
    else:
        if machine_flag == 0:
            # Human NG, machine rejected.
            tn += 1
        elif machine_flag == 1:
            # Human NG but machine marked it usable.
            fp += 1

print("tp, tn, fp, fn: ", tp, tn, fp, fn)

### 入力:
```json
{
    "sentence": "「絵具」は,着色剤(顔料)と展色剤の混合物であり,着色剤は,展色剤に溶けるものであってはならない。",
    "triple": {
        "subject": "絵具",
        "relation": "は",
        "object": "着色剤（顔料）と展色剤の混合物"
    },
    "fill_in_blank": "「[BLANK]」は、着色剤（顔料）と展色剤の混合物であり、着色剤は展色剤に溶けるものであってはならない。",
    "question": "着色剤（顔料）と展色剤の混合物として定義されるものは何ですか？",
    "answer": "絵具",
    "distractors": [
        "インク",
        "染料",
        "ワニス"
    ]
}
```

{
    "explanations": {
        "fill_in_blank_quality": "空欄に『絵具』を入れると文法的にも意味的にも自然で、元文の事実と一致する。ただし内容は美術材料の定義であり、生物医学的知識要件（医学・生物学の一般化可能な事実）を満たしていないためStrict評価ではPoor。",
        "paraphrase_quality": "質問も『着色剤（顔料）と展色剤の混合物として定義されるものは何か』と意味的には一致しているが、問う内容が美術材料の定義であり生物医学的知識ではないためPoor。",
        "options_quality": "正答とディストラクターは形式的に妥当で、意味カテゴリや長さも揃っているが、設問自体が生物医学的知識評価になっていないためStrict評価でPoor。"
    },
    "fill_in_blank_quality": "Poor",
    "paraphrase_quality": "Poor",
    "options_quality": "Poor"
}
### 入力:
```json
{
    "sentence": "実際、隕石中に含まれている鉱物は珪酸塩鉱物(かんらん石、輝石、長石)と鉄ニッケル合金が中心であり、

In [41]:
# `typing.Counter` is a deprecated annotation alias; the runtime container
# lives in `collections`.
from collections import Counter

# Load the cross-language generations and tabulate their integer scores.
data_path = os.path.join(dump_root, "ja_generation.crosslang.jsonl")
data = list(load_jsonl(data_path))
usable_instances = set()  # NOTE(review): rebinds the name to an empty set and is not used in this cell — confirm it is still needed
id2data = {item["request_id"]: item for item in data}
# request_id -> integer score; a repeated request_id keeps its last record's score.
id2score = {item["request_id"]: int(item["generation"]["score"]) for item in data}
counter = Counter(id2score.values())
print(counter)
# Score 2 is the value reported as "fully supported".
print("Proportion of fully supported instances:", counter[2] / len(data))

Counter({2: 2106, 0: 333, 1: 233})
Proportion of fully supported instances: 0.7881736526946108


In [None]:
import pandas as pd
# df = pd.read_csv("./ja_evaluation.final.csv")
# Print the human "cross-lang" score next to the machine score for every
# request in `id2score` (built in an earlier cell), and dump the prompt and
# generation whenever the machine gave 2 but the human did not.
df = pd.read_csv("./annotation/ja_evaluation.csv")

# Build the request_id -> human score lookup once instead of re-listing and
# re-filtering the whole frame on every iteration. setdefault keeps the first
# row for a repeated request_id, as the row filter followed by [0] did.
human_scores = {}
for annotated_id, annotated_score in zip(df["request_id"].tolist(), df["cross-lang"].tolist()):
    human_scores.setdefault(annotated_id, annotated_score)

for request_id, score in id2score.items():
    assert request_id in human_scores
    human_eval = human_scores[request_id]
    print("human_eval:", human_eval, "machine_score:", score)
    if score == 2 and human_eval != 2:
        print(id2data[request_id]['message'][1]['content'])
        print(json.dumps(id2data[request_id]["generation"], indent=4, ensure_ascii=False))

human_eval: 2 machine_score: 2
human_eval: 0 machine_score: 0
human_eval: 1 machine_score: 2
### 入力:
```json
{
    "en-context": "Using the model of knee pain reaction induced by intra-articular injection of endogenous pain substances, especially bradykinin (BK) in rats, the mechanism of the analgesic effect of sodium hyaluronate (SPH) was investigated. The simultaneous administration of prostaglandin E2 with BK or hyaluronidase digestion of endogenous hyaluronic acid (HA) in our experiments brought remarkable hyperalgesia on BK-induced knee pain. These results suggest that higher sensitivity to the pain reaction is induced in a diseased joint (higher prostaglandin content, lower concentration and molecular size of HA in synovial fluid) than in a normal one. SPH definitely decreased BK-induced pain, and its analgesic effect was observed for a longer period, depending on its dose in pre-treatment and the degree of its distribution in synovial tissues. As the analgesic effect of SPH was 