In [None]:
import os

from utils import EXP_ROOT, DATA_ROOT, load_jsonl, load_jsonl_iteratively

# Dataset directory under DATA_ROOT; the Japanese and English files sit in
# separate sub-folders of it.
data_root = os.path.join(DATA_ROOT, "datasets/medical")

# Index every record of both files by its 'docid' so the two languages can be
# looked up by the same key.  The Japanese file is read first, then the
# English one; a repeated docid keeps the last record seen.
ja_items, en_items = {}, {}
for rel_path, bucket in (("ja/data.jsonl", ja_items),
                         ("en_jstage/data.jsonl", en_items)):
    for item in load_jsonl_iteratively(os.path.join(data_root, rel_path)):
        bucket[item['docid']] = item

In [9]:
# Pair each Japanese record with the English record sharing its docid.
# Iteration is driven by the Japanese side, so a docid missing from
# `en_items` raises KeyError here.
all_items = {}
for docid, ja_item in ja_items.items():
    en_item = en_items[docid]
    all_items[docid] = {'ja': ja_item, 'en': en_item}

In [3]:
# Directory (under EXP_ROOT) that holds the English QA generation output.
qa_root = os.path.join(
    EXP_ROOT,
    "datasets/kg-datasets/ja-0.5/eval_qa/03_en_qa",
)

# Load the whole generation file up front; the records are iterated below.
en_generation_path = os.path.join(qa_root, "en_generation.jsonl")
en_generations = load_jsonl(en_generation_path)

In [37]:
# Attach the generated QA and its source triple to the matching document.
# Only the first four records (i = 0..3) are processed before the loop exits;
# `item` and `docid` keep the values of the last record visited and are
# inspected by the following cells.
for i, item in enumerate(en_generations):
    metadata = item['metadata']
    docid = metadata['docid']
    all_items[docid]["en_qa"] = item['generation']
    all_items[docid]['en_triple'] = metadata['input']['triple']
    if i >= 3:
        break

In [38]:
# Inspect the record the preceding loop stopped on (`item` is that loop's variable).
item

{'request_id': 'ojjscn@@45/4/45_267_sentid:8',
 'message': [{'role': 'system',
   'content': '### Instruction: \nGiven a biomedical sentence and its associated knowledge triple, follow the steps below to generate a multiple-choice question:\n\n1. Generate a Fill-in-the-Blank Prompt: Create a fill-in-the-blank prompt for the triple by using the subject and relation as context, placing the object at the end as [BLANK]. Do not largely modify the original sentence, but ensure it is grammatically correct and clear. The prompt should be a complete sentence that can stand alone.\n2. Generate Distractors: Create three plausible but incorrect choices (distractors). Each distractor should:\n    - Be similar in length and semantic category to the given object, please ensure they are not too short or too long compared to the object.\n    - Be relevant to the subject and relation, but not the correct answer.\n3. Paraphrase into a Question: Rephrase the fill-in-the-blank prompt into a well-formed, s

In [39]:
# Japanese ('ja') abstract of the document whose `docid` was set by the last
# iteration of the QA loop.
print(all_items[docid]['ja']['abstract'])

 遺伝子プログラムの時系列的発現により形成される初期シナプス回路は, ステレオタイプで個性に乏しく, 重複が多く特異性に乏しい未熟な回路である. 生後, 受容器からの感覚刺激の増大とそれによる神経活動の亢進は, ポスト側となる中枢ニューロンの活性化と競合を招く. この過程において, 使用状況に応じたシナプスの強化と除去の選別化が起こり, 個体の経験や環境に適応した機能的回路へと改築される. われわれは, 遺伝子ノックアウトマウスを用いた形態学的解析を通して, グルタミン酸シグナル伝達に関わる分子が活動依存的なシナプス回路改築を制御している事実を, 小脳皮質や大脳皮質において明らかにしてきた.


In [40]:
# English ('en') abstract of the same `docid`, for side-by-side comparison
# with the Japanese one.
print(all_items[docid]['en']['abstract'])

  Proper functioning of the nervous system relies on the precise formation of neural circuits during development. At birth, neurons have redundant synaptic connections not only to their proper targets but also to other neighboring cells. Then, functional neural circuits are formed during early postnatal development by the selective strengthening of necessary synapses and weakening of surplus connections. Synaptic connections are also modified so that projection fields of active afferents expand at the expense of lesser ones. We have studied the molecular mechanisms underlying these activity-dependent prunings and the plasticity of synaptic circuitry using gene-engineered mice defective in the glutamatergic signaling system. NMDA-type glutamate receptors are critically involved in the establishment of the somatosensory pathway ascending from the brainstem trigeminal nucleus to the somatosensory cortex. Without NMDA receptors, whisker-related patterning fails to develop, whereas lesion-i

In [41]:
import nltk
import hanlp
# Load HanLP's pretrained UD_CTB_EOS_MUL model once at cell level;
# split_document() calls it for 'ja' and 'zh' text.
# NOTE(review): presumably a multilingual end-of-sentence splitter (per the
# `eos` / `MUL` naming) — confirm against the HanLP docs.
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
def split_document(document, lang='en'):
    """Split ``document`` into sentences with a language-specific splitter.

    Args:
        document: Text to split.
        lang: Language tag. ``'en'`` and ``'en_jstage'`` use NLTK's
            ``sent_tokenize``; ``'ja'`` and ``'zh'`` use the module-level
            ``split_sent`` HanLP model.

    Returns:
        Whatever the selected splitter returns for ``document``
        (presumably a list of sentence strings — confirm for ``split_sent``).

    Raises:
        ValueError: If ``lang`` is not one of the supported tags. Previously
            this case fell through and failed with an ``UnboundLocalError``.
    """
    if lang in ('en', 'en_jstage'):
        return nltk.tokenize.sent_tokenize(document)
    if lang in ('ja', 'zh'):
        return split_sent(document)
    # Fail loudly with an actionable message instead of an unbound local.
    raise ValueError(
        f"Unsupported lang {lang!r}; expected one of 'en', 'en_jstage', 'ja', 'zh'."
    )

                                   

In [None]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dimension 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are zeroed before summing, and
    the sum is divided by the number of unmasked positions in each row.

    Args:
        last_hidden_states: Hidden states; assumed (batch, seq_len, hidden) —
            the code sums over ``dim=1`` and broadcasts the mask over the
            last axis.
        attention_mask: Mask with one entry per position; assumed
            (batch, seq_len) with non-zero meaning "keep".

    Returns:
        One pooled vector per row. A row whose mask is entirely zero yields a
        zero vector (the divisor is clamped to 1) rather than NaN from 0 / 0.
    """
    keep = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    # Clamp so an all-padding row divides by 1 instead of 0; rows with at
    # least one kept token are unaffected.
    token_counts = attention_mask.sum(dim=1).clamp(min=1)[..., None]
    return summed / token_counts


# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'intfloat/multilingual-e5-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Every input carries a "query: " or "passage: " prefix; queries come first so
# the embedding matrix can be split by position afterwards.
queries = [
    'query: how much protein should a female eat',
    'query: 南瓜的家常做法',
]
passages = [
    "passage: As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "passage: 1.清炒南瓜丝 原料:嫩南瓜半个 调料:葱、盐、白糖、鸡精 做法: 1、南瓜用刀薄薄的削去表面一层皮,用勺子刮去瓤 2、擦成细丝(没有擦菜板就用刀慢慢切成细丝) 3、锅烧热放油,入葱花煸出香味 4、入南瓜丝快速翻炒一分钟左右,放盐、一点白糖和鸡精调味出锅 2.香葱炒南瓜 原料:南瓜1只 调料:香葱、蒜末、橄榄油、盐 做法: 1、将南瓜去皮,切成片 2、油锅8成热后,将蒜末放入爆香 3、爆香后,将南瓜片放入,翻炒 4、在翻炒的同时,可以不时地往锅里加水,但不要太多 5、放入盐,炒匀 6、南瓜差不多软和绵了之后,就可以关火 7、撒入香葱,即可出锅",
]
input_texts = queries + passages
n_queries = len(queries)

# Tokenize the whole batch at once, padded to a common length and truncated
# to at most 512 tokens.
batch_dict = tokenizer(
    input_texts,
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# L2-normalize so the dot product below is a cosine similarity, then scale by
# 100 to get a query-by-passage score matrix.
embeddings = F.normalize(embeddings, p=2, dim=1)
query_vecs, passage_vecs = embeddings[:n_queries], embeddings[n_queries:]
scores = (query_vecs @ passage_vecs.T) * 100
print(scores.tolist())


In [12]:
# NOTE(review): `en_sentences` is not assigned in any cell visible here —
# presumably the result of split_document(...) on an English abstract; confirm
# this cell still works after Restart Kernel -> Run All.
en_sentences

['Recent studies showed that makeup reduces mental stress, indicated by not only psychological but also physiological values.',
 'In this study, we examined the relationship between mental stress and the activity of reactive oxygen scavenging enzymes such as superoxide dismutase (SOD) and catalase (CAT), and the effect of the mental stress reduction by makeup on the activity of reactive oxygen scavenging enzymes.',
 'In experiment 1, we measured the concentration of cortisol and activity of SOD in saliva after the addition of mental stress.',
 'In this result, the concentration of cortisol increased and the activity of SOD decreased significantly.',
 'As we examined the effect of makeup following the mental stress addition in experiment 2, the concentration of cortisol decreased and the activity of SOD and CAT increased.',
 'Moreover, we showed a decrease in the anxiety state and increase in spiritual health as the psychological effects.',
 'Those results suggest that makeup psychologi

In [13]:
# NOTE(review): `ja_sentences` is not assigned in any cell visible here —
# presumably the result of split_document(..., lang='ja') on a Japanese
# abstract; confirm this cell still works after Restart Kernel -> Run All.
ja_sentences

['近年の研究から, メイクアップは心理的な指標だけでなく生理的な指標においても精神的ストレスの緩和効果があることが示されている。',
 '本実験において, われわれは, 精神的ストレスとsuperoxide dismutase (SOD), catalase (CAT) のような活性酸素消去酵素の関係およびメイクアップによる精神的ストレス緩和効果の活性酸素消去酵素に対する影響について調べた。',
 '実験1において, 精神的ストレス負荷前後に唾液中のコルチゾール濃度とSOD活性を測定したところ, 負荷前に比べコルチゾール濃度の増加とSOD活性の低下がみられた。',
 'また, 実験2において, 精神的ストレス負荷後にメイクアップを行った実験群は, 精神的ストレス負荷前に比べ, 唾液中のコルチゾール濃度は減少傾向にあり, SOD, CAT活性は増加した。',
 'また, 心理的効果として, 状況不安の低下や精神的健康度の上昇などが示された。',
 '以上の結果から, メイクアップは精神的ストレスによる心理的不安を解消し, 活性酸素消去酵素の活性低下を抑制すると考えられた。']