<a href="https://colab.research.google.com/github/tanakt-hub/Test/blob/main/Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ライブラリの準備

In [85]:
# Install Hugging Face Transformers and the MeCab binding / dictionary packages.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve different releases.
!pip install transformers
!pip install mecab-python3 fugashi ipadic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
from transformers import  AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, AdamW, BertConfig, BertJapaneseTokenizer, BertTokenizer, TFBertModel, AutoModel, AutoTokenizer

from tqdm.notebook import tqdm
from IPython.display import display, HTML
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Mecab関連の準備

In [87]:
import os, sys

# MeCab & NEologd
!apt install mecab libmecab-dev mecab-ipadic-utf8 file
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -a -y # 公式では -a オプションはついていないが多分必要
os.environ['MECABRC'] = "/etc/mecabrc" # 環境変数でmecabrcの場所を指定

# 万病辞書
!wget http://sociocom.jp/~data/2018-manbyo/data/MANBYO_201907_Dic-utf8.dic

import subprocess
cmd = 'echo `mecab-config --dicdir`"/mecab-ipadic-neologd"'
neologd_dic_dir_path = subprocess.check_output(cmd, shell=True).decode('utf-8').strip()

# 万病辞書へのパス
manbyo_dic_path = 'MANBYO_201907_Dic-utf8.dic'

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libmecab-dev is already the newest version (0.996-5).
mecab is already the newest version (0.996-5).
mecab-ipadic-utf8 is already the newest version (2.7.0-20070801+main-1).
file is already the newest version (1:5.32-2ubuntu0.4).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
fatal: destination path 'mecab-ipadic-neologd' already exists and is not an empty directory.
[install-mecab-ipadic-NEologd] : Start..
[install-mecab-ipadic-NEologd] : Check the existance of libraries
[install-mecab-ipadic-NEologd] :     find => ok
[install-mecab-ipadic-NEologd] :     sort => ok
[install-mecab-ipadic-NEologd] :     head => ok
[install-mecab-ipadic-NEologd] :     cut => ok
[install-mecab-ipadic-NEologd] :     egrep => ok
[install-mecab-ipadic-NEologd] :   

# BERTモデル


## medBERTjp

In [None]:
!wget https://github.com/ou-medinfo/medbertjp/releases/download/v0.1-minj/medBERTjp_L12_H768_A12_WWM_mecab-ipadic-neologd-jmedic.zip
!unzip medBERTjp_L12_H768_A12_WWM_mecab-ipadic-neologd-jmedic.zip

--2022-09-20 14:53:57--  https://github.com/ou-medinfo/medbertjp/releases/download/v0.1-minj/medBERTjp_L12_H768_A12_WWM_mecab-ipadic-neologd-jmedic.zip
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/306421029/750ea280-155c-11eb-9eb1-dd3e8ea4da0f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220920%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220920T145357Z&X-Amz-Expires=300&X-Amz-Signature=9320a94ebd2ff16d972ed3fcb580a4da9c843b081b3a84225b45f8481336fd83&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=306421029&response-content-disposition=attachment%3B%20filename%3DmedBERTjp_L12_H768_A12_WWM_mecab-ipadic-neologd-jmedic.zip&response-content-type=application%2Foctet-stream [following]
--2022-09-20 14:53:57--  https://objects.githubusercontent.c

In [None]:
# Model identifier handed to `from_pretrained` for both the tokenizer and the encoder
# (presumably the directory produced by unzipping the archive of the same name — confirm).
MEDBERT = 'medBERTjp_L12_H768_A12_WWM_mecab-ipadic-neologd-jmedic'

# 学習

学習用データの処理

In [None]:
SEED = 0

import urllib.request

# Seed NumPy and PyTorch too, not just the split below: the new classification layer's
# initial weights and the shuffling of the training DataLoader both draw from these generators.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Tab-separated file; the columns read below are "flg" (label) and "text" (sentence).
df = pd.read_table("https://raw.githubusercontent.com/tanakt-hub/Test/main/data/Label-y1.txt")

# Separate the labels from the sentences
labels = df["flg"].values
sentences = df["text"].values

label_ids = labels

# Split into training and test data at 7:3, stratified on the label
train_sentence, test_sentence, y_train, y_test = train_test_split(
    sentences,
    label_ids,
    test_size=0.3,
    random_state=SEED,
    stratify=label_ids,
)

学習用クラスと損失関数の定義

In [None]:
class TrainDataset():
    """Map-style dataset pairing pre-tokenized encodings with their targets.

    Args:
        toks: Indexable collection of encodings; each entry is read through the keys
            "input_ids", "attention_mask" and "token_type_ids".
        targets: Indexable collection of targets, aligned with ``toks`` by position.
    """

    def __init__(self, toks, targets):
        self.toks = toks
        self.targets = targets

    def __len__(self):
        """Number of samples, taken from the length of ``toks``."""
        return len(self.toks)

    def __getitem__(self, item):
        """Return sample ``item`` as a dict of tensors; the target is cast to float."""
        encoding = self.toks[item]
        label = self.targets[item]

        sample = {
            field: torch.tensor(encoding[field])
            for field in ("input_ids", "attention_mask", "token_type_ids")
        }
        sample["target"] = torch.tensor(label).float()
        return sample

In [None]:
class BertClassification(nn.Module):
    """Pretrained encoder followed by a single-logit linear head.

    Args:
        model_type: Model name or path handed to ``from_pretrained``.
        tokenizer: Tokenizer whose ``vocab_size`` is written into the model config.
    """

    def __init__(self, model_type, tokenizer):
        super(BertClassification, self).__init__()

        # Load the checkpoint's own configuration. Calling `BertConfig(model_type, ...)` binds
        # `model_type` to the first positional parameter (vocab_size) and leaves every other
        # setting at the library default instead of the value stored with the checkpoint.
        bert_conf = BertConfig.from_pretrained(model_type, output_hidden_states=False, output_attentions=True)
        bert_conf.vocab_size = tokenizer.vocab_size

        self.bert = AutoModel.from_pretrained(model_type, config=bert_conf, ignore_mismatched_sizes=True)
        self.fc = nn.Linear(bert_conf.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and the head.

        Args:
            ids: Input token ids.
            mask: Attention mask, forwarded as ``attention_mask``.
            token_type_ids: Segment ids.

        Returns:
            Tuple ``(h, a)``: ``h`` is column 0 of the linear head's output (one logit per row);
            ``a`` is the last entry of the encoder's ``attentions``, summed over dim 1 and taken
            at index 0 of the next dim (presumably head-summed attention of the first token
            over the sequence — confirm against the encoder's output layout).
        """
        out = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        h = out['pooler_output']
        a = out['attentions']
        h = nn.ReLU()(h)
        h = self.fc(h)
        h = h[:, 0]
        a = a[-1].sum(1)[:, 0, :]
        return h, a

In [None]:
loss_fn = nn.BCEWithLogitsLoss()

def train_loop(train_dataloader, model, optimizer, device, tqdm):
    losses = []
    model.train()
    optimizer.zero_grad()
    for n_iter, d in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        token_type_ids = d["token_type_ids"].to(device)
        target = d["target"].to(device)

        output, _ = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(output, target)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        losses.append(loss.item())
    return losses

def test_loop(test_dataloader, model, device, tqdm):
    """Evaluate ``model`` on ``test_dataloader`` without updating it.

    Args:
        test_dataloader: Iterable of batches (dicts with "input_ids", "attention_mask",
            "token_type_ids" and "target" tensors) that also supports ``len()``.
        model: Callable returning a pair whose first element is the logits.
        device: Device every batch tensor is moved to.
        tqdm: Progress-bar factory called as ``tqdm(iterable, total=...)``.

    Returns:
        Tuple ``(predicts, mean_loss)``: the sigmoid of every logit as a flat Python list,
        and the mean of the per-batch losses.
    """
    model.eval()

    batch_losses = []
    predicts = []
    for _, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        segments = batch["token_type_ids"].to(device)
        labels = batch["target"].to(device)

        # Forward pass only; no gradients are recorded.
        with torch.no_grad():
            logits, _ = model(ids, mask, segments)

        batch_losses.append(loss_fn(logits, labels).item())
        predicts.extend(logits.sigmoid().cpu().tolist())

    return predicts, np.array(batch_losses).mean()

パラメータとトークナイザの定義

In [None]:
MODEL_TYPE = MEDBERT
LEARNING_RATE = 1e-6
# Backward-compatible alias: the original name was misspelled and other cells still use it.
LEAENING_RATE = LEARNING_RATE
BATCH_SIZE = 30
N_EPOCHS = 30

# Tokenizer whose MeCab option string selects the NEologd dictionary directory (-d)
# and the Manbyo dictionary file as user dictionary (-u).
TOKENIZER = BertJapaneseTokenizer.from_pretrained(
    MODEL_TYPE,
    mecab_kwargs={"mecab_option": f"-d {neologd_dic_dir_path} -u {manbyo_dic_path}"},
)

In [None]:
# Sanity check: display the token list the tokenizer produces for a sample clinical sentence.
TOKENIZER.tokenize("病理組織学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。脳脊髄液検査では、蛋白 105 mg/dL、細胞数 24/μLで、細菌培養及びウイルス検査は異常なかった。")

In [None]:
# Sanity check: display the batch encoding of a one-element list holding the same sample sentence.
TOKENIZER.batch_encode_plus(["病理組織学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。脳脊髄液検査では、蛋白 105 mg/dL、細胞数 24/μLで、細菌培養及びウイルス検査は異常なかった。"])

学習データとテストデータをともにトークナイズし、dataloaderを定義する

In [None]:
# Every sentence is padded or truncated to exactly this many tokens.
MAX_LENGTH = 128

def encode_sentences(sentences, max_length=MAX_LENGTH):
    """Encode each sentence to a fixed length with special tokens added.

    `padding="max_length"` and `truncation=True` replace the deprecated `pad_to_max_length=True`
    and make the truncation explicit rather than relying on the tokenizer's warned fallback.

    Args:
        sentences: Iterable of strings.
        max_length: Target length in tokens.

    Returns:
        List with one encoding per sentence, in input order.
    """
    return [
        TOKENIZER.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
        )
        for sent in sentences
    ]

train_toks = encode_sentences(train_sentence)
test_toks = encode_sentences(test_sentence)

train_dataset = TrainDataset(train_toks, y_train)
test_dataset = TrainDataset(test_toks, y_test)

# Training batches are shuffled and the incomplete final batch is dropped.
train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        drop_last=True,
        shuffle=True,
)
# Evaluation keeps every sample in its original order so predictions line up with y_test.
test_dataloader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        drop_last=False,
        shuffle=False,
)

学習を行う

In [None]:
# Build the classifier and move it to `device`; as the cell's last expression, the module is displayed.
# NOTE(review): the following training cell constructs the model again, replacing this instance.
model = BertClassification(MODEL_TYPE, TOKENIZER)
model.to(device)

In [None]:
# Fresh classifier on the target device, optimised with AdamW.
model = BertClassification(MODEL_TYPE, TOKENIZER)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LEAENING_RATE)

train_losses = []  # one entry per training batch, across all epochs
test_losses = []   # one entry per epoch
for epoch in range(N_EPOCHS):
    print(f"Epoch-{epoch}")

    epoch_train_losses = train_loop(train_dataloader, model, optimizer, device, tqdm)
    train_losses.extend(epoch_train_losses)

    y_pred, test_loss = test_loop(test_dataloader, model, device, tqdm)
    test_losses.append(test_loss)

    # Show the confusion matrix for this epoch (probabilities thresholded at 0.5)
    _y_pred = (np.asarray(y_pred) > 0.5).astype(int)
    cm = confusion_matrix(y_test, _y_pred)
    cm_df = pd.DataFrame(
        cm,
        index=['Actual Neg', 'Actual Pos'],
        columns=['Predicted Neg', 'Predicted Pos'],
    )
    display(cm_df)

学習結果の確認

In [None]:
# Training loss, one point per optimisation step across all epochs.
fig, ax = plt.subplots()
ax.plot(train_losses)
ax.set(title="Training loss", xlabel="training step (batch)", ylabel="loss")
plt.show()

In [None]:
# Test loss, one point per epoch (mean of the per-batch losses).
fig, ax = plt.subplots()
ax.plot(test_losses)
ax.set(title="Test loss", xlabel="epoch", ylabel="mean loss")
plt.show()

# LIMEによる可視化

In [None]:
# Install LIME, used below for the text explanations.
!pip install lime

In [None]:
def predictor(texts, batch_size=32):
    """Return two-column class probabilities for a collection of texts.

    Args:
        texts: Sequence of strings to classify.
        batch_size: Number of texts per forward pass. Splitting the input keeps memory bounded
            when the caller submits many texts at once (LIME passes all its perturbed samples
            in a single call).

    Returns:
        Array of shape ``(len(texts), 2)``: column 0 is ``1 - p`` and column 1 is ``p``,
        where ``p`` is the sigmoid of the model's logit.
    """
    texts = list(texts)
    if not texts:
        # Nothing to encode; return an empty result with the same two-column layout.
        return np.empty((0, 2), dtype=np.float32)

    # Inference must not depend on whichever mode an earlier cell left the model in.
    model.eval()

    chunks = []
    for start in range(0, len(texts), batch_size):
        tok = TOKENIZER.batch_encode_plus(texts[start:start + batch_size], padding=True)
        input_ids = torch.tensor(tok['input_ids']).to(device)
        attention_mask = torch.tensor(tok['attention_mask']).to(device)
        token_type_ids = torch.tensor(tok['token_type_ids']).to(device)

        with torch.no_grad():
            output, _ = model(input_ids, attention_mask, token_type_ids)

        chunks.append(output.sigmoid().cpu().numpy())

    probas = np.concatenate(chunks)
    return np.vstack([1 - probas, probas]).T

In [None]:
from lime.lime_text import LimeTextExplainer
# Text explainer with default settings; the two class names are given in the same order as
# the columns predictor returns (1 - p, then p).
explainer = LimeTextExplainer(class_names=['Neg', 'Pos'])

# トライアル

全てのセルの実行完了後は↓のセルのみで実行可能。

ダブルクォーテーションの中の文章を好きに入れ替えてCtrl+Enterで実行結果が更新されます。

In [None]:

# Sentence to explain: replace the text inside the double quotes and re-run the cell.
str_to_predict = "病理組織学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。脳脊髄液検査では、蛋白 105 mg/dL、細胞数 24/μLで、細菌培養及びウイルス検査は異常なかった。"
exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=100)
# `text` is an on/off flag for rendering the highlighted text, not the text itself.
exp.show_in_notebook(text=True)


In [None]:
# Scratch check: tokenize one sentence and pass the token list to predictor,
# so every token is scored as if it were its own text.
txt = "病理組織学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。"
str_to_predict = TOKENIZER.tokenize(txt)
print(len(txt))  # number of characters in the sentence
print(len(str_to_predict))  # number of tokens
print(len(predictor(str_to_predict)))  # number of rows predictor returns for the token list
print(' '.join(str_to_predict))  # tokens joined by single spaces

In [None]:
# Display the probability rows predictor returns for the current value of str_to_predict.
predictor(str_to_predict)

In [None]:
# Display the current value of str_to_predict.
str_to_predict

In [None]:
## LIME needs a suitable splitter. Tokenizing the sentence, joining the tokens with spaces
## and setting the splitter to \s should probably work.
txt = "病理組織脳脊髄液検査では異常が無かったが、細菌培養及びウイルス検査は異常があった。"
str_to_predict = TOKENIZER.tokenize(txt)
txtw = ' '.join(str_to_predict)

# Replaces the module-level `explainer`: split on whitespace, with bow=False.
explainer = LimeTextExplainer(class_names=['Neg', 'Pos'], split_expression=r'\s', bow=False,)

exp = explainer.explain_instance(txtw, predictor, num_features=10, num_samples=1000)
# `text` is an on/off flag for rendering the highlighted text; the token list was only
# accepted because a non-empty list is truthy.
exp.show_in_notebook(text=True)

In [None]:

# Same explanation run on a sentence whose clauses are in a different order.
str_to_predict = "病理組織脳脊髄液検査では、蛋白 105 mg/dL、細胞数 24/μLで、細菌培養及びウイルス検査は異常なかった。学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。"
exp = explainer.explain_instance(str_to_predict, predictor, num_features=20, num_samples=100)
# `text` is an on/off flag for rendering the highlighted text, not the text itself.
exp.show_in_notebook(text=True)

In [None]:
# Scratch check of the batch API on a short string.
# NOTE(review): a bare str, not a list of str, is passed here — confirm how the tokenizer treats it.
TOKENIZER.batch_encode_plus("ほげろ", padding=True)

In [None]:
# Scratch check of the single-text API on the same short string.
TOKENIZER.encode_plus("ほげろ", padding=True)

In [None]:
# Encode one short string as PyTorch tensors and print the model's raw output tuple.
tok = TOKENIZER.encode_plus("ほげろ", padding=True, return_tensors = 'pt')
# return_tensors='pt' already yields tensors; wrapping them in torch.tensor() again only
# copies them and raises a copy-construct warning.
input_ids = tok['input_ids'].to(device)
attention_mask = tok['attention_mask'].to(device)
token_type_ids = tok['token_type_ids'].to(device)

with torch.no_grad():
    print(model(input_ids, attention_mask, token_type_ids))

In [None]:
# Scratch comparison of the single-text and batch encoding APIs on the same sentence.
texts = "病理組織学的所見では、真皮における血管周囲のリンパ球浸潤、並びに表皮における軽度の空胞変性及びリンパ球浸潤などを認めた。"
# Single-text encoding, returned as PyTorch tensors.
tok = TOKENIZER.encode_plus(texts, padding=True,  return_tensors = 'pt')
# Batch API called without return_tensors.
# NOTE(review): `texts` is a str, not a list of str — confirm this is the intended input.
tok1 = TOKENIZER.batch_encode_plus(texts, padding=True)
print(len(texts))  # number of characters in the sentence
print(tok)
print(tok1)

In [None]:
# Forward pass on the tensor encoding `tok`.
input_ids = tok['input_ids'].to(device)
attention_mask = tok['attention_mask'].to(device)
token_type_ids = tok['token_type_ids'].to(device)

print(len(input_ids))  # size of the first dimension of input_ids
with torch.no_grad():
    # The model's forward returns exactly two values (logits, attention); unpacking three
    # raises "ValueError: not enough values to unpack".
    output, a = model(input_ids, attention_mask, token_type_ids)

probas = output.sigmoid().cpu().numpy()
print(probas)

In [None]:
# Forward pass on the list-based batch encoding `tok1`.
input_ids = torch.tensor(tok1['input_ids']).to(device)
attention_mask = torch.tensor(tok1['attention_mask']).to(device)
token_type_ids = torch.tensor(tok1['token_type_ids']).to(device)
print(len(input_ids))  # size of the first dimension of input_ids
with torch.no_grad():
    # The model's forward returns exactly two values (logits, attention); unpacking three
    # raises "ValueError: not enough values to unpack".
    output, a = model(input_ids, attention_mask, token_type_ids)

probas = output.sigmoid().cpu().numpy()
print(probas)
# Two-column layout: 1 - p in column 0, p in column 1.
np.vstack([1 - probas, probas]).T

In [None]:

# Inline copy of predictor's body, run on the tensor encoding `tok`.
# `tok` already holds tensors, so they are moved to the device without re-wrapping.
input_ids = tok['input_ids'].to(device)
attention_mask = tok['attention_mask'].to(device)
token_type_ids = tok['token_type_ids'].to(device)

with torch.no_grad():
    output, _ = model(input_ids, attention_mask, token_type_ids)

probas = output.sigmoid().cpu().numpy()
# A top-level `return` is a SyntaxError in a cell; leave the result as the last expression
# so that it is displayed instead.
np.vstack([1 - probas, probas]).T