In [1]:
import random

import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
import pytorch_lightning as pl

from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import jaccard_score
from sklearn.metrics import hamming_loss
import numpy as np

MODEL_NAME='bert-base-uncased'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Register the MA_Reuters corpus with NLTK. Only files whose ids match the
# '(training|test).*' pattern are exposed, and categories come from cats.txt.
ma_reuters = LazyCorpusLoader(
    'ma_reuters',
    CategorizedPlaintextCorpusReader,
    '(training|test).*',
    cat_file='cats.txt',
    encoding='ISO-8859-2',
)

# All file ids known to the corpus reader.
documents = ma_reuters.fileids()

# Partition the file ids into training and test sets by their id prefix.
train_docs_id = list(filter(lambda file_id: file_id.startswith("train"), documents))
test_docs_id = list(filter(lambda file_id: file_id.startswith("test"), documents))

# Raw text of every document, in the same order as the id lists above.
train_docs = list(map(ma_reuters.raw, train_docs_id))
test_docs = list(map(ma_reuters.raw, test_docs_id))

# Category names defined by the corpus and how many there are.
categories = ma_reuters.categories()
num_categories = len(categories)


# Turn each document's category list into a multi-hot label vector.
# The binarizer is fitted on the training documents only and then reused
# (transform, not fit_transform) for the test documents.
mlb = MultiLabelBinarizer()
train_categories = list(map(ma_reuters.categories, train_docs_id))
train_labels = mlb.fit_transform(train_categories)
test_categories = list(map(ma_reuters.categories, test_docs_id))
test_labels = mlb.transform(test_categories)

In [3]:
class BertForSequenceClassificationMultiLabel(torch.nn.Module):
    """BERT encoder followed by masked mean pooling and a linear multi-label head.

    The final-layer hidden states are averaged over the tokens selected by
    ``attention_mask`` and projected to ``num_labels`` independent logits.
    """

    def __init__(self, model_name, num_labels):
        """Load the pretrained encoder and create the classification layer.

        Args:
            model_name: Name or path passed to ``BertModel.from_pretrained``.
            num_labels: Number of output logits (one per label).
        """
        super().__init__()
        # Pretrained BERT encoder.
        self.bert = BertModel.from_pretrained(model_name)
        # Linear projection from the pooled hidden state to per-label logits.
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Compute per-label logits and, when labels are given, the BCE loss.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Mask with non-zero entries for the tokens to pool
                over. When omitted, every position is pooled.
            token_type_ids: Segment ids forwarded to the encoder.
            labels: Optional multi-hot targets; cast to float for the loss.

        Returns:
            An object exposing ``logits`` and, if ``labels`` was given,
            ``loss`` as attributes.
        """
        # Run the encoder and take the hidden states of its final layer.
        bert_output = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = bert_output.last_hidden_state

        # The signature allows attention_mask=None, so fall back to pooling
        # over every position instead of failing on None.unsqueeze(...).
        if attention_mask is None:
            attention_mask = torch.ones(
                last_hidden_state.shape[:2],
                dtype=torch.long,
                device=last_hidden_state.device,
            )

        # Average the hidden states over the unmasked tokens only. The count
        # is clamped to 1 so a fully masked row gives zeros rather than NaN.
        token_counts = attention_mask.sum(1, keepdim=True).clamp(min=1)
        averaged_hidden_state = (
            last_hidden_state*attention_mask.unsqueeze(-1)).sum(1)/token_counts

        # Linear projection to one logit per label.
        scores = self.linear(averaged_hidden_state)

        # Collect the outputs.
        output = {"logits": scores}

        # When labels are supplied, also compute the multi-label loss.
        if labels is not None:
            loss = torch.nn.BCEWithLogitsLoss()(scores, labels.float())
            output["loss"] = loss

        # Expose the dict entries as attributes (output.logits / output.loss).
        output = type("bert_output", (object,), output)

        return output


In [4]:
# Load the tokenizer that matches the pretrained model.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Encode every training document to a fixed length and attach its label vector.
max_length = 128
dataset_for_loader = []
for text, labels in zip(train_docs, train_labels):
    encoding = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoding['labels'] = labels
    encoding = {k: torch.tensor(v) for k, v in encoding.items()}
    dataset_for_loader.append(encoding)

# Train/validation split (90% / 10%).
# A dedicated, seeded generator makes the split reproducible across runs
# without touching the global `random` state.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset_for_loader)
n = len(dataset_for_loader)
n_train = int(0.9*n)
# Everything after the training slice is validation data. Using
# int(0.1*n) here could drop a sample because both sizes are rounded down.
n_val = n - n_train
dataset_train = dataset_for_loader[:n_train]  # training data
dataset_val = dataset_for_loader[n_train:]  # validation data

# Wrap the datasets in data loaders; only the training loader is shuffled.
dataloader_train = DataLoader(
    dataset_train, batch_size=32, shuffle=True
)
dataloader_val = DataLoader(dataset_val, batch_size=256)

In [5]:
class BertForSequenceClassificationMultiLabel_pl(pl.LightningModule):
    """Lightning module that trains and evaluates the multi-label BERT classifier."""

    def __init__(self, model_name, num_labels, lr):
        """Build the wrapped classifier.

        Args:
            model_name: Pretrained model name passed to the wrapped classifier.
            num_labels: Number of labels the classifier predicts.
            lr: Learning rate, read back via ``self.hparams.lr`` when the
                optimizer is configured.
        """
        super().__init__()
        # Record the constructor arguments so they are reachable via self.hparams.
        self.save_hyperparameters()
        self.bert_scml = BertForSequenceClassificationMultiLabel(
            model_name, num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        """Log and return the loss of one training batch."""
        train_loss = self.bert_scml(**batch).loss
        self.log('train_loss', train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Log the loss of one validation batch."""
        self.log('val_loss', self.bert_scml(**batch).loss)

    def test_step(self, batch, batch_idx):
        """Log exact-match accuracy (every label of a sample correct) for one batch."""
        # The labels are removed from the batch so the model does not compute a loss.
        gold = batch.pop('labels')
        logits = self.bert_scml(**batch).logits
        # A label is predicted when its logit is positive.
        predicted = (logits > 0).int()
        exact_matches = (predicted == gold).all(-1)
        batch_size = logits.size(0)
        self.log('accuracy', exact_matches.sum().item()/batch_size)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with the configured learning rate."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

# Keep only the single checkpoint (weights only) with the lowest validation loss.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath='model/',
)

# Train on one GPU. `accelerator`/`devices` replace the deprecated `gpus`
# argument (the previous run emitted a deprecation warning for it).
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=10,
    callbacks=[checkpoint]
)

# The number of output logits must equal the width of the binarized label
# vectors, so derive it from the fitted binarizer instead of hard-coding it.
model = BertForSequenceClassificationMultiLabel_pl(
    MODEL_NAME,
    num_labels=len(mlb.classes_),
    lr=1e-5
)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceC

In [6]:
# Training: fit the model on the training loader, validating on the validation loader.
trainer.fit(model, dataloader_train, dataloader_val)
# Disabled test run. NOTE(review): `dataloader_test` is not defined in the
# visible cells - confirm it exists before re-enabling these lines.
#test = trainer.test(dataloaders=dataloader_test)
#print(f'Accuracy: {test[0]["accuracy"]:.2f}')

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4]

  | Name      | Type                                    | Params
----------------------------------------------------------------------
0 | bert_scml | BertForSequenceClassificationMultiLabel | 109 M 
----------------------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
438.098   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 9: 100%|██████████| 221/221 [01:00<00:00,  3.67it/s, loss=0.0213, v_num=11]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 221/221 [01:05<00:00,  3.35it/s, loss=0.0213, v_num=11]


In [7]:
#model=BertForSequenceClassificationMultiLabel_pl.load_from_checkpoint("model/epoch=4-step=1025.ckpt")

In [8]:
# Run inference in fixed-size chunks so a whole chunk fits in GPU memory.
# Chunking by size (rather than three hard-coded slices) keeps every chunk
# bounded regardless of how many test documents there are.
INFERENCE_CHUNK_SIZE = 900
test_docs_split = [
    test_docs[start:start + INFERENCE_CHUNK_SIZE]
    for start in range(0, len(test_docs), INFERENCE_CHUNK_SIZE)
]
labels_predicted = []
bert_scml = model.bert_scml.cuda()
# Switch to evaluation mode: torch.no_grad() alone does not disable dropout,
# and the module is not guaranteed to be in eval mode after training.
bert_scml.eval()

for docs_chunk in test_docs_split:  # inference
    # Use the same max_length as for the training data.
    encoding = tokenizer(
        docs_chunk,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    encoding = {k: v.cuda() for k, v in encoding.items()}
    with torch.no_grad():
        output = bert_scml(**encoding)
    scores = output.logits
    # A label is predicted when its logit is positive.
    label = (scores > 0).int().cpu().numpy().tolist()
    labels_predicted += label

In [9]:
# Sample-averaged Jaccard score between true and predicted label sets.
jaccard_value = np.round(
    jaccard_score(test_labels, labels_predicted, average='samples'), 3)
print(f"Jaccard係数による評価:{jaccard_value}")
# Hamming loss between true and predicted label vectors.
hamming_value = np.round(hamming_loss(test_labels, labels_predicted), 3)
print(f"Hamming損失による評価:{hamming_value}")

Jaccard係数による評価:0.849
Hamming損失による評価:0.006
