In [17]:
import random
import glob
import json
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertModel
import pytorch_lightning as pl

from nltk import word_tokenize
from nltk.corpus.util import LazyCorpusLoader
from nltk.corpus.reader import *
import re # 正規表現
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import jaccard_score
from sklearn.metrics import hamming_loss
import numpy as np
MODEL_NAME="bert-base-cased"


In [16]:
# コーパスのロード
ma_reuters = LazyCorpusLoader(
    'ma_reuters', CategorizedPlaintextCorpusReader, '(training|test).*',
    cat_file='cats.txt', encoding='ISO-8859-2')

# MA_Reutersのロード
documents = ma_reuters.fileids()

# 訓練とテストデータの文書IDの抽出
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]

# 訓練とテストデータの生データの抽出
train_docs = [ma_reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [ma_reuters.raw(doc_id) for doc_id in test_docs_id]

# カテゴリーのリスト
categories = ma_reuters.categories()
num_categories = len(categories)


mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([ma_reuters.categories(doc_id)
                                  for doc_id in train_docs_id])
test_labels = mlb.transform([ma_reuters.categories(doc_id)
                             for doc_id in test_docs_id])

In [3]:
class BertForSequenceClassificationMultiLabel(torch.nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        # BertModelのロード
        self.bert = BertModel.from_pretrained(model_name)
        # 線形変換を初期化しておく
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        # データを入力しBERTの最終層の出力を得る
        bert_output = self.bert(
            input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # attention_mask:paddingじゃないトークンのこと
        last_hidden_state = bert_output.last_hidden_state

        # [PAD]以外のトークンで隠れ状態の平均を取る
        averaged_hidden_state = (
            last_hidden_state*attention_mask.unsqueeze(-1)).sum(1)/attention_mask.sum(1, keepdim=True)

        # 線形変換
        scores = self.linear(averaged_hidden_state)

        # 出力の形式を整える
        output = {"logits": scores}

        # labelsが入力に含まれていたら、損失を計算し出力する
        if labels is not None:
            loss = torch.nn.BCEWithLogitsLoss()(scores, labels.float())
            output["loss"] = loss

        # 属性でアクセスできるようにする
        output = type("bert_output", (object,), output)

        return output


In [19]:
tokenizer=BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
bert_scml=BertForSequenceClassificationMultiLabel(MODEL_NAME,num_labels=num_categories)
#bert_scml=bert_scml.cuda()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BertJapaneseTokenizer'.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model

In [20]:
text_list = test_docs
labels_list = test_labels

# データの符号化
encoding = tokenizer(text_list, padding="longest", return_tensors="pt")
#encoding = {k: v.cuda() for k, v in encoding.items()}  # データをGPUに載せる
#labels = torch.tensor(labels_list).cuda()
labels = torch.tensor(labels_list)

# BERTへデータを入力し分類スコアを得る
with torch.no_grad():
    output = bert_scml(**encoding)  # **encoding:辞書であるencodingの中身を展開して入力する
scores = output.logits

# スコアが正ならば、そのカテゴリを選択する
labels_predicted = (scores > 0).int()

# 精度の計算
num_correct = (labels_predicted == labels).all(-1).sum().item()
accuracy = num_correct/labels.size(0)


KeyboardInterrupt: 

In [6]:
def tokenize(text):  # テキストを小文字英語トークンに変換してリストで返す関数
    min_length = 3  # 3文字以上のものだけ残す
    words = [word.lower() for word in word_tokenize(text)]
    p = re.compile('[a-zA-Z]+')  # 小文字化しているが一応アルファベットで開始
    filtered_tokens = [token for token in words
                       if p.match(token) and len(token) >= min_length]
    return filtered_tokens
