## 第12回レポート課題

In [1]:
!pip install transformers
!apt install git make curl xz-utils file
!apt install mecab libmecab-dev mecab-ipadic mecab-ipadic-utf8
!pip install mecab-python3==0.996.5

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 5.4MB/s eta 0:00:01[K     |▉                               | 20kB 1.8MB/s eta 0:00:01[K     |█▎                              | 30kB 2.3MB/s eta 0:00:01[K     |█▊                              | 40kB 2.6MB/s eta 0:00:01[K     |██▏                             | 51kB 2.0MB/s eta 0:00:01[K     |██▋                             | 61kB 2.3MB/s eta 0:00:01[K     |███                             | 71kB 2.5MB/s eta 0:00:01[K     |███▍                            | 81kB 2.7MB/s eta 0:00:01[K     |███▉                            | 92kB 2.9MB/s eta 0:00:01[K     |████▎                           | 102kB 2.8MB/s eta 0:00:01[K     |████▊                           | 112kB 2.8MB/s eta 0:00:01[K     |█████▏                          | 122kB 2.8MB

In [2]:
import torch
from transformers import BertJapaneseTokenizer, BertForMaskedLM
import transformers

pretrained_model_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'

# Use the tokenizer that belongs to the pre-trained model.
tokenizer = BertJapaneseTokenizer.from_pretrained(pretrained_model_name)

# Morphological analysis (MeCab is used internally).
text1 = "今日はいい天気だね"
text2 = "明日は雨がふるかもしれませんね"

print("text1", tokenizer.tokenize(text1))
print("text2", tokenizer.tokenize(text2))
# Recorded output:
#   text1 ['今日', 'は', 'いい', '天気', 'だ', 'ね']
#   text2 ['明日', 'は', '雨', 'が', 'ふる', 'かも', 'しれ', 'ませ', 'ん', 'ね']

# Convert the sentences into the format BERT expects.
for_bert_inputs = tokenizer([text1, text2], padding=True, return_tensors="pt")
print("for_bert_inputs", for_bert_inputs)
# Recorded output:
#   for_bert_inputs {'input_ids': tensor([[    2,  3246,     9,  2575, 11385,    75,  1852,     3,     0,     0,
#                0,     0],
#           [    2, 11475,     9,  3741,    14,  8491,  4830,  6758,  6769,  1058,
#             1852,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
#           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
#           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
#
# input_ids:      the tokens converted to vocabulary ids, already padded.
# token_type_ids: distinguishes the first sentence from the second when a sentence pair is given.
# attention_mask: distinguishes real tokens (1) from padding (0) in input_ids.


# For reference: an example of passing sentence pairs.
text3 = "そうかな"
text4 = "違うと思います"
tmp = tokenizer([[text1, text2],[text3, text4]], padding=True, return_tensors="pt")
print("tmp", tmp)
# Recorded output:
#   tmp {'input_ids': tensor([[    2,  3246,     9,  2575, 11385,    75,  1852,     3, 11475,     9,
#             3741,    14,  8491,  4830,  6758,  6769,  1058,  1852,     3],
#           [    2,  1778,    29,    18,     3,  5720,    13,  2502,  2610,     3,
#                0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#           [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
#           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


# Load the pre-trained BERT weights. BertForSequenceClassification classifies a single input
# sequence; num_labels sets the number of classes.
# Other task heads and usage examples: https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
model = transformers.BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=5)

out = model(input_ids=for_bert_inputs["input_ids"], token_type_ids=for_bert_inputs["token_type_ids"], attention_mask=for_bert_inputs["attention_mask"])
print("out", out)
# Example output: a 1-tuple holding a (2, 5) logits tensor. The numbers differ from run to
# run because the classification head is newly initialised rather than loaded from the checkpoint.
#   out (tensor([[-0.4694, -0.2888,  0.1584,  0.1443,  0.2474],
#           [-0.4795, -0.2614,  0.0915,  0.1614,  0.1209]],
#          grad_fn=<AddmmBackward>),)

# The bare encoder: element 1 of its output is the pooled output. The classification model
# above presumably feeds this through dropout and a linear classifier to obtain the logits.
model = transformers.BertModel.from_pretrained(pretrained_model_name)
out = model(input_ids=for_bert_inputs["input_ids"], token_type_ids=for_bert_inputs["token_type_ids"], attention_mask=for_bert_inputs["attention_mask"])
print("out", out[1].size())
# Recorded output:
#   out torch.Size([2, 768])

# From here on, compute a loss and back-propagate as usual.

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…


text1 ['今日', 'は', 'いい', '天気', 'だ', 'ね']
text2 ['明日', 'は', '雨', 'が', 'ふる', 'かも', 'しれ', 'ませ', 'ん', 'ね']
for_bert_inputs {'input_ids': tensor([[    2,  3246,     9,  2575, 11385,    75,  1852,     3,     0,     0,
             0,     0],
        [    2, 11475,     9,  3741,    14,  8491,  4830,  6758,  6769,  1058,
          1852,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tmp {'input_ids': tensor([[    2,  3246,     9,  2575, 11385,    75,  1852,     3, 11475,     9,
          3741,    14,  8491,  4830,  6758,  6769,  1058,  1852,     3],
        [    2,  1778,    29,    18,     3,  5720,    13,  2502,  2610,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1,

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

out (tensor([[0.0972, 0.0191, 0.2723, 0.2366, 0.2427],
        [0.0785, 0.1013, 0.2346, 0.3516, 0.2025]], grad_fn=<AddmmBackward>),)
out torch.Size([2, 768])


'\nout (tensor([[-0.4694, -0.2888,  0.1584,  0.1443,  0.2474],\n        [-0.4795, -0.2614,  0.0915,  0.1614,  0.1209]],\n       grad_fn=<AddmmBackward>),)\n'

In [3]:
!wget http://www.rondhuit.com/download/ldcc-20140209.tar.gz
!tar xfz ldcc-20140209.tar.gz

category2id = {"kaden-channel":0, "peachy":1, "sports-watch":2, "dokujo-tsushin":3, "livedoor-homme":4, "it-life-hack":5, "movie-enter":6}

# Livedoorニュースコーパスからテキスト分類用のデータを作成．
import glob
import random

# 1行は [文][TAB][ラベル]からなる
write_lines = []
for d in category2id.keys():
  for file in glob.glob("text/" + d + "/*.txt"):
    with open(file) as f:
      lines = f.readlines()
      # 最初の2行はURLと日付なので捨てる
      for line in lines[3:]:
        line = line.strip()
        if len(line) > 20 and len(line) < 256 and "http" not in line:
          write_lines.append(line + "\t" + str(category2id[d]) + "\n")

random.shuffle(write_lines)
# Train, Dev, Testの3つに分ける.
dev = write_lines[0:2000]
test =write_lines[2000:4000]
train = write_lines[4000:]
# ファイルに保存
w = open("train.tsv", "w")
w.writelines(train)
w.close()
w = open("dev.tsv", "w")
w.writelines(dev)
w.close()
w = open("test.tsv", "w")
w.writelines(test)
w.close()
print("data size ", len(train), len(dev), len(test))

!head dev.tsv

--2020-08-23 12:59:12--  http://www.rondhuit.com/download/ldcc-20140209.tar.gz
Resolving www.rondhuit.com (www.rondhuit.com)... 59.106.19.174
Connecting to www.rondhuit.com (www.rondhuit.com)|59.106.19.174|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.rondhuit.com/download/ldcc-20140209.tar.gz [following]
--2020-08-23 12:59:12--  https://www.rondhuit.com/download/ldcc-20140209.tar.gz
Connecting to www.rondhuit.com (www.rondhuit.com)|59.106.19.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8855190 (8.4M) [application/x-gzip]
Saving to: ‘ldcc-20140209.tar.gz’


2020-08-23 12:59:14 (7.76 MB/s) - ‘ldcc-20140209.tar.gz’ saved [8855190/8855190]

data size  67653 2000 2000
S：はい、僕は、見た目は宮崎あおいさんみたいなナチュラルな感じの人が好きですね。性格的にはリードしてくれる女性かな。どちらかという年上の女性に惹かれます。	1
※当選時にご連絡いただく住所、氏名、電話番号は、その確認などの関連情報のご案内のみに使用し、キャンペーン終了後は弊社の定める方法に基づき消去いたします。	6
・4回目（3/18）『5.15から2.26へ』	4
■「docomo with series MEDIAS ES N-05D」製品情報	5
「トリー バーチ」

In [11]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence, pad_sequence, pack_padded_sequence


def get_data(fname):
    """Load a "<text><TAB><label>" file into parallel texts and labels.

    Rows that cannot be parsed (blank line, no tab, or a label that is not a
    decimal number) are skipped as a whole, so the returned texts and labels
    always have the same length and stay aligned index by index.

    Args:
        fname: Path of a UTF-8 text file with one "<text>\\t<label>" row per line.

    Returns:
        (title_list, labels): the list of text strings and a tensor built from
        the corresponding integer labels.
    """
    label_list = []  # integer class labels
    title_list = []  # texts, in the same order as label_list

    with open(fname, encoding="utf-8") as f:
        for line in f:
            # Strip first: a raw line always carries its newline, so testing it
            # for emptiness before stripping would never detect a blank line.
            line = line.strip()
            if not line:
                continue

            # The label is the last column; splitting from the right keeps a
            # text that itself contains a tab in one piece.
            title, sep, label = line.rpartition('\t')
            if not sep or not label.isdecimal():
                continue

            # Append both together so the two lists can never drift apart.
            title_list.append(title)
            label_list.append(int(label))

    labels = torch.tensor(label_list)
    return title_list, labels



class RnnDataset(torch.utils.data.Dataset):
    """Dataset of tokenised texts paired with their labels.

    All texts are tokenised once at construction time with the module-level
    ``tokenizer`` and padded to a common length.
    """

    def __init__(self, data, label):
        """Tokenise ``data`` (a list of strings) and keep ``label`` alongside it."""
        encoded = tokenizer(data, padding=True, return_tensors="pt")
        self.for_bert_inputs = encoded
        self.label = label

    def __len__(self):
        """Number of samples, i.e. the number of rows of ``input_ids``."""
        input_ids = self.for_bert_inputs["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for sample ``idx``."""
        encoded = self.for_bert_inputs
        field_order = ("input_ids", "token_type_ids", "attention_mask")
        return (*(encoded[field][idx] for field in field_order), self.label[idx])


# Mini-batch size shared by the training and validation loaders.
batch_size = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the (text, label) data written by the data-preparation cell.
train_path = 'train.tsv'
train_x, train_y = get_data(train_path)
valid_path = 'dev.tsv'
valid_x, valid_y = get_data(valid_path)


# Training batches are reshuffled every epoch.
dataset = RnnDataset(train_x, train_y)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Validation only aggregates over the whole set, so shuffling it serves no purpose.
dataset2 = RnnDataset(valid_x, valid_y)
valid_loader = DataLoader(dataset2, batch_size=batch_size, shuffle=False)


# num_labels must equal the number of categories used when the TSV files were written (7).
model = transformers.BertForSequenceClassification.from_pretrained(pretrained_model_name, num_labels=7)
model.to(device)

# NOTE(review): plain SGD with lr=0.05 is an unusually large step for fine-tuning a
# pre-trained BERT; confirm this choice (BERT is typically fine-tuned with Adam-style
# optimizers and learning rates on the order of 1e-5).
optimizer = optim.SGD(model.parameters(), lr=0.05)
loss_fn = nn.CrossEntropyLoss()


def train(model, train_loader, len_train):
    """Run one training epoch and return the mean batch loss and the accuracy.

    Uses the module-level ``device``, ``optimizer`` and ``loss_fn``.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` /
            ``token_type_ids`` keyword arguments; element 0 of its output is
            used as the logits.
        train_loader: Iterable of ``(input_ids, token_type_ids, attention_mask,
            label)`` batches - the order in which ``RnnDataset.__getitem__``
            returns the fields.
        len_train: Number of training samples (denominator of the accuracy).

    Returns:
        (batch_loss, acc): mean loss per batch as a Python float, and the
        fraction of correctly classified samples.
    """
    model.train()
    total_loss = 0.0
    correct_num = 0
    len_loader = len(train_loader)

    # The unpacking order must match RnnDataset.__getitem__; unpacking the
    # attention mask and the token type ids the other way round feeds each of
    # them to the wrong model argument.
    for ini, tti, am, target in train_loader:
        # Move the batch to the target device.
        ini = ini.to(device)
        tti = tti.to(device)
        am = am.to(device)
        target = target.to(device)

        optimizer.zero_grad()
        pred = model(input_ids=ini, attention_mask=am, token_type_ids=tti)[0]  # logits
        loss = loss_fn(pred, target)
        loss.backward()
        optimizer.step()

        # .item() extracts a plain float; summing the loss tensors themselves
        # would keep autograd history alive for the whole epoch.
        total_loss += loss.item()
        correct_num += (torch.argmax(pred, axis=1) == target).sum().item()  # correct predictions

    batch_loss = total_loss / len_loader  # mean loss per batch
    acc = correct_num / len_train  # accuracy
    return batch_loss, acc


def evaluation(model, valid_loader, len_valid):
    """Evaluate the model and return the mean batch loss and the accuracy.

    Uses the module-level ``device`` and ``loss_fn``. No gradients are
    computed and no parameters are updated.

    Args:
        model: Model called with ``input_ids`` / ``attention_mask`` /
            ``token_type_ids`` keyword arguments; element 0 of its output is
            used as the logits.
        valid_loader: Iterable of ``(input_ids, token_type_ids, attention_mask,
            label)`` batches - the order in which ``RnnDataset.__getitem__``
            returns the fields.
        len_valid: Number of evaluation samples (denominator of the accuracy).

    Returns:
        (batch_loss, acc): mean loss per batch as a Python float, and the
        fraction of correctly classified samples.
    """
    model.eval()
    total_loss = 0.0
    correct_num = 0
    len_loader = len(valid_loader)

    with torch.no_grad():
        # The unpacking order must match RnnDataset.__getitem__; swapping the
        # attention mask and the token type ids feeds each to the wrong argument.
        for ini, tti, am, target in valid_loader:
            # Move the batch to the target device.
            ini = ini.to(device)
            tti = tti.to(device)
            am = am.to(device)
            target = target.to(device)

            pred = model(input_ids=ini, attention_mask=am, token_type_ids=tti)[0]  # logits

            loss = loss_fn(pred, target)
            total_loss += loss.item()  # accumulate as a plain float
            correct_num += (torch.argmax(pred, axis=1) == target).sum().item()  # correct predictions

    batch_loss = total_loss / len_loader  # mean loss per batch
    acc = correct_num / len_valid  # accuracy
    return batch_loss, acc

# Sample counts, used as the denominators of the accuracies.
len_train, len_valid = len(train_y), len(valid_y)

# Train for 30 epochs, validating and reporting after each one.
for epoch in range(30):
    train_loss, train_acc = train(model, train_loader, len_train)
    valid_loss, valid_acc = evaluation(model, valid_loader, len_valid)

    print(
        f"epoch: {epoch}",
        f"<train> Loss: {train_loss}\tAccuracy: {train_acc}",
        f"<valid> Loss: {valid_loss}\tAccuracy: {valid_acc}",
        sep="\n",
    )

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

RuntimeError: ignored

このエラーに対して以下の改善を行った.

・バッチサイズの減少

・GPUに渡すものを削減

しかし, いずれも効果は見られなかった.