In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import numpy as np
import random
import glob
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
import pytorch_lightning as pl


In [10]:
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

category_list = [
    'dokujo-tsushin',
    'it-life-hack',
    'kaden-channel',
    'livedoor-homme',
    'movie-enter',
    'peachy',
    'sports-watch',
    'topic-news'
]

tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

max_length = 128
dataset_for_loader = []
for label, category in enumerate(tqdm(category_list)):
    for file in glob.glob(f"../data/livedoor_corpus/text/{category}/{category}*"):
        lines = open(file).read().splitlines()
        text = '\n'.join(lines[3:]) # 4行目から抜き出す
        encoding = tokenizer(text, max_length=max_length, padding="max_length", truncation=True)
        encoding["labels"] = label
        encoding = { k: torch.tensor(v) for k, v in encoding.items()}
        dataset_for_loader.append(encoding)

print(dataset_for_loader[0])

100%|██████████| 8/8 [00:30<00:00,  3.79s/it]

{'input_ids': tensor([    2, 17057,   213,    37,     7,   327, 28628,   399, 28918,    14,
          602,    15,    10,    36, 16017, 17359, 18999,  1568,     5,  1519,
          606,    13,  6675,     5,  9996,    23,   815,   573,  1143,  4037,
          573,     5, 18999,  5523,    11,    17,   228, 10871,   125,    11,
         1471,  1108,    12,     9,     6,   569,   335,     5,    73,  1286,
        29400,     5, 17372,     7,   126,  1233,    14,   920,     6,    37,
        28884,    23, 12274,    35,   160, 20743,  1310,    24,    40,   167,
        30013,    49,   107, 29392,  2251,    64,    11, 10446,    10,     6,
        16594,  2203,  1287,     7, 10265, 16682,  1964,     9,  1982,    12,
         1241,   228,  5756, 28501,   758,    75,    13,   625,     8,    59,
           51,    12,  5362, 28457,  9205, 28477,  2102,    14,  2023,    60,
         4551,   569,   335,   784,     8,   113,  1284,    71, 29416,     9,
            6,  1356,   255,    28, 28458,    18, 




In [11]:
random.shuffle(dataset_for_loader)
n = len(dataset_for_loader)
n_train = int(0.6*n)
n_val = int(0.2*n)
dataset_train = dataset_for_loader[:n_train]
dataset_val = dataset_for_loader[n_train:n_train+n_val]
dataset_test = dataset_for_loader[n_train+n_val:]

num_workers=4
dataloader_train = DataLoader(dataset_train, batch_size=32, shuffle=True, num_workers=num_workers)
dataloader_val = DataLoader(dataset_val, batch_size=256, num_workers=num_workers)
dataloader_test = DataLoader(dataset_test, batch_size=256, num_workers=num_workers)

In [12]:
class BertForSequenceClassification_pl(pl.LightningModule):
    def __init__(self, model_name, num_labels, lr):
        super().__init__()

        self.save_hyperparameters()
        self.bert_sc = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss)

    def test_step(self, batch, batch_idx):
        labels = batch.pop("labels")
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = (labels_predicted == labels).sum().item()
        accuracy = num_correct/labels.size(0)
        self.log('accuracy', accuracy)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)

In [13]:
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor = 'val_loss',
    mode = 'min',
    save_top_k = 1,
    save_weights_only=True,
    dirpath='../model/'
)

trainer = pl.Trainer(gpus=1, max_epochs=2, callbacks=[checkpoint])


  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
model = BertForSequenceClassification_pl(MODEL_NAME, num_labels=9, lr=1e-5)

trainer.fit(model, dataloader_train, dataloader_val)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Epoch 1: 100%|██████████| 128/128 [01:28<00:00,  1.44it/s, loss=0.498, v_num=4]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 128/128 [01:34<00:00,  1.35it/s, loss=0.498, v_num=4]


In [15]:
best_model_path = checkpoint.best_model_path
print("best model file: ", checkpoint.best_model_path)
print("best model score: ", checkpoint.best_model_score)

best model file:  /home/jovyan/data/micky/workspace/github.com/wdy06/bert-practice/model/epoch=1-step=244.ckpt
best model score:  tensor(0.5554, device='cuda:0')


In [19]:
# test
test = trainer.test(model=model, dataloaders=dataloader_test)
print(f"Accuracy: {test[0]['accuracy']:.2f}")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 6/6 [00:07<00:00,  1.31s/it]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy            0.8192307949066162
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Accuracy: 0.82


In [20]:
# load pytorch lightning model
model = BertForSequenceClassification_pl.load_from_checkpoint(best_model_path)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

In [21]:
# save transformers model
model.bert_sc.save_pretrained("../model_transformers/")

In [22]:
bert_sc = BertForSequenceClassification.from_pretrained('../model_transformers/')