<a href="https://colab.research.google.com/github/tsuji1234/sample/blob/main/BertByTsuji.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pip-install
---
* ファイル保存しておいてロード出来るようにする

In [1]:
## =================================================================
## パッケージインストール
## =================================================================
!pip install --upgrade pip
!pip install transformers==4.18.0 fugashi==1.1.0 ipadic==1.0.0 pytorch-lightning==1.6.1
!pip install livelossplot --quiet # acc, lossグラフ表示用


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-23.0.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.0.4
    Uninstalling pip-22.0.4:
      Successfully uninstalled pip-22.0.4
Successfully installed pip-23.0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.18.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0-cp39-cp39-manylinux1_x86_64.whl (482 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m482.2/482.2 kB[0m [31m1

# Debug用
---

In [None]:
#import pdb
#pdb.set_trace()

# クラス定義
---

In [2]:
## =================================================================
## PyTorch Lightningの定義
## =================================================================
import torch
import pytorch_lightning as pl

class BertForSequenceClassification_pl(pl.LightningModule):
    # ===========================================================
    # 名称：コンストラクタ
    # 引数：bert_sc			BERTインスタンス
    #       lr				学習率
    #       labelName		ラベル名
    #       lossViewer		学習状況のグラフ表示
    # ===========================================================
    def __init__(self, bert_sc, lr, labelName, lossViewer=None):
        super().__init__()
        self.save_hyperparameters()		## 以降、self.hparamsでlabelNameとlrにアクセス出来る
        self.bert_sc = bert_sc
        self.lossViewer = lossViewer
        if self.lossViewer == None:
            self.lossViewer = NullViewer()
            

    # 学習データのミニバッチ(`batch`)が与えられた時に損失を出力する関数を書く。
    # batch_idxはミニバッチの番号であるが今回は使わない。
    def training_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        loss = output.loss
        self.log('train_loss', loss) # 損失を'train_loss'の名前でログをとる。
        self.lossViewer.setLoss(loss)
        return loss

    # 検証データのミニバッチが与えられた時に、
    # 検証データを評価する指標を計算する関数を書く。
    def validation_step(self, batch, batch_idx):
        output = self.bert_sc(**batch)
        val_loss = output.loss
        self.log('val_loss', val_loss) # 損失を'val_loss'の名前でログをとる。
        self.lossViewer.setValLoss(val_loss)

    # テストデータのミニバッチが与えられた時に、
    # テストデータを評価する指標を計算する関数を書く。
    def test_step(self, batch, batch_idx):
        labels = batch.pop(self.hparams.labelName) # バッチからラベルを取得
        output = self.bert_sc(**batch)
        labels_predicted = output.logits.argmax(-1)
        num_correct = ( labels_predicted == labels ).sum().item()
        accuracy = num_correct/labels.size(0) #精度
        self.log('accuracy', accuracy) # 精度を'accuracy'の名前でログをとる。

    # 学習に用いるオプティマイザを返す関数を書く。
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams.lr)


In [3]:
import csv
import glob
import os

## =================================================================
## 名称：CSV読込みクラス
## =================================================================
class csvLoader:
    # ========================
    # コンストラクタ
    # dirPath : 入力CSVが入っているフォルダパス
    # hdr_skip      True :CSVの1行目をスキップする
    #               False:CSVの1行目スキップをしない
    # ========================
    def __init__(self, dirPath, hdr_skip=False):
        self.dirPath = dirPath
        self.hdr_skip = hdr_skip

    # ========================
    # 名称：CSVファイル読込み
    # 説明：CSVを読み込んで二次元リストを返す
    # 戻値：二次元リスト
    # ========================
    def load(self):
        chkFile = os.path.join(self.dirPath, "*.csv")
        schFileList = glob.glob(chkFile)

        retList = []
        for fPath in schFileList:
            print('read file : {0}'.format(fPath))
            with open(fPath, 'r', encoding = 'shift_jis', errors = 'ignore') as f:
                reader = csv.reader(f)
                if self.hdr_skip == True : next(reader) # ヘッダースキップ
                for line in reader:
                    retList.append(line)
        return retList


In [4]:
import torch
import random
from torch.utils.data import DataLoader

## =================================================================
## 名称：トークンデータをDataLoaderに変換する
## =================================================================
class DataLoaderConverter:
    # ========================
    # コンストラクタ
    # inputLoader      入力データロード用クラスインスタンス
    #                  load()メソッドを実装している
    # ========================
    def __init__(self, batchSize):
        self.batchSize = batchSize

    def convert(self, inputToken):
        tensorList = self.__convertTensor(inputToken)	# torchで扱える形式に変換
        dsTrain, dsVal, dsTest = self.__dataSplit(tensorList)
        print('トレーニングデータ:{0}、検証データ:{1}、テストデータ:{2}、合計:{3}'.format(len(dsTrain), len(dsVal), len(dsTest), len(tensorList)))

        # データセットからデータローダを作成
        # 学習データはshuffle=Trueにする。
        dataloader_train = DataLoader(dsTrain, batch_size=self.batchSize, shuffle=True)
        dataloader_val   = DataLoader(dsVal,   batch_size=256)
        dataloader_test  = DataLoader(dsTest,  batch_size=256)
        return [dataloader_train, dataloader_val, dataloader_test]

    ## データセット分割
    def __dataSplit(self, tensorList):
        random.shuffle(tensorList) # ランダムにシャッフル
        n = len(tensorList)
        n_train = int(0.6*n)
        n_val = int(0.2*n)
        dataset_train = tensorList[:n_train]				# 学習データ(0 ～ MAX*0.6)
        dataset_val = tensorList[n_train:n_train+n_val]		# 検証データ(MAX*0.6 ～ MAX*0.8)
        dataset_test = tensorList[n_train+n_val:]			# テストデータ(MAX*0.8 ～ MAX)
        return [dataset_train, dataset_val, dataset_test]

    ## torchで扱える形式に変換
    def __convertTensor(self, input):
        encodingList = []
        for line in input:
            encoding = { k: torch.tensor(v) for k, v in line.items() }
            encodingList.append(encoding)
        return encodingList



In [5]:
## =================================================================
## 名称：処理機カテゴライズ用のtokenizer
## =================================================================
class QATokenizer:
    # ========================
    # コンストラクタ
    # inputLoader      入力データロード用クラスインスタンス
    #                  load()メソッドを実装している
    # ========================
    def __init__(self, bertTokenizer, inputLoader):
        self.bertTokenizer = bertTokenizer
        self.inputLoader = inputLoader

    def loadData(self):
        return self.inputLoader.load()

    # ========================
    # トークン化
    # inputDataList    二次元リスト
    # ========================
    def tokenizer(self, inputDataList, max_length=512):
        retList = []
        for line in inputDataList:
            text = line[0]
            label = line[2]

            encoding = self.bertTokenizer(text, max_length = max_length, padding = 'max_length', truncation = True)
            encoding['labels'] = 0 if label == 'IMBT' else 1
            retList.append(encoding)

        return retList


In [6]:
import pytorch_lightning as pl

## =================================================================
## 名称：トレーニング用のインスタンス作成
## =================================================================
class TrainerBuilder:
    # ========================
    # コンストラクタ
    # ========================
    def __init__(self, epochs, modelTempPath):
        self.epochs = epochs
        self.modelTempPath = modelTempPath
        pass

    def buildGPU(self):
        checkpoint = self.__getChkPoint()                   # 学習時にモデルの重みを保存する条件を指定
        trainer = self.__getTrainer(1, [checkpoint])        # 学習の方法を指定
        return [trainer, checkpoint]

    def buildNoGPU(self):
        checkpoint = self.__getChkPoint()                   # 学習時にモデルの重みを保存する条件を指定
        trainer = self.__getTrainer(0, [checkpoint])        # 学習の方法を指定
        return [trainer, checkpoint]

#     def buildTPU(self):
#         checkpoint = self.__getChkPoint()                   # 学習時にモデルの重みを保存する条件を指定
#         trainer = self.__getTrainerTPU([checkpoint])        # 学習の方法を指定
#
#         return [trainer, checkpoint]

    # ========================
    # checkpoint定義
    # ========================
    def __getTrainer(self, num_gpus, callbk_func):
        trainer = pl.Trainer(
            gpus = num_gpus,             # 使用GPU数
#            val_check_interval = 0.5,    # トレーニング中の検証呼出し率
            max_epochs = self.epochs,    # エポック数
            callbacks = callbk_func
            )
        return trainer

    # ========================
    # checkpoint定義
    # ========================
    def __getChkPoint(self):
        checkpoint = pl.callbacks.ModelCheckpoint(
            monitor = 'val_loss',
            mode = 'min',
            save_top_k = 1,
            save_weights_only = True,
            dirpath = self.modelTempPath,
            )
        return checkpoint



In [7]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from livelossplot import PlotLosses

## =================================================================
## 名称：学習状況の表示
## =================================================================
class trainViewer:
    # ========================
    # コンストラクタ
    # ========================
    def __init__(self):
        self.loss = []
        self.val_loss = []
        self.liveloss = PlotLosses()
        pass

    def setLoss(self, loss):
        self.loss.append(loss)
        logs = {}
        logs['loss'] = loss.detach().cpu()
        self.liveloss.update(logs)
        self.liveloss.send()

    def setValLoss(self, val_loss):
        self.val_loss.append(val_loss)
        logs = {}
        logs['val_loss'] = val_loss.detach().cpu()
        self.liveloss.update(logs)
        self.liveloss.send()


class trainLogger:
    # ========================
    # コンストラクタ
    # ========================
    def __init__(self):
        self.loss = []
        self.val_loss = []
        self.liveloss = PlotLosses()
        pass

    def setLoss(self, loss):
        self.loss.append(loss)
        print('loss : {0}'.format(loss))

    def setValLoss(self, val_loss):
        self.val_loss.append(val_loss)
        print('val_loss : {0}'.format(val_loss))


## Null Object
class NullViewer:
    def setLoss(loss):
        pass
    def setValLoss(val_loss):
        pass


# 学習処理メイン
---

In [8]:
## =================================================================
## メイン処理
## =================================================================
## BERT事前学習モデルのロード
from transformers import BertJapaneseTokenizer
from transformers import BertForSequenceClassification
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'
bertTokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)
bert_sc = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

## Googleドライブをマウント
from google.colab import drive
drive.mount('/content/drive')
workDirPath = '/content/drive/MyDrive/SoftmBert'


Downloading:   0%|          | 0.00/252k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

Mounted at /content/drive


In [9]:
## =================================================================
## データ作成
## =================================================================
import os

class makeBertTrainer():
    def __init__(self, plotViewer, bert_sc, bertTokenizer, workPath):
        self.plotViewer = plotViewer
        self.bert_sc = bert_sc
        self.bertTokenizer = bertTokenizer
        self.workPath = workPath

    def makeCPU(self, maxTokenLength, max_epochs, batch_size, maxInputNum=None):

        ## 事前トレーニング済みモデルを用意する
        #self.bert_sc = self.bert_sc.cuda()
        preModel = BertForSequenceClassification_pl(self.bert_sc, lr=1e-5, labelName='labels', lossViewer=self.plotViewer)

        ## 入力データをトークン化
        inputFilePathDir = os.path.join(self.workPath, 'Data')
        inputLoader = csvLoader(inputFilePathDir, True)
        qaTokenizer = QATokenizer(self.bertTokenizer, inputLoader)
        inputData = qaTokenizer.loadData()
        inputToken = qaTokenizer.tokenizer(inputData[0:maxInputNum], max_length=maxTokenLength)   # param 入力データ数調整

        ## BERTトレーニングデータ作成
        modelDir = os.path.join(self.workPath, 'model/')
        dlTrain, dlVal, dlTest = DataLoaderConverter(batch_size).convert(inputToken)
        trainer, checkPoint = TrainerBuilder(max_epochs, modelDir).buildNoGPU()   # param ランタイム指定(buildGPU/buildNoGPU)

        print('バッチサイズ：{0}、エポック数：{1}、MAXトークンサイズ：{2}'.format(batch_size, max_epochs, maxTokenLength))
        print('入力データ：{0}個、1エポック：{1}STEP、合計：{2}STEP'.format(len(dlTrain.dataset),
                                                                            (len(dlTrain.dataset) / batch_size),
                                                                            ((len(dlTrain.dataset) / batch_size) * max_epochs)))

        return [preModel, dlTrain, dlVal, dlTest, trainer, checkPoint]

    def makeGPU(self, maxTokenLength, max_epochs, batch_size, maxInputNum=None):

        ## 事前トレーニング済みモデルを用意する
        self.bert_sc = self.bert_sc.cuda()
        preModel = BertForSequenceClassification_pl(self.bert_sc, lr=1e-5, labelName='labels', lossViewer=self.plotViewer)

        ## 入力データをトークン化
        inputFilePathDir = os.path.join(self.workPath, 'Data')
        inputLoader = csvLoader(inputFilePathDir, True)
        qaTokenizer = QATokenizer(self.bertTokenizer, inputLoader)
        inputData = qaTokenizer.loadData()
        inputToken = qaTokenizer.tokenizer(inputData[0:maxInputNum], max_length=maxTokenLength)   # param 入力データ数調整

        ## BERTトレーニングデータ作成
        modelDir = os.path.join(self.workPath, 'model/')
        dlTrain, dlVal, dlTest = DataLoaderConverter(batch_size).convert(inputToken)
        trainer, checkPoint = TrainerBuilder(max_epochs, modelDir).buildGPU()   # param ランタイム指定(buildGPU/buildNoGPU)

        print('バッチサイズ：{0}、エポック数：{1}、MAXトークンサイズ：{2}'.format(batch_size, max_epochs, maxTokenLength))
        print('入力データ：{0}個、1エポック：{1}STEP、合計：{2}STEP'.format(len(dlTrain.dataset),
                                                                            (len(dlTrain.dataset) / batch_size),
                                                                            ((len(dlTrain.dataset) / batch_size) * max_epochs)))

        return [preModel, dlTrain, dlVal, dlTest, trainer, checkPoint]



In [10]:
## =========================
## START
## =========================
#plotViewer = NullViewer()  # ダミー用
#plotViewer = trainViewer()  # グラフ表示用
plotViewer = trainLogger()  # ログ表示用

trainerMaker = makeBertTrainer(plotViewer, bert_sc, bertTokenizer, workDirPath)
preModel, dlTrain, dlVal, dlTest, trainer, checkPoint = trainerMaker.makeGPU(maxTokenLength=256, max_epochs=1, batch_size=16, maxInputNum=1000)


  rank_zero_warn(


read file : /content/drive/MyDrive/SoftmBert/Data/202109.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202107.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202106.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202105.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202203.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202111.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202202.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202110.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202201.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202108.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202104.csv
read file : /content/drive/MyDrive/SoftmBert/Data/202112.csv


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


トレーニングデータ:600、検証データ:200、テストデータ:200、合計:1000
バッチサイズ：16、エポック数：1、MAXトークンサイズ：256
入力データ：600個、1エポック：37.5STEP、合計：37.5STEP


In [None]:
## =================================================================
## ファインチューニング
## =================================================================
import shutil
modelDir = os.path.join(workDirPath, 'model/')

## ディレクトリを空にする
shutil.rmtree(modelDir)
os.mkdir(modelDir)

## fit
trainer.fit(preModel, dlTrain, dlVal)


  rank_zero_warn(f"attribute '{k}' removed from hparams because it cannot be pickled")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type                          | Params
----------------------------------------------------------
0 | bert_sc | BertForSequenceClassification | 110 M 
----------------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
442.476   Total estimated model params size (MB)


In [None]:
## =================================================================
## 結果の評価
## =================================================================
print('ベストモデルのファイル: ', checkPoint.best_model_path)
print('ベストモデルの検証データに対する損失: ', checkPoint.best_model_score)

# チューニング済みモデルをロード
tuneModel = preModel.load_from_checkpoint(checkPoint.best_model_path)

# 評価実施
testResult = trainer.test(tuneModel, dlTest)
print(f'Accuracy: {testResult[0]["accuracy"]:.2f}')


In [None]:
## =================================================================
## モデル保存
## =================================================================
import datetime
resultDirPath = os.path.join(workDirPath, 'Result')
if os.path.exists(resultDirPath):
    now_str = datetime.datetime.now().strftime('_%Y%m%d%H%M')
    os.rename(resultDirPath, resultDirPath + now_str)
os.makedirs(resultDirPath)

saveDirectory = os.path.join(resultDirPath, 'model_transformers')
tuneModel.bert_sc.save_pretrained( saveDirectory )
print('チューニング済みモデル保存先：', saveDirectory)


In [None]:
## =================================================================
## TensorBord表示
## =================================================================

#%load_ext tensorboard
#%tensorboard --logdir=lightning_logs/