# 任意のデータを使ったT5での文章生成

使い方
*   メニューの「ランタイム」→「ランタイムのタイプを変更」を開き、GPUを選択する
*   左のファイルアイコンを選択し、学習に使うCSVを `data.csv` という名前でアップロードする（ヘッダー行あり。1列目が生成したい文、2列目が入力文）
*   ランタイムから「すべてのセルを実行」を選択
*   結果を確認する



## ライブラリのダウンロード

In [None]:
!pip install -qU torch==1.7.1 torchtext==0.8.0 torchvision==0.8.2
!pip install -q transformers==4.4.2 pytorch_lightning==1.2.1 sentencepiece

In [None]:
import argparse
import os
import random
import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AdamW, T5ForConditionalGeneration, T5Tokenizer, get_linear_schedule_with_warmup

In [None]:
# 乱数シードの設定
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value handed to Python's ``random``, NumPy and PyTorch, and to
            all CUDA devices when CUDA is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

## 前処理(preprocess.py)

In [None]:
# Load the uploaded CSV; its first column becomes the DataFrame index.
df = pd.read_csv("data.csv", index_col=0)

# First split off a held-out part (train_test_split's default ratio), then
# cut that held-out part in half to obtain the dev and test sets.
df_train, df_devtest = train_test_split(df, random_state=42)
df_dev, df_test = train_test_split(df_devtest, test_size=0.5, random_state=42)

# Write each split as a tab-separated file (index first, then the columns).
for split_df, split_path in (
    (df_train, "./train.tsv"),
    (df_dev, "./dev.tsv"),
    (df_test, "./test.tsv"),
):
    split_df.to_csv(split_path, sep="\t")

## 学習(train.py)

In [None]:
class TsvDataset(Dataset):
    """Dataset of tokenized (input, target) text pairs read from a TSV file.

    The first row of the file is treated as a header and skipped. In every
    following row the first column is the target text and the second column
    is the input text. All rows are tokenized eagerly in the constructor.

    Args:
        tokenizer: Object providing ``batch_encode_plus``; called with a
            one-element list and ``return_tensors="pt"``.
        data_dir: Directory that contains the TSV file.
        type_path: File name of the TSV file inside ``data_dir``.
        input_max_len: Length the input texts are padded/truncated to.
        target_max_len: Length the target texts are padded/truncated to.
    """

    def __init__(self, tokenizer, data_dir, type_path, input_max_len=512, target_max_len=512):
        self.file_path = os.path.join(data_dir, type_path)

        self.input_max_len = input_max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ids and attention masks of record ``index`` as a dict."""
        source = self.inputs[index]
        target = self.targets[index]

        # Every encoding was built from a one-element batch, so only the
        # leading batch axis is dropped. A bare ``squeeze()`` would also
        # collapse a length-1 sequence into a 0-d tensor.
        return {"source_ids": source["input_ids"].squeeze(0),
                "source_mask": source["attention_mask"].squeeze(0),
                "target_ids": target["input_ids"].squeeze(0),
                "target_mask": target["attention_mask"].squeeze(0)}

    def _make_record(self, input, target):
        """Convert one raw pair into the model's input/output text format.

        Currently both values are only converted to strings; this is the
        place to add task-specific prefixes or templates.
        """
        return f"{input}", f"{target}"

    def _encode(self, text, max_len):
        """Tokenize a single text, padded/truncated to ``max_len``."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=max_len, truncation=True,
            padding="max_length", return_tensors="pt"
        )

    def _build(self):
        """Read the TSV file and tokenize every record.

        The file is parsed with the ``csv`` module so that fields quoted by
        ``DataFrame.to_csv`` (embedded tabs, newlines or double quotes) are
        read back as written. Blank rows are skipped. Fields longer than the
        ``csv`` module's field size limit raise ``csv.Error``.

        Raises:
            ValueError: If a non-blank row has fewer than two columns.
        """
        import csv  # local import keeps this class self-contained

        # newline="" lets the csv module handle line endings inside fields.
        with open(self.file_path, "r", encoding="utf-8", newline="") as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader, None)  # header
            for row in reader:
                if not any(cell.strip() for cell in row):
                    continue
                if len(row) < 2:
                    raise ValueError(
                        f"{self.file_path}: expected at least 2 tab-separated "
                        f"columns near line {reader.line_num}, got {len(row)}"
                    )

                source_text, target_text = self._make_record(row[1], row[0])

                self.inputs.append(self._encode(source_text, self.input_max_len))
                self.targets.append(self._encode(target_text, self.target_max_len))

In [None]:
class T5FineTuner(pl.LightningModule):
    """Lightning module that fine-tunes a pretrained T5 model on TSV datasets.

    Args:
        hparams: Namespace of hyperparameters. This class reads
            ``model_name_or_path``, ``tokenizer_name_or_path``, ``data_dir``,
            ``max_input_length``, ``max_target_length``, ``train_batch_size``,
            ``eval_batch_size``, ``n_gpu``, ``gradient_accumulation_steps``,
            ``num_train_epochs``, ``learning_rate``, ``weight_decay``,
            ``adam_epsilon`` and ``warmup_steps`` from it.
    """

    def __init__(self, hparams):
        super().__init__()
        # Register the hyperparameters through the supported API; assigning
        # to ``self.hparams`` directly is deprecated in PyTorch Lightning.
        self.save_hyperparameters(hparams)

        # Load the pretrained model
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)

        # Load the tokenizer
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path, is_fast=True)

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None,
                decoder_attention_mask=None, labels=None):
        """Run the wrapped T5 model and return its output unchanged."""
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels
        )

    def _step(self, batch):
        """Compute the loss for one batch.

        Args:
            batch: Dict with ``source_ids``, ``source_mask``, ``target_ids``
                and ``target_mask`` tensors.

        Returns:
            The first element of the model output, used as the loss.
        """
        target_ids = batch["target_ids"]

        # All labels set to -100 are ignored (masked),
        # the loss is only computed for labels in [0, ..., config.vocab_size].
        # masked_fill builds a new tensor, so the caller's batch keeps its
        # original pad ids instead of being overwritten in place.
        labels = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            labels=labels
        )

        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._step(batch)
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._step(batch)
        self.log("val_loss", loss)
        return {"val_loss": loss}

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch."""
        loss = self._step(batch)
        self.log("test_loss", loss)
        return {"test_loss": loss}

    def configure_optimizers(self):
        """Create the AdamW optimizer and the linear warmup/decay scheduler.

        Parameters whose name contains an entry of ``no_decay`` get a weight
        decay of 0.0; all others use ``hparams.weight_decay``. The scheduler
        is stepped after every optimizer step and relies on ``self.t_total``,
        which ``setup`` computes for the fit stage.
        """
        no_decay = ["bias", "LayerNorm.weight"]

        def is_decay_exempt(param_name):
            return any(marker in param_name for marker in no_decay)

        named_params = list(self.model.named_parameters())
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in named_params if not is_decay_exempt(n)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in named_params if is_decay_exempt(n)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.learning_rate,
                          eps=self.hparams.adam_epsilon)
        self.optimizer = optimizer

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.t_total
        )
        self.scheduler = scheduler

        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]

    def get_dataset(self, tokenizer, type_path, args):
        """Build a ``TsvDataset`` for the file ``type_path`` in ``args.data_dir``."""
        return TsvDataset(
            tokenizer=tokenizer,
            data_dir=args.data_dir,
            type_path=type_path,
            input_max_len=args.max_input_length,
            target_max_len=args.max_target_length)

    def setup(self, stage=None):
        """Load the train/dev datasets and compute the total number of steps.

        Only acts when ``stage`` is ``'fit'`` or ``None``.
        """
        if stage == 'fit' or stage is None:
            train_dataset = self.get_dataset(tokenizer=self.tokenizer,
                                             type_path="train.tsv", args=self.hparams)
            self.train_dataset = train_dataset

            val_dataset = self.get_dataset(tokenizer=self.tokenizer,
                                           type_path="dev.tsv", args=self.hparams)
            self.val_dataset = val_dataset

            # Optimizer steps per epoch (floor division matches drop_last=True
            # in train_dataloader) multiplied by the number of epochs.
            self.t_total = (
                    (len(train_dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
                    // self.hparams.gradient_accumulation_steps
                    * float(self.hparams.num_train_epochs)
            )

    def train_dataloader(self):
        """Return the shuffled training loader; incomplete last batches are dropped."""
        return DataLoader(self.train_dataset,
                          batch_size=self.hparams.train_batch_size,
                          drop_last=True, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Return the validation loader."""
        return DataLoader(self.val_dataset,
                          batch_size=self.hparams.eval_batch_size,
                          num_workers=4)

In [None]:
# Pretrained checkpoint (used for both the model and the tokenizer)
PRETRAINED_MODEL_NAME = "sonoisa/t5-base-japanese"

# Directory the datasets are read from / the fine-tuned model is saved to
DATA_DIR = "./"
MODEL_DIR = "./"

# True when a CUDA device is available
USE_GPU = torch.cuda.is_available()

# All hyperparameters in one place
args_dict = {
    "data_dir": DATA_DIR,  # directory holding train.tsv / dev.tsv / test.tsv
    "model_dir": MODEL_DIR,  # directory the fine-tuned model is saved to
    "model_name_or_path": PRETRAINED_MODEL_NAME,
    "tokenizer_name_or_path": PRETRAINED_MODEL_NAME,

    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "gradient_accumulation_steps": 1,

    "n_gpu": int(USE_GPU),
    "early_stop_callback": False,
    "fp_16": False,
    "opt_level": "O1",
    "max_grad_norm": 1.0,
    "seed": 42,

    "max_input_length": 512,  # maximum number of tokens in an input text
    "max_target_length": 64,  # maximum number of tokens in an output text
    "train_batch_size": 4,  # batch size used for training
    "eval_batch_size": 8,  # batch size used for evaluation
    "num_train_epochs": 9,  # number of training epochs
}

# Load the tokenizer (SentencePiece) model
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME, is_fast=True)

# Tokenized training set; it is not handed to trainer.fit(model) below.
train_dataset = TsvDataset(tokenizer, args_dict["data_dir"], "train.tsv",
                           input_max_len=512, target_max_len=64)

args = argparse.Namespace(**args_dict)

# Keyword arguments for the Lightning trainer
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "gpus": args.n_gpu,
    "max_epochs": args.num_train_epochs,
    "precision": 16 if args.fp_16 else 32,
    "amp_level": args.opt_level,
    "gradient_clip_val": args.max_grad_norm,
}

# Run the fine-tuning
model = T5FineTuner(args)
trainer = pl.Trainer(**train_params)
trainer.fit(model)

# Save the tokenizer and the model as they are after the final epoch
model.tokenizer.save_pretrained(args.model_dir)
model.model.save_pretrained(args.model_dir)

## テスト(test.py)

In [None]:
# Tokenizer (SentencePiece) saved by the training cell
tokenizer = T5Tokenizer.from_pretrained(args.model_dir, is_fast=True)

# Fine-tuned model saved by the training cell
trained_model = T5ForConditionalGeneration.from_pretrained(args.model_dir)

# Move the model to the GPU when one is available
if USE_GPU:
    trained_model.cuda()

# Load the test data
test_dataset = TsvDataset(tokenizer, args_dict["data_dir"], "test.tsv",
                          input_max_len=args.max_input_length,
                          target_max_len=args.max_target_length)

test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, num_workers=4)

trained_model.eval()

# Decoded texts collected over all batches. ``outputs`` holds BEAM_SIZE
# entries per test example when several sequences are returned.
inputs = []
outputs = []
targets = []

BEAM_SIZE = 1
# Extra generation parameters, only used when more than one beam is requested
beam_params = dict()
if BEAM_SIZE != 1:
    beam_params.update({
        "num_beams": BEAM_SIZE,  # beam search width
        "diversity_penalty": 1.0,  # penalty that encourages diverse outputs
        "num_beam_groups": BEAM_SIZE,  # number of beam groups
        "num_return_sequences": BEAM_SIZE,  # number of sequences to return
    })

# The same decoding options apply to sources, targets and generated texts.
decode_kwargs = dict(skip_special_tokens=True, clean_up_tokenization_spaces=False)

for batch in tqdm(test_loader):
    input_ids = batch['source_ids']
    input_mask = batch['source_mask']

    if USE_GPU:
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()

    output = trained_model.generate(input_ids=input_ids,
                                    attention_mask=input_mask,
                                    max_length=args.max_target_length,
                                    temperature=1.0,  # temperature parameter for randomness in generation
                                    repetition_penalty=1.5,  # penalty against repeating the same text
                                    **beam_params
                                    )

    output_text = tokenizer.batch_decode(output, **decode_kwargs)
    target_text = tokenizer.batch_decode(batch["target_ids"], **decode_kwargs)
    input_text = tokenizer.batch_decode(input_ids, **decode_kwargs)

    inputs.extend(input_text)
    outputs.extend(output_text)
    targets.extend(target_text)

In [None]:
# Print every test example and write the generated, target and source texts
# to output.txt / target.txt / input.txt (one example per line).
output_path = os.path.join(args.data_dir, "output.txt")
target_path = os.path.join(args.data_dir, "target.txt")
input_path = os.path.join(args.data_dir, "input.txt")

# Explicit UTF-8 so the text is written the same way regardless of the
# platform's default encoding.
with open(output_path, "w", encoding="utf-8") as out, \
        open(target_path, "w", encoding="utf-8") as tar, \
        open(input_path, "w", encoding="utf-8") as inp:
    for i, (src_text, tgt_text) in enumerate(zip(inputs, targets)):
        # ``outputs`` holds BEAM_SIZE consecutive entries per test example.
        generated = outputs[i * BEAM_SIZE:i * BEAM_SIZE + BEAM_SIZE]

        print("generated: " + "\n\t".join(generated))
        print("target:    " + tgt_text)
        print("src:       " + src_text)
        print()

        out.write(", ".join(generated) + "\n")
        tar.write(tgt_text + "\n")
        inp.write(src_text + "\n")