<a href="https://colab.research.google.com/github/wadaka0821/nlp-tutorial/blob/main/questions/4_2_lstm_question.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM を使った一般常識のテキスト生成
## 作成者：和田
## 最終更新日：2023/03/08

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-

In [None]:
import torch
from torch.nn import LSTM, CrossEntropyLoss, Embedding, Linear
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from datasets import load_dataset
import nltk
from tqdm import tqdm
import seaborn as sns

In [None]:
# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases look it up under
# 'punkt_tab' rather than 'punkt', so request both; by default a resource name that the
# installed NLTK does not know only makes download() return False instead of raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch every split of the CommonGen dataset from the Hugging Face hub.
dataset = load_dataset(path="common_gen")

Downloading builder script:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.99k [00:00<?, ?B/s]

Downloading and preparing dataset common_gen/default to /root/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23...


Downloading data:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67389 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4018 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1497 [00:00<?, ? examples/s]

Dataset common_gen downloaded and prepared to /root/.cache/huggingface/datasets/common_gen/default/2020.5.30/1a9e8bdc026c41ce7a9e96260debf7d2809cb7fd63fa02b017e4fac1b00c6b23. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the loaded object to inspect its splits and columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 67389
    })
    validation: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 4018
    })
    test: Dataset({
        features: ['concept_set_idx', 'concepts', 'target'],
        num_rows: 1497
    })
})

In [None]:
dataset['train'][0] # the task: generate a sentence (target) that uses the words given in concepts

{'concept_set_idx': 0,
 'concepts': ['ski', 'mountain', 'skier'],
 'target': 'Skier skis down the mountain'}

# 学習データから語彙の作成

In [None]:
UNK_TOKEN = '<UNK>' # unknown (out-of-vocabulary) word
PAD_TOKEN = '<PAD>' # used for padding
SOS_TOKEN = '<s>' # Start Of Sequence
EOS_TOKEN = '</s>' # End Of Sequence

def build_vocab(sents, add_unkown=True):
    """Build a word -> id lookup table from a list of sentences.

    The special tokens always come first (PAD=0, SOS=1, EOS=2, and UNK=3 when
    requested); every other word receives the next free id in order of first
    appearance. The final vocabulary size is printed.

    Parameters
    ----------
    sents : list[str]
        Sentences to collect words from.

    add_unkown : bool
        Whether to register the unknown-word token as well.

    Returns
    -------
    dict[str, int]
        Mapping used to convert a word into its id.
    """
    special_tokens = [PAD_TOKEN, SOS_TOKEN, EOS_TOKEN]
    if add_unkown:
        special_tokens.append(UNK_TOKEN)
    vocab = {token: index for index, token in enumerate(special_tokens)}

    for sentence in sents:
        for token in nltk.tokenize.word_tokenize(sentence):  # split into words
            # setdefault leaves already-registered words untouched
            vocab.setdefault(token, len(vocab))

    print(f'vocabulary size is {len(vocab)}')
    return vocab

In [None]:
# Keep only the `concepts` and `target` columns of the training split.
concepts = [example['concepts'] for example in dataset['train']]
targets = [example['target'] for example in dataset['train']]

In [None]:
# Build the vocabulary from the training target sentences.
vocab = build_vocab(sents=targets)

vocabulary size is 15855


# モデルの作成

In [None]:
class Text2TextModel(torch.nn.Module):
    """Encoder-decoder sequence model built from two single-layer LSTMs.

    Attributes
    ----------
    vocab_size : int
        Size of the vocabulary (number of words).
    hidden_size : int
        Dimensionality shared by the word embeddings and the hidden/cell states.
    embedding : torch.nn.Embedding
        Converts token ids into embedding vectors (shared by encoder and decoder).
    encoder : torch.nn.LSTM
        Reads the input sequence; its final (h, c) state initialises the decoder.
    decoder : torch.nn.LSTM
        Produces one vector per position of the output sequence.
    output : torch.nn.Linear
        Projects a decoder vector of size (hidden_size, ) to (vocab_size, ) logits.

    See Also
    --------
    forward : forward pass used during training.

    predict : forward pass used during inference (no batching).

    Note
    ----
    The decoder is driven differently at training time (the whole target is fed
    at once) and at inference time (tokens are generated one by one), so the two
    cases are implemented as separate methods.
    """
    def __init__(self, vocab_size, hidden_size=300):
        super(Text2TextModel, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.embedding = Embedding(vocab_size, hidden_size)
        self.encoder = LSTM(
            input_size = self.hidden_size,
            hidden_size = self.hidden_size,
            num_layers = 1,
            bias = True,
            batch_first = True,
            dropout = 0,
            bidirectional = False
        )
        self.decoder = LSTM(
            input_size = self.hidden_size,
            hidden_size = self.hidden_size,
            num_layers = 1,
            bias = True,
            batch_first = True,
            dropout = 0,
            bidirectional = False
        )
        self.output = Linear(
            in_features = self.hidden_size,
            out_features = self.vocab_size
        )

    def forward(self, X, Y, device='cpu'):
        """Forward pass used during training.

        Padded positions in X are embedded and fed to the encoder like ordinary
        tokens; no masking or packing is applied here.

        Parameters
        ----------
        X : Tensor[long]
            Encoder input (batch_size, sequence_length)
        Y : Tensor[long]
            Decoder input (batch_size, sequence_length)
        device : str
            Device to run on (cpu or cuda)

        Returns
        -------
        Tensor
            Logits of shape (batch_size, sequence_length of Y, vocab_size)
        """
        batch_size = len(X)
        # Zero initial hidden and cell state, allocated directly on the target device.
        h = torch.zeros(1, batch_size, self.hidden_size, device=device)
        c = torch.zeros(1, batch_size, self.hidden_size, device=device)
        encoder_i = self.embedding(X) # embeddings of the encoder input sequence
        decoder_i = self.embedding(Y) # embeddings of the decoder input sequence
        _, (h, c) = self.encoder(encoder_i, (h, c))
        o, _ = self.decoder(decoder_i, (h, c))
        o = self.output(o) # compute the logits

        return o

    @torch.inference_mode()
    def predict(self, X, device='cpu', max_length=64, sos_id=1, eos_id=2):
        """Forward pass used during inference (greedy decoding of one sequence).

        Parameters
        ----------
        X : Tensor[long]
            Encoder input for a single, unbatched example (sequence_length, )
        device : str
            Device to run on (cpu or cuda)
        max_length : int
            Maximum number of tokens generated after the start token
        sos_id : int
            Id of the start-of-sequence token (default matches build_vocab)
        eos_id : int
            Id of the end-of-sequence token (default matches build_vocab)

        Returns
        -------
        list[int]
            Generated ids, beginning with sos_id and ending with eos_id unless
            max_length was reached first.
        """
        pred = [sos_id] # the sequence always starts with the SOS token
        # Unbatched LSTM input expects states shaped (num_layers, hidden_size).
        h = torch.zeros(1, self.hidden_size, device=device)
        c = torch.zeros(1, self.hidden_size, device=device)
        encoder_i = self.embedding(X) # embeddings of the input sequence
        _, (h, c) = self.encoder(encoder_i, (h, c))
        # Generate until EOS is produced or the maximum length is reached.
        while len(pred) <= max_length and pred[-1] != eos_id:
            # Feed only the newest token and carry (h, c) forward instead of
            # re-running the decoder over the whole prefix at every step.
            last_token = torch.tensor(pred[-1:], dtype=torch.long, device=device)
            o, (h, c) = self.decoder(self.embedding(last_token), (h, c))
            logits = self.output(o[-1, :])
            pred.append(torch.argmax(logits).item()) # pick the most probable word
        return pred


## 問題1
---
RNNを使用したときの実装と非常に似ていますが，LSTMとRNNの違い(理論面は別でやっているので，実装の部分を中心に)を教えてください．

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each encoder input with its target sequence.

    Parameters
    ----------
    X : sequence
        Encoder inputs, one entry per example.
    Y : sequence
        Target sequences, aligned index-by-index with X.

    Raises
    ------
    ValueError
        If X and Y do not contain the same number of examples.
    """
    def __init__(self, X, Y):
        # __len__ is derived from X while __getitem__ also indexes Y, so a length
        # mismatch would otherwise surface late (IndexError) or drop examples silently.
        if len(X) != len(Y):
            raise ValueError(f'X and Y must have the same length, got {len(X)} and {len(Y)}')
        self.X = X
        self.Y = Y

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the idx-th (input, target) pair as a tuple."""
        return self.X[idx], self.Y[idx]

In [None]:
# Convert words to vocabulary ids; words missing from the vocabulary map to <UNK>.
# The id tensors are created directly as int64 instead of going through a float tensor.
unk_id = vocab[UNK_TOKEN]
tokenized_concepts = [
    torch.tensor([vocab.get(word, unk_id) for word in concept_set], dtype=torch.long)
    for concept_set in concepts
]
# Target sentences are additionally wrapped in the SOS / EOS markers.
tokenized_targets = [
    torch.tensor(
        [vocab[SOS_TOKEN]]
        + [vocab.get(word, unk_id) for word in nltk.tokenize.word_tokenize(sentence)]
        + [vocab[EOS_TOKEN]],
        dtype=torch.long,
    )
    for sentence in targets
]

In [None]:
train_dataset = MyDataset(tokenized_concepts, tokenized_targets)
# The sequences have different lengths, so the collate function only regroups a batch of
# (input, target) pairs into [list of inputs, list of targets]; padding is applied later.
dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=lambda batch: [[pair[0] for pair in batch], [pair[1] for pair in batch]],
)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Colab also offers TPUs; look up how to use them on your own

lstm = Text2TextModel(len(vocab)).to(device)
optimizer = Adam(lstm.parameters(), lr=1e-3)
# Sequences in a batch are padded with the <PAD> id. Padded target positions carry no
# information, so they are excluded from the loss; otherwise they dilute the average and
# train the model to emit <PAD>. Loss values are therefore not comparable with runs that
# averaged over the padding as well.
pad_id = vocab[PAD_TOKEN]
criterion = CrossEntropyLoss(ignore_index=pad_id)
MAX_EPOCH = 15

loss_history = list()

lstm.train()
for epoch in range(1, MAX_EPOCH+1):
    for batch_X, batch_Y in tqdm(dataloader):
        optimizer.zero_grad(set_to_none=True)
        encoder_input = pad_sequence(batch_X, batch_first=True, padding_value=pad_id).to(device)
        # Teacher forcing: the decoder reads every token except the last one and has to
        # predict the same sequence shifted left by one position.
        decoder_input = pad_sequence([seq[:-1] for seq in batch_Y], batch_first=True, padding_value=pad_id).to(device)
        target = pad_sequence([seq[1:] for seq in batch_Y], batch_first=True, padding_value=pad_id).to(device)

        o = lstm(encoder_input, decoder_input, device)
        # CrossEntropyLoss expects (N, vocab_size) logits and (N, ) class ids.
        o = o.reshape(-1, o.shape[-1])
        target = target.reshape(-1)
        loss = criterion(o, target)
        loss.backward()
        optimizer.step()

        loss_history.append(loss.item())

    # Mean loss over the batches of the epoch that just finished.
    epoch_losses = loss_history[-len(dataloader):]
    train_loss = sum(epoch_losses) / len(epoch_losses)
    print(f'{epoch=} | {train_loss=:.5f}')

100%|██████████| 2106/2106 [00:27<00:00, 76.14it/s]


epoch=1 | train_loss=2.85527


100%|██████████| 2106/2106 [00:28<00:00, 74.68it/s]


epoch=2 | train_loss=1.96522


100%|██████████| 2106/2106 [00:27<00:00, 75.73it/s]


epoch=3 | train_loss=1.60773


100%|██████████| 2106/2106 [00:27<00:00, 75.92it/s]


epoch=4 | train_loss=1.37540


100%|██████████| 2106/2106 [00:27<00:00, 75.72it/s]


epoch=5 | train_loss=1.20361


100%|██████████| 2106/2106 [00:27<00:00, 75.64it/s]


epoch=6 | train_loss=1.06409


100%|██████████| 2106/2106 [00:27<00:00, 75.58it/s]


epoch=7 | train_loss=0.95592


100%|██████████| 2106/2106 [00:27<00:00, 75.98it/s]


epoch=8 | train_loss=0.86431


100%|██████████| 2106/2106 [00:27<00:00, 75.77it/s]


epoch=9 | train_loss=0.78353


100%|██████████| 2106/2106 [00:27<00:00, 75.75it/s]


epoch=10 | train_loss=0.71727


100%|██████████| 2106/2106 [00:27<00:00, 75.63it/s]


epoch=11 | train_loss=0.65767


100%|██████████| 2106/2106 [00:27<00:00, 75.89it/s]


epoch=12 | train_loss=0.60473


100%|██████████| 2106/2106 [00:27<00:00, 75.77it/s]


epoch=13 | train_loss=0.56012


100%|██████████| 2106/2106 [00:27<00:00, 75.69it/s]


epoch=14 | train_loss=0.52153


100%|██████████| 2106/2106 [00:27<00:00, 75.63it/s]

epoch=15 | train_loss=0.48557





In [None]:
def decode(ids, vocab):
    """Convert a list of ids back into the corresponding words.

    Parameters
    ----------
    ids : list[int]
        Ids to convert.
    vocab : dict[str, int]
        Vocabulary (word -> id) used for the conversion.

    Returns
    -------
    list[str]
        One word per id, in the same order.

    Raises
    ------
    KeyError
        If an id does not occur in vocab (ids are assumed to be valid).
    """
    # Invert the mapping: the vocabulary is word -> id, but id -> word is needed here.
    id_to_word = dict(zip(vocab.values(), vocab.keys()))
    return [id_to_word[token_id] for token_id in ids]

In [None]:
def predict(idx, data=None):
    """Generate a sentence from the concepts of the idx-th example.

    Parameters
    ----------
    idx : int
        Index of the example to use.
    data : indexable, optional
        Collection whose items are (concept ids, target ids) tensor pairs.
        Defaults to the training data (train_dataset), so other splits can
        reuse this function.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        The concept words, the reference sentence and the generated sentence,
        each as a list of tokens.
    """
    if data is None:
        data = train_dataset
    sample = data[idx]  # fetch the example once instead of re-indexing for every field
    concept_ids, target_ids = sample[0], sample[1]

    x = decode(concept_ids.tolist(), vocab)
    true_y = decode(target_ids.tolist(), vocab)

    pred = lstm.predict(concept_ids.to(device), device=device)
    pred_y = decode(pred, vocab)

    return x, true_y, pred_y

In [None]:
# Show generated sentences for the first training examples, once per distinct concept set.
skip_keywords = set() # concept sets already handled, stored as tuples so they are hashable
for i in range(min(1000, len(train_dataset))):
    x, true_y, pred_y = predict(i)
    keywords_key = tuple(x)
    if keywords_key in skip_keywords:
        continue
    skip_keywords.add(keywords_key)
    # Number of concept words that literally appear in the generated sentence.
    score = sum(w in pred_y for w in x)

    if score > 2:
        print('='*30+f'{score=}'+'='*30)
        print(f'idx={i}')
        print(f'keywords={x}')
        print(f'true sentense={true_y}')
        print(f'predicted sentense={pred_y}')

idx=39
keywords=['lake', 'shore', 'canoe']
true sentense=['<s>', 'canoe', 'on', 'a', 'shore', 'of', 'lake', '.', '</s>']
predicted sentense=['<s>', 'canoe', 'on', 'shore', 'with', 'lake', 'in', 'autumn', '</s>']
idx=42
keywords=['mountain', 'skier', 'way']
true sentense=['<s>', 'A', 'skier', 'on', 'his', 'way', 'to', 'the', 'mountain', '.', '</s>']
predicted sentense=['<s>', 'A', 'skier', 'making', 'her', 'way', 'down', 'a', 'snowy', 'mountain', '.', '</s>']
idx=72
keywords=['station', 'stop', 'train']
true sentense=['<s>', 'train', 'is', 'stopped', 'at', 'a', 'station', '</s>']
predicted sentense=['<s>', 'A', 'train', 'stopped', 'at', 'the', 'station', 'next', 'to', 'a', 'stop', '.', '</s>']
idx=120
keywords=['beach', 'time', 'spend']
true sentense=['<s>', 'tourists', 'spend', 'time', 'on', 'a', 'beach', '</s>']
predicted sentense=['<s>', 'people', 'spend', 'a', 'time', 'at', 'the', 'beach', '</s>']
idx=147
keywords=['hospital', 'patient', 'nurse']
true sentense=['<s>', 'nurse', 'with

# 検証データに対して予測してみる

In [None]:
# Keep only the `concepts` and `target` columns of the validation split.
valid_concepts = [example['concepts'] for example in dataset['validation']]
valid_targets = [example['target'] for example in dataset['validation']]

In [None]:
# Convert the validation words to ids with the training vocabulary; unseen words map to <UNK>.
# The id tensors are created directly as int64 instead of going through a float tensor.
unk_id = vocab[UNK_TOKEN]
tokenized_valid_concepts = [
    torch.tensor([vocab.get(word, unk_id) for word in concept_set], dtype=torch.long)
    for concept_set in valid_concepts
]
# Target sentences are additionally wrapped in the SOS / EOS markers.
tokenized_valid_targets = [
    torch.tensor(
        [vocab[SOS_TOKEN]]
        + [vocab.get(word, unk_id) for word in nltk.tokenize.word_tokenize(sentence)]
        + [vocab[EOS_TOKEN]],
        dtype=torch.long,
    )
    for sentence in valid_targets
]

In [None]:
# Pair every validation concept set with its reference sentence.
valid_dataset = MyDataset(X=tokenized_valid_concepts, Y=tokenized_valid_targets)

In [None]:
def predict_valid(idx):
    """Generate a sentence from the concepts of the idx-th validation example.

    Parameters
    ----------
    idx : int
        Index into the validation data (valid_dataset).

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        The concept words, the reference sentence and the generated sentence,
        each as a list of tokens.
    """
    concept_ids = valid_dataset[idx][0]
    target_ids = valid_dataset[idx][1]

    x = decode(concept_ids.tolist(), vocab)
    true_y = decode(target_ids.tolist(), vocab)

    generated_ids = lstm.predict(concept_ids.to(device), device=device)
    pred_y = decode(generated_ids, vocab)

    return x, true_y, pred_y

In [None]:
# Show generated sentences for the first validation examples, once per distinct concept set.
skip_keywords = set() # concept sets already handled, stored as tuples so they are hashable
for i in range(min(1000, len(valid_dataset))):
    x, true_y, pred_y = predict_valid(i)
    keywords_key = tuple(x)
    if keywords_key in skip_keywords:
        continue
    skip_keywords.add(keywords_key)
    # Number of concept words that literally appear in the generated sentence.
    score = sum(w in pred_y for w in x)

    if score > 1:
        print('='*30+f'{score=}'+'='*30)
        print(f'idx={i}')
        print(f'keywords={x}')
        print(f'true sentense={true_y}')
        print(f'predicted sentense={pred_y}')

idx=8
keywords=['cat', 'pet', 'couch']
true sentense=['<s>', 'A', 'pet', 'cat', 'likes', 'to', 'sleep', 'on', 'a', 'couch', '.', '</s>']
predicted sentense=['<s>', 'A', 'cat', 'floating', 'on', 'the', 'couch', 'of', 'the', 'small', 'home', '.', '</s>']
idx=12
keywords=['climb', 'building', 'side']
true sentense=['<s>', 'The', 'mouse', 'climbed', 'the', 'side', 'of', 'the', 'building', '.', '</s>']
predicted sentense=['<s>', 'A', 'boy', "'s", 'in', 'a', 'building', 'next', 'to', 'a', 'side', 'of', 'a', 'sheep', '.', '</s>']
idx=70
keywords=['hand', 'front', 'wave']
true sentense=['<s>', 'A', 'girl', 'waved', 'her', 'hand', 'in', 'front', 'of', 'the', 'man', '.', '</s>']
predicted sentense=['<s>', 'people', 'waving', 'their', 'own', 'hand', 'at', 'the', 'front', 'with', 'the', 'camera', 'and', 'the', 'camera', 'extended', 'closer', 'to', 'the', 'camera', 'and', 'yellow', 'butterfly', '</s>']
idx=111
keywords=['sit', 'cat', 'floor']
true sentense=['<s>', 'The', 'cat', 'is', 'sitting', 'on

## 問題2
---
GRU を使用した実装をしてみてください