# 專題（一）：訓練LSTM之歌詞自動填詞器

## 專案目標
- 目標：使用 LSTM 模型去學習五月天歌詞，並且可以自動填詞來產生歌詞
- mayday_lyrics.txt 資料說明：
    - 每一行都是一首歌的歌詞
    - 除去標點符號並以空白表示間隔
- 利用 mayday_lyrics.txt 來產生歌詞的序列
- 使用 LSTM 模型去學習歌詞的序列
- 當我們給定開頭的一段歌詞，例如：「給我一首歌」，就可以用 LSTM 猜下一個字，反覆這個過程就可以自動填詞

## 實作提示
- STEP1：從 mayday_lyrics.txt 中取出歌詞
- STEP2：建立每個字的 Index
- STEP3：用 Rolling 的方式打造 LyricsDataset
- STEP4：使用 DataLoader 來包裝 LyricsDataset
- STEP5：建立 LSTM 模型： inputs > nn.Embedding > nn.LSTM > nn.Dropout > 取最後一個 state > nn.Linear > softmax
- STEP6：開始訓練並調整參數
- STEP7：進行 Demo，給定 pre_text ，使用模型迭代的預測下一個字產生歌詞
- (進階) STEP8：在 Demo 時可以採用依照 Softmax 機率來作隨機採樣，這可以增加隨機性，讓歌詞有更多變化，當然你還可以使用機率閾值來避免太奇怪的字出現

## 重要知識點：專題結束後你可以學會
- 如何讀取並處理需要 Rolling 的序列資料
- 了解如何用 PyTorch 建置一個 LSTM 的模型
- 學會如何訓練一個語言模型
- 學會如何隨機抽樣自 Softmax 的分布

In [1]:
import pandas as pd
import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import random_split
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [2]:
# from: https://github.com/gaussic/Chinese-Lyric-Corpus

# Read the corpus into a list of stripped lines. The context manager
# guarantees the file handle is closed as soon as the list is built,
# instead of leaking it until garbage collection.
with open('mayday_lyrics.txt', encoding="utf-8") as lyrics_file:
    lyrics_list = [line.strip() for line in lyrics_file]

In [3]:
# Build the character <-> index lookup tables
word2index = {}
for lyric in lyrics_list:
    for char in lyric:
        # setdefault only assigns the next free index the first time a
        # character is seen, so indices follow first-occurrence order.
        word2index.setdefault(char, len(word2index))

# Reverse mapping: index -> character
index2word = {index: char for char, index in word2index.items()}

In [4]:
# Display the vocabulary size: the number of distinct characters indexed in word2index.
len(word2index)

2101

In [5]:
# Length, in characters, of each sliding window taken from a lyric line
num_unrollings = 10

# Slide a window of num_unrollings characters over every line, one step at a
# time; lines shorter than the window produce no samples.
samples = [
    lyric[start:start + num_unrollings]
    for lyric in lyrics_list
    for start in range(len(lyric) - num_unrollings + 1)
]
samples[:11]

['摸不到的顏色 是否叫',
 '不到的顏色 是否叫彩',
 '到的顏色 是否叫彩虹',
 '的顏色 是否叫彩虹 ',
 '顏色 是否叫彩虹 看',
 '色 是否叫彩虹 看不',
 ' 是否叫彩虹 看不到',
 '是否叫彩虹 看不到的',
 '否叫彩虹 看不到的擁',
 '叫彩虹 看不到的擁抱',
 '彩虹 看不到的擁抱 ']

In [6]:
# Build the dataset
class LyricsDataset(Dataset):
    """Sliding-window dataset over lyric lines.

    Every line in ``lyrics_list`` is cut into all contiguous windows of
    ``num_unrollings`` characters. For each window, the leading characters
    form the model input and the final character is the prediction target.

    Args:
        lyrics_list: iterable of lyric strings.
        word2index: mapping from a character to its integer index.
        num_unrollings: window length in characters (input plus target).
    """

    def __init__(self, lyrics_list, word2index, num_unrollings=10):
        self.word2index = word2index
        # Lines shorter than the window yield an empty range, hence no samples.
        self.samples = [
            lyric[start:start + num_unrollings]
            for lyric in lyrics_list
            for start in range(len(lyric) - num_unrollings + 1)
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(input_indices, target_index)`` as ``torch.long`` tensors."""
        token_ids = [self.word2index[char] for char in self.samples[idx]]
        context = torch.tensor(token_ids[:-1], dtype=torch.long)
        target = torch.tensor(token_ids[-1], dtype=torch.long)
        return context, target

In [7]:
# Number of samples per mini-batch
batch_size = 128

# Sliding-window samples built with the dataset's default window length
dataset = LyricsDataset(lyrics_list, word2index)

# Draw the samples in a new random order on every pass
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
# Build the model
class LM_LSTM(nn.Module):
    """Next-character language model: Embedding -> LSTM -> Dropout -> Linear.

    Args:
        n_hidden: size of both the embedding vectors and the LSTM hidden state.
        vocab_size: number of distinct tokens (embedding rows / output logits).
        num_layers: number of stacked LSTM layers.
        dropout_ratio: dropout probability, used both between LSTM layers and
            on the LSTM outputs.
    """

    def __init__(self, n_hidden, vocab_size, num_layers, dropout_ratio):
        super().__init__()
        self.n_hidden, self.vocab_size, self.num_layers = n_hidden, vocab_size, num_layers

        self.dropout = nn.Dropout(p=dropout_ratio)
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=n_hidden)
        # Default (non batch_first) layout: the LSTM consumes (seq_len, batch, feature).
        self.lstm = nn.LSTM(n_hidden, n_hidden, num_layers, dropout=dropout_ratio)
        self.fc = nn.Linear(n_hidden, vocab_size)

    def forward(self, inputs):
        """Return unnormalised logits over the vocabulary for the next token.

        ``inputs`` holds token indices laid out as (batch, seq_len); the result
        has one row of ``vocab_size`` logits per batch element.
        """
        # (batch, seq_len, n_hidden) -> (seq_len, batch, n_hidden)
        seq_first = torch.transpose(self.embed(inputs), 0, 1)

        hidden_states, _ = self.lstm(seq_first)
        # Dropout is applied to every time step, then only the last step is kept.
        last_state = self.dropout(hidden_states)[-1]
        return self.fc(last_state)

In [9]:
def train_batch(model, data, criterion, optimizer, device):
    """Run a single optimisation step on one batch and return its loss.

    Args:
        model: module called on the inputs to produce predictions.
        data: pair ``(inputs, targets)`` of tensors.
        criterion: loss callable taking ``(outputs, targets)``.
        optimizer: optimizer updating ``model``'s parameters.
        device: device both tensors are moved to before the forward pass.

    Returns:
        The batch loss as a Python float.
    """
    # Enable training-time behaviour (e.g. dropout) before the forward pass.
    model.train()
    inputs, targets = (tensor.to(device) for tensor in data)

    loss = criterion(model(inputs), targets)

    # Clear stale gradients, back-propagate, then update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

In [10]:
# Train the model
epochs = 100
lr = 0.001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LM_LSTM(128, len(word2index), 2, 0.3)
model.to(device)

# Sum the per-sample losses (instead of averaging per batch) so that dividing
# the epoch total by the number of samples below gives the mean loss per
# sample. `reduction='sum'` is the supported spelling of the deprecated
# `size_average=False`.
criterion = nn.CrossEntropyLoss(reduction='sum')
criterion.to(device)

optimizer = optim.Adam(model.parameters(), lr=lr)


for epoch in range(1, 1 + epochs):
    tot_train_loss = 0
    tot_train_count = 0

    for train_data in train_loader:
        loss = train_batch(model, train_data, criterion, optimizer, device)

        tot_train_loss += loss
        tot_train_count += train_data[0].size(0)

    print('epoch ', epoch, 'train_loss: ', tot_train_loss / tot_train_count)

    if epoch % 10 == 0:
        # Show a few greedy predictions. Switch to eval mode so dropout does
        # not perturb them, and skip autograd bookkeeping; train_batch puts
        # the model back into train mode on the next batch.
        model.eval()
        with torch.no_grad():
            for idx in [0, 50, 99]:
                input_batch = dataset[idx][0].unsqueeze(0).to(device)
                predict = model(input_batch).argmax(dim=-1).item()
                print('Example: "{}"+"{}"'.format(dataset.samples[idx][:-1], index2word[predict]))



epoch  1 train_loss:  5.65644539985612
epoch  2 train_loss:  5.263513305908318
epoch  3 train_loss:  4.994525241053509
epoch  4 train_loss:  4.753529821982881
epoch  5 train_loss:  4.539069605684454
epoch  6 train_loss:  4.333671154475996
epoch  7 train_loss:  4.155282433800497
epoch  8 train_loss:  3.991806651645844
epoch  9 train_loss:  3.8339239407837304
epoch  10 train_loss:  3.6884352194692727
Example: "摸不到的顏色 是否"+"要"
Example: " 只留下結果 時間"+"人"
Example: "麼多的燦爛的夢 以"+"為"
epoch  11 train_loss:  3.555434479628527
epoch  12 train_loss:  3.422983957487341
epoch  13 train_loss:  3.3094714500068996
epoch  14 train_loss:  3.2054917369992055
epoch  15 train_loss:  3.099463789831735
epoch  16 train_loss:  2.997106836811381
epoch  17 train_loss:  2.9067579838417243
epoch  18 train_loss:  2.8193396126867944
epoch  19 train_loss:  2.73430210017063
epoch  20 train_loss:  2.660477321575358
Example: "摸不到的顏色 是否"+"叫"
Example: " 只留下結果 時間"+"變"
Example: "麼多的燦爛的夢 以"+"為"
epoch  21 train_loss:  2.5914293748

In [11]:
# Model inference: repeatedly sample the next character and append it
pre_text = '給我一首歌'
generate_len = 50
# Characters whose probability is not above this value are never sampled
prob_threshold = 0.01

result = [word2index[c] for c in pre_text]

# Disable dropout and gradient tracking while generating; otherwise the
# predicted distribution is perturbed by dropout noise.
model.eval()
with torch.no_grad():
    for _ in range(generate_len):
        # The whole sequence generated so far is fed back as a batch of one.
        input_example = torch.tensor([result], dtype=torch.long, device=device)
        logit = model(input_example)

        prob = F.softmax(logit, dim=-1)
        # Zero out unlikely characters, then sample from what remains
        # (torch.multinomial accepts unnormalised non-negative weights).
        probs = torch.where(prob > prob_threshold, prob, torch.zeros_like(prob))
        if not (probs > 0).any().item():
            # Every character fell below the threshold (a flat distribution);
            # multinomial would raise on all-zero weights, so fall back to the
            # unfiltered probabilities.
            probs = prob
        predict = torch.multinomial(probs, 1).item()
        result.append(predict)
print(''.join(index2word[i] for i in result))

給我一首歌子 已經不能原諒我相過 時間變成那們館的 孩復就算的結局 誰能看見 你是不去 太大的人 每天的平圍 
