## **Generate Thai Lyrics**

Most of the code is based on the Udacity Deep Learning GitHub repository.

In [None]:
!wget https://s3-us-west-2.amazonaws.com/thai-corpus/lyric_dataframe_update.csv

In [None]:
# import all libraries
import pandas as pd
from itertools import chain
from pythainlp import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
import matplotlib
import matplotlib.font_manager  # make sure the submodule is loaded before it is used

# One-off setup to install a font with Thai glyphs (uncomment on a fresh machine):
# !wget https://github.com/Phonbopit/sarabun-webfont/raw/master/fonts/thsarabunnew-webfont.ttf
# !sudo cp thsarabunnew-webfont.ttf /usr/local/lib/python3.6/dist-packages/matplotlib/mpl-data/fonts/ttf/
# !sudo cp thsarabunnew-webfont.ttf /usr/share/fonts/truetype/

# `_rebuild` is a private helper that is not present in every matplotlib
# release; refresh the font cache only when it is available so the cell
# does not crash with AttributeError.
if hasattr(matplotlib.font_manager, '_rebuild'):
    matplotlib.font_manager._rebuild()
matplotlib.rc('font', family='TH Sarabun New')
matplotlib.rc('figure', figsize=(10, 10))

In [None]:
def flatten(ls):
    """Concatenate the items of every inner iterable of ``ls`` into one list."""
    flat = []
    for inner in ls:
        flat.extend(inner)
    return flat

def clean_lyrics(lyric):
    """Drop credit/metadata lines from a raw lyric and tidy the remaining text.

    The lyric is split on newlines. Lines starting with one of the known
    header prefixes are discarded. In every kept line the characters
    ``(``, ``)`` and ``-`` become spaces and runs of whitespace collapse to a
    single space. The kept lines are joined with newlines and the result is
    stripped of leading/trailing whitespace.
    """
    # Prefixes that mark a header line rather than lyric text.
    header_prefixes = (
        'เพลง ', 'คำร้อง ', 'คำร้อง/ทำนอง ', 'ศิลปิน ', 'ทำนอง ',
        'เรียบเรียง ', 'เพลงประกอบละคร ', 'อัลบัม ', 'ร่วมร้องโดย ',
        'เนื้อร้อง/ทำนอง', 'ทำนอง/เรียบเรียง ', 'เพลงประกอบภาพยนตร์ ',
        'เพลงประกอบละครซิทคอม ', 'คำร้อง/ทำนอง/เรียบเรียง ',
        'คำร้อง/เรียบเรียง ', 'เพลงประกอบ ', 'ร้องโดย ',
        'ทำนอง / เรียบเรียง :', ' สังกัด',
    )
    # Map each of "(", ")" and "-" to a single space.
    punctuation_to_space = str.maketrans('()-', '   ')
    kept = [
        ' '.join(raw_line.translate(punctuation_to_space).split())
        for raw_line in lyric.split('\n')
        if not raw_line.startswith(header_prefixes)
    ]
    return '\n'.join(kept).strip()


def create_lookup_dict(tokenized_lyrics, n_min=None):
    """Build token <-> integer-id lookup tables ordered by token frequency.

    Parameters
    ----------
    tokenized_lyrics : iterable of hashable
        The corpus as a flat sequence of tokens.
    n_min : int or None
        When given, only tokens occurring at least ``n_min`` times are kept.

    Returns
    -------
    (vocab_to_int, int_to_vocab) : tuple of dict
        Ids start at 0 and are assigned in descending order of frequency,
        ties keeping first-seen order.
    """
    word_counts = Counter(tokenized_lyrics)
    # most_common() is a stable sort by descending count.
    sorted_vocab = [word for word, _ in word_counts.most_common()]
    if n_min is not None:
        # Filter the already sorted list so the frequency ordering survives;
        # rebuilding from word_counts.items() would fall back to insertion order.
        sorted_vocab = [word for word in sorted_vocab if word_counts[word] >= n_min]
    vocab_to_int = {word: i for i, word in enumerate(sorted_vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

In [None]:
# Read the file fetched by the download cell (it is saved under the name
# `lyric_dataframe_update.csv`), drop header lines, then tokenize each lyric.
df = pd.read_csv('lyric_dataframe_update.csv')
lyrics = df.full_lyrics.map(clean_lyrics)
tokenized_lyrics = lyrics.map(word_tokenize)

In [None]:
# NOTE: this cell rebinds `tokenized_lyrics` (per-song token lists -> one flat
# token list). Re-running it without re-running the previous cell would
# flatten the individual tokens into characters.
tokenized_lyrics = flatten(tokenized_lyrics)
# Replace newline tokens with a space. Compare by value (`==`): `is not` on a
# string literal tests object identity and is not a reliable equality check.
tokenized_lyrics = [' ' if token == '\n' else token for token in tokenized_lyrics]
word_counts = Counter(tokenized_lyrics)
vocab_to_int, int_to_vocab = create_lookup_dict(tokenized_lyrics, n_min=None)

In [None]:
import numpy as np
import torch

# Record once whether CUDA is usable; later cells branch on this flag.
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    # Training still runs on the CPU, this is only a warning.
    print('No GPU found. Please use a GPU to train your neural network.')

In [None]:
from torch.utils.data import TensorDataset, DataLoader

def batch_data(words, sequence_length, batch_size, shuffle=False):
    """Turn a token-id sequence into a DataLoader of (window, next-token) pairs.

    ref: Udacity

    Parameters
    ----------
    words : sequence of int
        Token ids of the whole corpus.
    sequence_length : int
        Number of consecutive ids in each input window.
    batch_size : int
        Batch size of the returned DataLoader.
    shuffle : bool
        Forwarded to the DataLoader.

    Returns
    -------
    torch.utils.data.DataLoader
        Yields ``(X, target)`` where ``X`` has shape
        ``(batch, sequence_length)`` and ``target`` holds the id that follows
        each window. Both are int64 tensors.
    """
    # Keep only a whole number of (batch_size * sequence_length) chunks.
    batch_size_total = batch_size * sequence_length
    n_batches = len(words) // batch_size_total
    words = words[: n_batches * batch_size_total]

    # One sliding window per start position that still has a following token.
    n_windows = max(len(words) - sequence_length, 0)
    windows = [words[n: n + sequence_length] for n in range(n_windows)]
    next_ids = [words[n + sequence_length] for n in range(n_windows)]

    # Explicit int64: the downstream embedding/loss expect LongTensors, and the
    # default integer dtype of np.array is platform dependent. The reshape
    # keeps X two-dimensional even when there are no windows.
    X = np.array(windows, dtype=np.int64).reshape(-1, sequence_length)
    target = np.array(next_ids, dtype=np.int64)
    dataset = TensorDataset(torch.from_numpy(X), torch.from_numpy(target))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Encode the corpus as integer ids; a token missing from the vocabulary maps to id 0.
tokenized_indices = list(map(lambda token: vocab_to_int.get(token, 0), tokenized_lyrics))

## **LSTM Model**

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Embedding -> stacked LSTM -> linear layer scoring the next token.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    output_size : int
        Number of scores produced per input sequence.
    embedding_dim : int
        Size of each token embedding.
    hidden_dim : int
        LSTM hidden state size.
    n_layers : int
        Number of stacked LSTM layers.
    dropout : float
        Dropout between LSTM layers.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5):
        super(RNN, self).__init__()
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.embedding = nn.Embedding(num_embeddings=self.vocab_size,
                                      embedding_dim=self.embedding_dim)
        # Inter-layer dropout has no effect with a single layer and only
        # triggers a warning there, so pass 0 in that case.
        self.lstm = nn.LSTM(input_size=self.embedding_dim,
                            hidden_size=self.hidden_dim,
                            dropout=self.dropout if self.n_layers > 1 else 0.0,
                            num_layers=self.n_layers,
                            batch_first=True)
        self.fc = nn.Linear(self.hidden_dim, self.output_size)

    def forward(self, nn_input, hidden):
        """Score the token that follows each input sequence.

        ``nn_input`` is a (batch, seq_len) tensor of token ids and ``hidden``
        the (h, c) LSTM state. Returns ``(output, hidden)`` where ``output``
        has shape (batch, output_size) and ``hidden`` is the updated state.
        """
        embedding_input = self.embedding(nn_input)
        nn_output, hidden = self.lstm(embedding_input, hidden)

        # Only the last time step is returned, so project just that step
        # instead of running the linear layer over every position.
        output = self.fc(nn_output[:, -1, :])

        # return one batch of output word scores and the hidden state
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zeroed (h, c) state, each of shape (n_layers, batch_size, hidden_dim).

        The tensors are created with the dtype and on the device of the
        model's own parameters, so the state always matches wherever the
        model currently lives.
        """
        weight = next(self.parameters()).detach()
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [None]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """Run one optimisation step on a single batch.

    Parameters
    ----------
    rnn : nn.Module
        Model called as ``rnn(inp, hidden) -> (output, hidden)``.
    optimizer : torch.optim.Optimizer
        Optimizer over ``rnn``'s parameters.
    criterion : callable
        Loss taking ``(output, target)``.
    inp, target : torch.Tensor
        Input batch and its targets.
    hidden : tuple of torch.Tensor
        Recurrent state carried over from the previous batch.

    Returns
    -------
    (float, tuple)
        The batch loss and the hidden state produced by the model.
    """
    # Put the batch on whichever device the model lives on.
    device = next(rnn.parameters()).device
    inp, target = inp.to(device), target.to(device)

    # Detach the carried state so gradients do not flow into earlier batches.
    hidden = tuple(each.detach() for each in hidden)

    # perform backpropagation and optimization
    rnn.zero_grad()

    output, hidden = rnn(inp, hidden)
    # Keep the batch dimension explicit: a bare squeeze() would also drop it
    # for a batch of one and make the loss call fail.
    loss = criterion(output.reshape(target.size(0), -1), target)
    loss.backward()

    clip = 5.0 # gradient clipping
    nn.utils.clip_grad_norm_(rnn.parameters(), clip)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), hidden

In [None]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100, loader=None):
    """Train ``rnn`` for ``n_epochs`` epochs and return it.

    Parameters
    ----------
    rnn : nn.Module
        Model providing ``init_hidden(batch_size)``.
    batch_size : int
        Batch size the loader was built with; used to size the hidden state
        and to skip a trailing partial batch.
    optimizer, criterion
        Passed through to ``forward_back_prop``.
    n_epochs : int
        Number of passes over the loader.
    show_every_n_batches : int
        Print the mean loss since the last report every this many batches.
    loader : DataLoader or None
        Batches to train on. Defaults to the notebook-level ``train_loader``.
    """
    if loader is None:
        loader = train_loader

    # Only full batches are used: the hidden state has a fixed batch size.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []
    rnn.train()

    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):

        # start every epoch from a zeroed recurrent state
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if batch_i > n_batches:
                break
            # forward, back prop
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            # record loss
            batch_losses.append(loss)

            # printing loss stats
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                batch_losses = []

    # returns a trained rnn
    return rnn

In [None]:
# --- optimisation settings ---
num_epochs = 15
learning_rate = 0.001
batch_size = 128
show_every_n_batches = 5000  # loss-report interval, in batches

# --- model dimensions ---
vocab_size = len(vocab_to_int)
output_size = vocab_size  # one output score per vocabulary entry
embedding_dim = 300
hidden_dim = 300
n_layers = 2

# --- data windowing ---
sequence_length = 20  # tokens per input window

In [None]:
# Build the loader from the configured `sequence_length` rather than a
# repeated literal, so windowing stays in step with the value `generate` uses.
train_loader = batch_data(tokenized_indices,
                          sequence_length=sequence_length,
                          batch_size=batch_size,
                          shuffle=True)
rnn = RNN(vocab_size, output_size, embedding_dim,
          hidden_dim, n_layers, dropout=0.3)
if train_on_gpu:
    rnn.cuda()

optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# training the model
# Use the configured `learning_rate` instead of repeating the literal, so
# changing the parameter cell actually affects this run.
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
trained_rnn = train_rnn(rnn, batch_size, optimizer,
                        criterion, num_epochs,
                        show_every_n_batches)

In [None]:
# Second pass over the same model with a smaller learning rate.
optimizer = torch.optim.Adam(params=rnn.parameters(), lr=0.0002)
trained_rnn = train_rnn(
    rnn=rnn,
    batch_size=batch_size,
    optimizer=optimizer,
    criterion=criterion,
    n_epochs=15,
    show_every_n_batches=show_every_n_batches,
)

In [None]:
# save model
# The later cells load 'lstm_model.pt', so write to exactly that path.
torch.save(trained_rnn, 'lstm_model.pt')

## **Lyrics generation**

In [None]:
import torch.nn.functional as F

# load trained model
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# Without CUDA, remap the stored tensors to the CPU so a model saved on a GPU
# machine can still be loaded.
trained_rnn = torch.load('lstm_model.pt',
                         map_location=None if torch.cuda.is_available() else 'cpu')

def generate(rnn, prime_id, int_to_vocab, pad_value, predict_len=100, top_k=100, seq_length=None):
    """Generate text by repeatedly sampling the next token from ``rnn``.

    Parameters
    ----------
    rnn : nn.Module
        Model called as ``rnn(seq, hidden) -> (scores, hidden)`` that also
        provides ``init_hidden(batch_size)``.
    prime_id : int
        Id of the token the generation starts from.
    int_to_vocab : dict
        Maps token ids to token strings.
    pad_value : int
        Id used to left-pad the initial window.
    predict_len : int
        Number of tokens to sample after the prime token.
    top_k : int
        Sample only among the ``top_k`` most probable tokens (clamped to the
        number of scores the model returns).
    seq_length : int or None
        Length of the input window. Defaults to the notebook-level
        ``sequence_length``.

    Returns
    -------
    str
        The prime token followed by the sampled tokens, concatenated
        without a separator.
    """
    if seq_length is None:
        seq_length = sequence_length

    rnn.eval()
    device = next(rnn.parameters()).device

    # The rolling window stays a NumPy array for the whole loop; a tensor copy
    # is made per step. Rolling the tensor itself breaks once it is on CUDA.
    current_seq = np.full((1, seq_length), pad_value, dtype=np.int64)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        for _ in range(predict_len):
            seq_tensor = torch.from_numpy(current_seq).to(device)

            # initialize the hidden state
            hidden = rnn.init_hidden(seq_tensor.size(0))

            # get the output of the rnn
            output, _ = rnn(seq_tensor, hidden)

            # get the next word probabilities
            p = F.softmax(output, dim=1).cpu()

            # use top_k sampling to get the index of the next word;
            # clamp k so small vocabularies do not make topk() fail
            k = min(top_k, p.size(1))
            p, top_i = p.topk(k)
            # reshape(-1) keeps the arrays 1-D even when k == 1
            top_i = top_i.numpy().reshape(-1)
            p = p.numpy().reshape(-1).astype(np.float64)

            # select the likely next word index with some element of randomness
            word_i = int(np.random.choice(top_i, p=p / p.sum()))

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # the generated word becomes the last element of the next window
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    return ''.join(predicted)

In [None]:
# Sample 200 tokens starting from the prime word; an unknown prime word falls back to id 0.
gen_length = 200
prime_word = 'ใคร'
generated_script = generate(
    rnn=trained_rnn,
    prime_id=vocab_to_int.get(prime_word, 0),
    int_to_vocab=int_to_vocab,
    pad_value=0,
    predict_len=gen_length,
)
print(generated_script)

## **TSNE of embedding layer**

Here, we sample embeddings from the embedding layer and use t-SNE to visualize the words.

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns
import torch

In [None]:
# Without CUDA, remap the stored tensors to the CPU so a checkpoint saved on a
# GPU machine can still be loaded.
rnn = torch.load('lstm_model.pt',
                 map_location=None if torch.cuda.is_available() else 'cpu')
# Hand scikit-learn a plain NumPy array of the embedding table (one row per
# vocabulary id) rather than wrapping it in a torch.Tensor again.
word_vectors = rnn.embedding.weight.detach().cpu().numpy()
tsne = TSNE(n_components=2)

word_vectors_proj = tsne.fit_transform(word_vectors)

In [None]:
# Pair each vocabulary word (in id order) with its 2-D projection.
word_vectors_proj_df = pd.DataFrame(
    [(word, xy[0], xy[1]) for word, xy in zip(vocab_to_int, word_vectors_proj)],
    columns=['word', 'x', 'y'],
)
# Keep only the 1500 most frequent words, together with their counts.
word_counts_df = pd.DataFrame(word_counts.most_common(1500),
                              columns=['word', 'n_word'])
word_vectors_proj_sel = pd.merge(word_vectors_proj_df, word_counts_df, on='word')

In [None]:
matplotlib.rc('figure', figsize=(20, 20))

# Show at most 300 words. Clamping avoids an error when fewer rows are
# available, and a dedicated name avoids overwriting the lyrics `df`.
n_words_shown = min(300, len(word_vectors_proj_sel))
words_to_plot = word_vectors_proj_sel.sample(n=n_words_shown)
ax = sns.regplot(data=words_to_plot,
                 x="x", y="y", fit_reg=False,
                 marker="+", color="skyblue")
# Label every plotted point with its word.
for row in words_to_plot.itertuples(index=False):
    ax.text(row.x, row.y, row.word,
            horizontalalignment='left',
            size='large', color='black', weight='semibold')

## **Snippet to scrape data**

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
def scrape_siamzone(d):
    """Download and parse one siamzone.com lyric page.

    Parameters
    ----------
    d : int
        Numeric page id appended to the lyric URL.

    Returns
    -------
    dict
        Keys ``url``, ``soup`` (the parsed page), ``title``, ``artist_name``,
        ``n_shares`` and ``full_lyrics``.

    Raises
    ------
    requests.RequestException
        On network errors, timeouts or a non-success HTTP status.
    ValueError, AttributeError
        When the page does not have the expected structure (e.g. the title
        is not of the form ``"<title> | <artist>"`` or an element is missing).
    """
    url = 'https://www.siamzone.com/music/thailyric/%d' % d
    # A timeout keeps one unresponsive request from hanging the whole crawl;
    # raise_for_status() fails fast on error pages instead of parsing them.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    title, artist_name = soup.find('title').text.split('|')
    title, artist_name = title.strip(), artist_name.strip()
    # The share counter is rendered with thousands separators.
    n_shares = int(soup.find('span', attrs={'class': 'sz-social-number'}).text.replace(',', ''))
    full_lyrics = soup.find('div', attrs={'itemprop': 'articleBody'}).text.strip()
    return {
        'url': url,
        'soup': soup,
        'title': title,
        'artist_name': artist_name,
        'n_shares': n_shares,
        'full_lyrics': full_lyrics
    }

scraped_siamzone = []
failed_ids = []  # page ids that could not be downloaded or parsed
for i in range(14050, 16041):
    try:
        scraped_siamzone.append(scrape_siamzone(i))
    except Exception:
        # Best effort: skip pages that fail, but remember which ones. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        failed_ids.append(i)
print('Scraped %d pages, skipped %d' % (len(scraped_siamzone), len(failed_ids)))

scraped_siamzone_df = pd.DataFrame(scraped_siamzone)
scraped_siamzone_df['lyrics'] = scraped_siamzone_df.full_lyrics.map(clean_lyrics)