In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split

import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import BPTTIterator, ReversibleField
from torchtext.datasets import LanguageModelingDataset

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('albums.csv')

In [4]:
df['title'] = df['title'] + '\n'

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_text = train_df['title'].sum()
with open('train.txt', 'w') as f:
    f.write(train_text)

test_text = test_df['title'].sum()
with open('test.txt', 'w') as f:
    f.write(test_text)

In [5]:
TEXT = ReversibleField(use_vocab=True,
             tokenize=list,
             batch_first=True,
             lower=True
            )

train, test = LanguageModelingDataset.splits(path='.',
                                             train='train.txt',
                                             test='test.txt',
                                             text_field=TEXT,
                                             newline_eos=True)

In [6]:
TEXT.build_vocab(train)

In [7]:
TEXT.vocab.itos[:10]

[' UNK ', '<pad>', 'e', ' ', 'a', 't', 'o', 'i', 'n', 'r']

In [8]:
len(TEXT.vocab.itos)

435

In [9]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                           batch_first=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.drop = tt.nn.Dropout()
        
        self.init_weights()
        
    def init_weights(self):
        nn.init.uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        total_length = x.size(-1)
        
        x = self.embedding(x)
        x = self.drop(x)
        
        x, hidden = self.rnn(x, hidden)
         
        x = self.drop(x)
        x = x.contiguous().view(batch_size * total_length, -1)
        x = self.fc(x)
        x = x.contiguous().view(batch_size, total_length, -1)
        return x, hidden
    
    def init_hidden(self, batch):
        return (tt.rand(2, batch, 128, requires_grad=True),
                tt.rand(2, batch, 128, requires_grad=True))

In [10]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    model.train()

    running_loss = 0
    
    n_batches = len(iterator)
    iterator = tqdm_notebook(iterator,
                             total=n_batches,
                             desc='epoch %d' % (curr_epoch),
                             leave=True)

    hidden = model.init_hidden(30)
    
    for i, batch in enumerate(iterator):
        if batch.text.size(0) != 30:
            continue
        
        optimizer.zero_grad()
        hidden = (hidden[0].detach(), hidden[1].detach())
        
        pred, hidden = model(batch.text, hidden)
        pred_flat = pred.view(-1, len(TEXT.vocab.itos))
        loss = criterion(pred_flat, batch.target.view(-1))
        loss.backward()
        optimizer.step()

        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss

def _test_epoch(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    hidden = model.init_hidden(30)

    with tt.no_grad():
        for batch in iterator:
            if batch.text.size(0) != 30:
                continue

            pred, hidden = model(batch.text, hidden)
            pred_flat = pred.view(-1, len(TEXT.vocab.itos))
            loss = criterion(pred_flat, batch.target.view(-1))
            epoch_loss += loss.data.item()
            hidden = (hidden[0].detach(), hidden[1].detach())

    return 2 ** (epoch_loss / n_batches)


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):

    prev_perplexity = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        perplexity = _test_epoch(model,  valid_iterator, criterion)

        print('perplexity %.5f' % perplexity)

        record = {'epoch': epoch, 'train_loss': train_loss, 'perplexity': perplexity}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if perplexity > prev_perplexity:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.perplexity == history.perplexity.min()].iloc[0]
                print('Early stopping! best epoch: %d perplexity %.5f' % (best_epoch['epoch'],
                                                                          best_epoch['perplexity']))
                break

            prev_perplexity = min(prev_perplexity, perplexity)

    return model

In [11]:
batch_size = 32

model = MyModel(vocab_size=len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128
               )

train_iterator, test_iterator = BPTTIterator.splits(
    (train, test),
    bptt_len=30,
    batch_sizes=(batch_size, batch_size),
    shuffle=True,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True
)

optimizer = optim.Adam(model.parameters())
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

criterion = nn.CrossEntropyLoss()

In [12]:
res_model = nn_train(model, train_iterator, test_iterator,
                     criterion, optimizer, scheduler=scheduler, 
                     n_epochs=50, early_stopping=5)


perplexity 6.53624



perplexity 6.25695



perplexity 6.14091



perplexity 6.09788



perplexity 6.07333



perplexity 6.05553



perplexity 6.04124



perplexity 6.02980



perplexity 6.02090



perplexity 6.01505



perplexity 6.00931



perplexity 6.00600



perplexity 6.00316



perplexity 6.00013



perplexity 5.99833



perplexity 5.99705



perplexity 5.99561



perplexity 5.99699



perplexity 5.99589



perplexity 5.99560



perplexity 5.99630



perplexity 5.99738



perplexity 5.99724



perplexity 5.99759



perplexity 5.99842
Early stopping! best epoch: 19 perplexity 5.99560


In [13]:
def get_new_band_name(model, prime_str='<eos>', max_len=20):
    hidden = model.init_hidden(1)
    inp = tt.tensor(TEXT.vocab.itos.index(prime_str)).unsqueeze(0).unsqueeze(0).long()
    predicted = ''

    _, hidden = model(inp, hidden)
            
    for p in range(max_len):
        
        output, hidden = model(inp, hidden)
        
        output_dist = output.data.view(-1).div(0.8).exp()
        top_i = tt.multinomial(output_dist, 1)[0]

        predicted_char = TEXT.vocab.itos[top_i]       
        if predicted_char == '<eos>':
            return predicted

        predicted += predicted_char
        inp = tt.tensor(TEXT.vocab.itos.index(predicted_char)).unsqueeze(0).unsqueeze(0).long()

    return predicted

In [14]:
tt.save(res_model, 'model.pt')

In [37]:
for i in range(10):
    print(get_new_band_name(res_model))

ct ticthelarg
wigilloricath iathe 
aco
imucer
thuthe & og gnck
fl derisinctesthe
thyee re
f in wilyouncappelin
memoverma patrlis wa
ve
