# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  

<img src="images/example.png">

<img src="images/example2.png">

Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import pandas as pd
import numpy as np
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchtext.data import Field, LabelField, BucketIterator, ReversibleField, TabularDataset, BPTTIterator
from torch.distributions.distribution import Distribution
#from tqdm import tqdm_notebook
from tqdm.autonotebook import tqdm
from random import shuffle

SEED = 42
import random
import string



In [2]:
data = pd.read_csv('bands.csv')

In [3]:
data.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


In [4]:
data.shape

(37723, 8)

In [5]:
data_t = list(data['name'])
shuffle(data_t)

with open('bands.txt', 'a') as f:
    for t in data_t:
        f.write(t + '\n')

In [6]:
with open('bands.txt', 'r') as file, open('train.txt', 'a') as file_w1, open('val.txt', 'a') as file_w2:
    lines = file.readlines()
    sep = int(0.9 * len(lines))
    i = 0
    for line in lines:
        i += 1
        if i < sep:
            file_w1.write(line)
        else:
            file_w2.write(line)

In [24]:
with open('bands.txt', 'r') as f:
    with open('train.txt', 'a') as f_tr:
        with open('val.txt', 'a') as f_val:
            ln = f.readlines()
            i = 0
            mark = int(0.9 * len(ln))
            
            for line in ln:
                i += 1
                if i < mark:
                    f_tr.write(line)
                else:
                    f_val.write(line)

In [7]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(MyModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        batch_size = input.size(0)
        encoded = self.encoder(input)
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        
        return output, hidden

    def init_hidden(self, batch_size):
        
        return Variable(tt.zeros(self.n_layers, batch_size, self.hidden_size))

In [8]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [9]:
all_ch = string.printable
char_len = len(all_ch)

In [10]:
def char_tensor(string):
    tensor = tt.zeros(len(string)).long()
    for c in range(len(string)):
        try:
            tensor[c] = all_ch.index(string[c])
        except:
            continue
    return tensor


def random_training_set(chunk_len, batch_size, file, file_len):
    
    inp = tt.LongTensor(batch_size, chunk_len)
    target = tt.LongTensor(batch_size, chunk_len)
    
    for bi in range(batch_size):
        start_index = random.randint(0, file_len - chunk_len)
        end_index = start_index + chunk_len + 1
        chunk = file[start_index:end_index]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])
        
    inp = Variable(inp)
    target = Variable(target)
    
    return inp, target


def perplexity(x):
    return 2**x


def _train_epoch(inp, target, model, optimizer, criterion, curr_epoch):

    decoder.train()
    hidden = decoder.init_hidden(batch_size)
    decoder.zero_grad()
    running_loss = 0
    perplexities = []
    
    for c in range(chunk_len):
        optimizer.zero_grad()
        
        output, hidden = decoder(inp[:,c], hidden)
        loss = criterion(output.view(batch_size, -1), target[:,c])
        perplexities.append(perplexity(loss.item()))
        
        curr_loss = loss.data.cpu().detach().item()
        loss_smoothing = c / (c+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss
    
    PERPLEXITY = np.mean(perplexities)
    loss.backward()
    optimizer.step()

    return running_loss, PERPLEXITY

def _test_epoch(inp, target, model, criterion):
    model.eval()
    epoch_loss = 0
    hidden = decoder.init_hidden(batch_size)
    loss = 0
    perplexities = []
    
    with tt.no_grad():
        for c in range(chunk_len):
            output, hidden = decoder(inp[:,c], hidden)
            loss = criterion(output.view(batch_size, -1), target[:,c])
            perplexities.append(perplexity(loss.item()))
            epoch_loss += loss.data.item()
    PERPLEXITY = np.mean(perplexities)
    
    return epoch_loss / chunk_len, PERPLEXITY


def nn_train(model, criterion, optimizer, n_epochs=100, scheduler=None, early_stopping=0):

    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()
    train_losses = []
    valid_losses = []
    
    for epoch in range(n_epochs):
        train_loss, train_per = _train_epoch(*random_training_set(300, batch_size, file_train, file_train_),
                                             model, optimizer, criterion, epoch)
        valid_loss, valid_per = _test_epoch(*random_training_set(300, batch_size, file_val, file_val_),
                                            model, criterion)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        if epoch % 100 == 0 or epoch == n_epochs-1:
            print('Epoch %s <--> Valid loss %.5f <--> Train loss %.5f <--> Valid perplexity %.5f <--> Train perplexity %.5f' % (str(epoch),
                                                                                                                          valid_loss,
                                                                                                                          train_loss,
                                                                                                                          valid_per,
                                                                                                                          train_per,
                                                                                                                          ))

In [11]:

with open('train.txt', 'r') as f:
    file_train = f.read()
file_train_ = len(file_train)


with open('val.txt', 'r') as f:
    file_val = f.read()
file_val_ = len(file_val)

In [12]:
hidden_size = 100
batch_size = 32
chunk_len = 200

decoder = MyModel(len(all_ch),
                  hidden_size, 
                  len(all_ch))

optimizer = tt.optim.Adam(decoder.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [13]:
nn_train(decoder, criterion, optimizer, n_epochs=1000)
tt.save(decoder, 'res.pt')

Epoch 0 <--> Valid loss 4.43708 <--> Train loss 4.61319 <--> Valid perplexity 21.67876 <--> Train perplexity 24.47978
Epoch 100 <--> Valid loss 2.71847 <--> Train loss 2.72899 <--> Valid perplexity 6.65616 <--> Train perplexity 6.73321
Epoch 200 <--> Valid loss 2.71121 <--> Train loss 2.76085 <--> Valid perplexity 6.63945 <--> Train perplexity 6.92165
Epoch 300 <--> Valid loss 2.74093 <--> Train loss 2.72001 <--> Valid perplexity 6.80755 <--> Train perplexity 6.71317
Epoch 400 <--> Valid loss 2.73104 <--> Train loss 2.71503 <--> Valid perplexity 6.75537 <--> Train perplexity 6.66524
Epoch 500 <--> Valid loss 2.70650 <--> Train loss 2.69153 <--> Valid perplexity 6.64979 <--> Train perplexity 6.58943
Epoch 600 <--> Valid loss 2.66933 <--> Train loss 2.63552 <--> Valid perplexity 6.48999 <--> Train perplexity 6.32989
Epoch 700 <--> Valid loss 2.69908 <--> Train loss 2.71832 <--> Valid perplexity 6.62997 <--> Train perplexity 6.72221
Epoch 800 <--> Valid loss 2.70679 <--> Train loss 2.6909

  "type " + obj.__name__ + ". It won't be checked "


In [14]:
def group_names(decoder, prime_str='\n', predict_len=50, temperature=0.8):

    hidden = decoder.init_hidden(1)
    prime_input = char_tensor(prime_str).unsqueeze(0)
    predicted = ''

    for p in range(len(prime_str) - 1):
        _, hidden = decoder(prime_input[:,p], hidden)
        
    inp = prime_input[:,-1]
    
    for p in range(predict_len):
        output, hidden = decoder(inp, hidden)
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = tt.multinomial(output_dist, 1)[0]
        predicted_char = all_ch[top_i]
        
        if predicted and predicted_char == '\n':
            break
        else:
            predicted += predicted_char
            inp = char_tensor(predicted_char).unsqueeze(0)

    return predicted

In [15]:
filename = 'res.pt'
decoder = tt.load(filename)

In [16]:
for x in range(30):
    print(group_names(decoder))

Lostuss Cett of forve
Cyreecetrous
Cidthon Cit
Cyric
Sagapackavigus
Ceroit
Cecith
Givied
Gonionttional tous
Siperestion
Klof Dpivee
Dehan iesos Mereent
L0wate of Wheredasspingus
Gre Miton Retiied
Whityn
Llaning untes
Dawkiod
Morctied
Bured Cescienioning
Coreig
Oriped
Lapiit
Frive
Bulthtion of Rouionion Andis
Onfofusissee
Lesvid
Lortetiund Outhecres
Bryn
Wabion
Jued Cresia
