In [269]:
import torch
import numpy as np

from torch import nn, optim
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

In [292]:
# Define a model:
class Network(nn.Module):
    """
    Stacked LSTM followed by a linear layer that maps every hidden state
    to one score per vocabulary entry.

    Parameters
    ----------
    vocab : sized collection
        Only ``len(vocab)`` is used here: it sets both the LSTM input width
        and the number of output scores. The object itself is kept on
        ``self.vocab``.
    n_hidden : int
        Width of the LSTM hidden state.
    n_layers : int
        Number of stacked LSTM layers.
    do : float
        Dropout rate applied between stacked LSTM layers.
    """

    def __init__(self, vocab, n_hidden, n_layers, do=0.5):
        super().__init__()

        self.vocab = vocab
        self.n_hidden = n_hidden

        # nn.LSTM applies dropout after every layer except the last one, so a
        # single-layer network has nothing to drop; handing it a non-zero rate
        # changes nothing and only raises a UserWarning.
        lstm_dropout = do if n_layers > 1 else 0

        self.lstm = nn.LSTM(len(vocab), n_hidden, n_layers,
                            batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(n_hidden, len(vocab))

    def forward(self, x, hs=None):
        """
        Run a batch through the LSTM and the output layer.

        x  : tensor of shape (batch_size, seq_len, len(vocab)).
        hs : optional LSTM state from a previous call; None starts from the
             LSTM's default initial state.

        Returns ``(out, hs)`` where ``out`` has shape
        (batch_size * seq_len, len(vocab)) and ``hs`` is the new LSTM state.
        """
        x, hs = self.lstm(x, hs)          # -> (batch_size, seq_len, n_hidden)
        x = x.reshape(-1, self.n_hidden)  # -> (batch_size * seq_len, n_hidden)
        out = self.fc(x)                  # -> (batch_size * seq_len, vocab_size)

        return out, hs

In [293]:
# define a batching method:
def get_batches(data, n_seq, seq_len):
    """
    Yield (inputs, targets) mini-batches cut from `data`.

    `data` has shape (n_samples, n_features). Each yielded array has shape
    (n_seq, seq_len, n_features). Targets are the inputs shifted one sample
    ahead, with the very first sample wrapped around to follow the last one.
    Trailing samples that do not fill a complete batch are dropped.
    """
    width = data.shape[1]
    per_batch = n_seq * seq_len
    usable = (len(data) // per_batch) * per_batch

    # Next-sample targets: rows 1..end followed by row 0.
    shifted = np.concatenate((data[1:], data[0][np.newaxis]), axis=0)

    # Lay the usable samples out as n_seq long parallel streams.
    source_rows = data[:usable].reshape(n_seq, -1, width)
    target_rows = shifted[:usable].reshape(n_seq, -1, width)

    # Walk along the streams one seq_len-wide window at a time.
    for start in range(0, source_rows.shape[1], seq_len):
        window = slice(start, start + seq_len)
        yield source_rows[:, window], target_rows[:, window]
        

In [1]:
def train(model, data, batch_size, seq_len, epochs, lr=0.01, clip=5, valid=None):
    """
    Train `model` to predict the next sample of `data` and plot the loss.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(x, hs)`` and expected to return ``(out, hs)`` with
        ``out`` of shape (batch_size * seq_len, len(model.vocab)) and ``hs``
        an iterable of state tensors. ``model.vocab`` must be an array that
        can be reshaped to a column; it defines the one-hot categories.
    data : array of shape (n_samples, 1)
        Label-encoded samples drawn from ``model.vocab``.
    batch_size : int
        Number of parallel sequences per mini-batch.
    seq_len : int
        Number of samples per sequence.
    epochs : int
        Number of passes over `data`.
    lr : float
        Learning rate for Adam.
    clip : float
        Maximum gradient norm passed to ``clip_grad_norm_``.
    valid :
        Accepted for interface compatibility; not used by this function.

    Prints the mean batch loss roughly ten times over the run (every epoch
    when ``epochs < 10``) and plots the per-epoch mean loss. Returns None.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.train()  # make sure dropout is active while fitting

    opt = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # in 1.4; fall back to the old keyword on older installations.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)
    encoder.fit(model.vocab.reshape(-1, 1))
    data = encoder.transform(data)

    # `epochs // 10` is 0 for fewer than ten epochs, which would make the
    # modulo below divide by zero.
    log_every = max(1, epochs // 10)

    train_loss = []

    for e in range(epochs):
        hs = None  # start every epoch from a fresh LSTM state
        t_loss = 0.0
        n_batches = 0

        for x, y in get_batches(data, batch_size, seq_len):
            # CrossEntropyLoss wants class indices, not one-hot rows. The
            # index is the position of the 1 in each row, i.e. the same
            # position the model sees on its input side. This is computed on
            # the NumPy batch, before anything is moved to the device.
            y = torch.tensor(y.argmax(axis=-1).reshape(-1),
                             dtype=torch.long, device=device)
            x = torch.tensor(x, dtype=torch.float32, device=device)

            # Gradients must be cleared for every step, otherwise each update
            # also contains the gradients of all earlier batches of the epoch.
            opt.zero_grad()

            out, hs = model(x, hs)
            # Keep the state values but cut the graph so back-propagation
            # does not reach into previous batches.
            hs = tuple(h.detach() for h in hs)

            loss = criterion(out, y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            opt.step()

            t_loss += loss.item()
            n_batches += 1

        # Mean loss per batch; max() keeps this defined when `data` is too
        # short to produce a single batch.
        train_loss.append(t_loss / max(n_batches, 1))

        if e % log_every == 0:
            print(f'Training Loss: {train_loss[-1]}')

    plt.plot(train_loss)

In [349]:
# Get and pre-process data: read the whole corpus into one string.
# (No encoding is given, so the platform default is used.)
with open('data/texts/anna.txt') as corpus_file:
    text = corpus_file.read()

# Character <-> integer lookup tables over the sorted set of characters.
chars = sorted(set(text))
int2char = dict(enumerate(chars))
char2int = {ch: idx for idx, ch in enumerate(chars)}

# Label-encode the text and the vocabulary as column vectors,
# i.e. shape (n_samples, n_features) with a single feature.
encoded = np.array(list(map(char2int.__getitem__, text))).reshape(-1, 1)
vocab = np.array(list(char2int.values())).reshape(-1, 1)

In [350]:
# LSTM size: width of the hidden state and number of stacked layers.
n_hidden = 512
n_layers = 2

# Build the network over the label-encoded vocabulary; the bare name on the
# last line displays its layer summary as the cell output.
model = Network(vocab, n_hidden=n_hidden, n_layers=n_layers)
model

Network(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)

In [None]:
# Training hyperparameters.
batch_size = 128  # sequences per mini-batch
seq_len = 100     # samples per sequence
epochs = 3        # passes over the data
lr = 0.01         # learning rate

train(model, encoded, batch_size=batch_size, seq_len=seq_len, epochs=epochs, lr=lr)