In [26]:
import torch
from torch import nn

import numpy as np

In [27]:
text = ['hey bạn khỏe không',
       'tôi khỏe vừa vừa',
       'chúc một ngày tốt lành',
        'Mặt hoa da phấn.',
        'Cưng như cưng trứng, hứng như hứng hoa.',
        'Đứng núi này trông núi nọ.',
        'Mưa to gió lớn.',
        'Ngày lành tháng tốt.',
       ]
#extract unique characters
chars = set(''.join(text))
#dict map integers to characters
int2char = dict(enumerate(chars))
#dict map characters to integers
char2int = {char: ind for ind, char in int2char.items()}

 In order to used batches to train on our data, we'll need to ensure that each sequence within the input data is of equal size. So we need padding sentences

In [28]:
#Finding the length of the longest string in our data
maxlen = len(max(text, key=len))

for i in range(len(text)):
  while (len(text[i])<maxlen):
      text[i] += ' '

In [29]:
# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []

for i in range(len(text)):
    # Remove last character for input sequence
  input_seq.append(text[i][:-1])
    
    # Remove first character for target sequence
  target_seq.append(text[i][1:])
  print("Input Sequence: {}\nTarget Sequence: {}".format(input_seq[i], target_seq[i]))

Input Sequence: hey bạn khỏe không                    
Target Sequence: ey bạn khỏe không                     
Input Sequence: tôi khỏe vừa vừa                      
Target Sequence: ôi khỏe vừa vừa                       
Input Sequence: chúc một ngày tốt lành                
Target Sequence: húc một ngày tốt lành                 
Input Sequence: Mặt hoa da phấn.                      
Target Sequence: ặt hoa da phấn.                       
Input Sequence: Cưng như cưng trứng, hứng như hứng hoa
Target Sequence: ưng như cưng trứng, hứng như hứng hoa.
Input Sequence: Đứng núi này trông núi nọ.            
Target Sequence: ứng núi này trông núi nọ.             
Input Sequence: Mưa to gió lớn.                       
Target Sequence: ưa to gió lớn.                        
Input Sequence: Ngày lành tháng tốt.                  
Target Sequence: gày lành tháng tốt.                   


In [30]:
for i in range(len(text)):
    input_seq[i] = [char2int[character] for character in input_seq[i]]
    target_seq[i] = [char2int[character] for character in target_seq[i]]

In [31]:
print('Char2Int: {}'.format(input_seq[5]))
print('Int2Char: {}'.format(''.join([int2char[idx] for idx in input_seq[5]])))

Char2Int: [3, 39, 2, 10, 14, 2, 0, 7, 14, 2, 21, 1, 14, 19, 25, 4, 2, 10, 14, 2, 0, 7, 14, 2, 33, 26, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
Int2Char: Đứng núi này trông núi nọ.            


In [32]:
dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features

input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)

In [33]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

In [34]:
device = torch.device("cpu")

In [35]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)   
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):
        
        batch_size = x.size(0)

        # Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        
        # Reshaping the outputs such that it can be fit into the fully connected layer
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [36]:
print('Current Device: {}'.format(device))
# Instantiate the model with hyperparameters
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
# We'll also set the model to the device that we defined earlier (default is CPU)
model.to(device)

# Define hyperparameters
n_epochs = 2000
lr=0.01

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Training Run
for epoch in range(1, n_epochs + 1):
    optimizer.zero_grad() # Clears existing gradients from previous epoch
    input_seq.to(device)
    output, hidden = model(input_seq)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly
    
    if epoch%100 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs), end=' ')
        print("Loss: {:.4f}".format(loss.item()))

Current Device: cpu
Epoch: 100/2000............. Loss: 0.8884
Epoch: 200/2000............. Loss: 0.2513
Epoch: 300/2000............. Loss: 0.1038
Epoch: 400/2000............. Loss: 0.0539
Epoch: 500/2000............. Loss: 0.0508
Epoch: 600/2000............. Loss: 0.0313
Epoch: 700/2000............. Loss: 0.0211
Epoch: 800/2000............. Loss: 0.0164
Epoch: 900/2000............. Loss: 0.0136
Epoch: 1000/2000............. Loss: 0.0118
Epoch: 1100/2000............. Loss: 0.0105
Epoch: 1200/2000............. Loss: 0.0095
Epoch: 1300/2000............. Loss: 0.0087
Epoch: 1400/2000............. Loss: 0.0081
Epoch: 1500/2000............. Loss: 0.0077
Epoch: 1600/2000............. Loss: 0.0073
Epoch: 1700/2000............. Loss: 0.0070
Epoch: 1800/2000............. Loss: 0.5326
Epoch: 1900/2000............. Loss: 0.0974
Epoch: 2000/2000............. Loss: 0.0591


In [37]:
# This function takes in the model and character as arguments and returns the next character prediction and hidden state
def predict(model, character):
    # One-hot encoding our input to fit into the model
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    character = torch.from_numpy(character)
    character.to(device)
    
    out, hidden = model(character)

    prob = nn.functional.softmax(out[-1], dim=0).data
    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden

In [38]:
# This function takes the desired output length and input characters as arguments, returning the produced sentence
def sample(model, out_len, start='hey'):
    model.eval() # eval mode
    start = start.lower()
    # First off, run through the starting characters
    chars = [ch for ch in start]
    size = out_len - len(chars)
    # Now pass in the previous characters and get a new one
    for ii in range(size):
        char, h = predict(model, chars)
        chars.append(char)

    return ''.join(chars)

In [43]:
sample(model, 40, start='cưng')

'cưng như cưng trứng, hứng như hứng hoa. '