# Project - Gating Mechanism

##### Lili LU <br> Xiqing Chang <br> Xinwen XU <br>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.nn import init
from torch import Tensor

import matplotlib.pyplot as plt
import math
import random
import numpy as np


# Data processing

## Read data

In [None]:
# read the file
def read_file(path):
    """Read a whitespace-tokenised corpus, one sentence per line.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        whitespace-separated tokens found on that line.
    """
    with open(path) as corpus:
        # str.split() without arguments ignores surrounding whitespace and
        # yields [] for a blank line, so filtering on the token list is
        # enough to skip empty lines.
        tokenised = (raw_line.split() for raw_line in corpus)
        return [tokens for tokens in tokenised if tokens]

In [None]:
# read the train data, dev data and test data
# Load the three PTB splits in one go: train, validation, test.
train_data, dev_data, test_data = (
    read_file(split_path)
    for split_path in ("./ptb.train.txt", "./ptb.valid.txt", "./ptb.test.txt")
)

In [None]:
# create a set of all the words in the train data
# Vocabulary = every distinct token of the training sentences ...
train_words = {word for sentence in train_data for word in sentence}

# ... plus the two sentence-boundary markers.
train_words |= {"<bos>", "<eos>"}

## Illustration of data set

In [None]:
# training set
# Histogram of training sentence lengths over five buckets:
# <=10, 11-20, 21-30, 31-40 and >40 tokens.
len_dist = [0] * 5
for sentence in train_data:
    # The number of upper bounds a length exceeds is exactly its bucket index.
    len_dist[sum(len(sentence) > bound for bound in (10, 20, 30, 40))] += 1

bar_labels = [ "[0, 10]", "[10, 20]", "[20, 30]", "[30, 40]", "[40, :]"]
y_pos = np.arange(len(len_dist))

bars = plt.bar(y_pos, len_dist)
plt.xticks(y_pos, bar_labels)

# Total is loop-invariant; `or 1` keeps the share at 0% instead of raising
# ZeroDivisionError when the split is empty.
total = sum(len_dist) or 1
for b, d in zip(bars, len_dist):
    # Write each bucket's share of the data set just above its bar.
    plt.text(b.get_x() + 0.15, b.get_height() + 100, '{0:.2%}'.format(d / total))

# Axis labels are set once (the original set them both before and after the
# bars were drawn).
plt.xlabel('length of sentence')
plt.ylabel('number of sentence')

In [None]:
# valid set
# Sentence-length histogram of the validation split, five buckets:
# <=10, 11-20, 21-30, 31-40 and >40 tokens.
len_dist = [0] * 5
for sentence in dev_data:
    n_tokens = len(sentence)
    # bucket index = number of upper bounds the length exceeds
    bucket = sum(n_tokens > bound for bound in (10, 20, 30, 40))
    len_dist[bucket] += 1

bar_labels = [ "[0, 10]", "[10, 20]", "[20, 30]", "[30, 40]", "[40, :]"]
y_pos = np.arange(len(len_dist))

bars = plt.bar(y_pos, len_dist)
plt.xticks(y_pos, bar_labels)

# annotate every bar with its share of the split
n_sentences = sum(len_dist)
for bar, count in zip(bars, len_dist):
    share = '{0:.2%}'.format(count / n_sentences)
    plt.text(bar.get_x() + 0.15, bar.get_height() + 10, share)

plt.xlabel('length of sentence')
plt.ylabel('number of sentence')

In [None]:
# test set
# Sentence-length histogram of the test split, five buckets:
# <=10, 11-20, 21-30, 31-40 and >40 tokens.
len_dist = [0] * 5
for sentence in test_data:
    n_tokens = len(sentence)
    # bucket index = number of upper bounds the length exceeds
    bucket = sum(n_tokens > bound for bound in (10, 20, 30, 40))
    len_dist[bucket] += 1

bar_labels = [ "[0, 10]", "[10, 20]", "[20, 30]", "[30, 40]", "[40, :]"]
y_pos = np.arange(len(len_dist))

bars = plt.bar(y_pos, len_dist)
plt.xticks(y_pos, bar_labels)

# annotate every bar with its share of the split
n_sentences = sum(len_dist)
for bar, count in zip(bars, len_dist):
    share = '{0:.2%}'.format(count / n_sentences)
    plt.text(bar.get_x() + 0.15, bar.get_height() + 10, share)

plt.xlabel('length of sentence')
plt.ylabel('number of sentence')

## Create word dictionary

In [None]:
class create_word_dic():
    """Two-way lookup between words and integer ids.

    Ids are assigned in the iteration order of ``words``.
    """

    def __init__(self, words):
        # id -> word: a plain list indexed by id
        self.dict_id_word = list(words)
        # word -> id: for a repeated word the last position wins
        self.dict_word_id = {
            word: idx for idx, word in enumerate(self.dict_id_word)
        }

    def word_to_id(self, word):
        """Return the id of ``word``; an unknown word raises KeyError."""
        return self.dict_word_id[word]

    def id_to_word(self, idx):
        """Return the word stored at position ``idx``."""
        return self.dict_id_word[idx]

In [None]:
# `train_words` is a set of strings, whose iteration order changes between
# interpreter runs (string hash randomisation). Sorting makes the word -> id
# mapping reproducible across sessions.
word_dict = create_word_dic(sorted(train_words))

In [None]:
# Spot check: show the integer id assigned to the '<eos>' marker.
print(word_dict.word_to_id('<eos>'))

In [None]:
# Spot check: show the word stored at id 7492.
# NOTE(review): assumes the vocabulary holds more than 7492 entries - confirm.
print(word_dict.id_to_word(7492))

## Convert to tensors

In [None]:
# transfer the data into tensor type
def sentence_to_tensor(word_dict, sentence):
    """Encode one tokenised sentence as a 1-D tensor of word ids.

    The ids of '<bos>' and '<eos>' are placed before and after the ids of
    the sentence's own words.

    Args:
        word_dict: Object exposing ``word_to_id(word)``.
        sentence: Iterable of word strings.

    Returns:
        ``torch.Tensor`` holding ``len(sentence) + 2`` ids.
    """
    lookup = word_dict.word_to_id
    body = list(map(lookup, sentence))
    return torch.tensor([lookup('<bos>'), *body, lookup('<eos>')])

def data_to_tensor(word_dict, data):
    """Encode every sentence of ``data`` with ``sentence_to_tensor``.

    Returns:
        A list of id tensors, in the same order as ``data``.
    """
    tensors = []
    for sentence in data:
        tensors.append(sentence_to_tensor(word_dict, sentence))
    return tensors

In [None]:
# Encode the three splits as lists of id tensors.
train_tensor, valid_tensor, test_tensor = (
    data_to_tensor(word_dict, split)
    for split in (train_data, dev_data, test_data)
)
# number of training sentences
print(len(train_tensor))

# This experiment runs both training and evaluation in mini-batches.

# Naive LSTM

## Naive LSTM cell

In [None]:
class Naive_LSTM_cell(nn.Module):
    """Single-layer LSTM written with explicit gate parameters.

    Inputs are batch-first, ``(batch, seq, input_size)``. Internally the
    state is kept transposed, ``(hidden_size, batch)``, so every gate is
    ``W @ x + U @ h + b`` with a ``(hidden_size, 1)`` bias broadcast over the
    batch.
    """
    def __init__(self, input_size, hidden_size):
        super(Naive_LSTM_cell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # input gate
        self.W_i = Parameter(Tensor(hidden_size, input_size))
        self.U_i = Parameter(Tensor(hidden_size, hidden_size))
        self.b_i = Parameter(Tensor(hidden_size, 1))

        # forget gate
        self.W_f = Parameter(Tensor(hidden_size, input_size))
        self.U_f = Parameter(Tensor(hidden_size, hidden_size))
        self.b_f = Parameter(Tensor(hidden_size, 1))

        # output gate
        self.W_o = Parameter(Tensor(hidden_size, input_size))
        self.U_o = Parameter(Tensor(hidden_size, hidden_size))
        self.b_o = Parameter(Tensor(hidden_size, 1))

        # cell candidate
        self.W_g = Parameter(Tensor(hidden_size, input_size))
        self.U_g = Parameter(Tensor(hidden_size, hidden_size))
        self.b_g = Parameter(Tensor(hidden_size, 1))

        # the tensors above are allocated uninitialised; fill them now
        self.initialize_weights()

    def initialize_weights(self):
        """Draw every parameter (biases included) from Xavier-normal."""
        for weight in self.parameters():
            init.xavier_normal_(weight)

    def forward(self, inputs, state = None):
        """Run the LSTM over a batch of sequences.

        Args:
            inputs: ``(batch, seq, input_size)`` tensor.
            state: Optional ``(h, c)`` pair, each ``(1, batch, hidden_size)``.
                When omitted, both start at zero.

        Returns:
            ``(hidden_seq, (h, c))`` where ``hidden_seq`` is
            ``(batch, seq, hidden_size)`` and ``h`` / ``c`` are the final
            states, each ``(1, batch, hidden_size)``.
        """
        batch_size, seq_size, _ = inputs.size()

        if state is None:
            # new_zeros inherits device AND dtype from the inputs, so the
            # cell also works after .double() / .half() and never allocates
            # on the CPU first.
            h_t = inputs.new_zeros(self.hidden_size, batch_size)
            c_t = inputs.new_zeros(self.hidden_size, batch_size)
        else:
            (h, c) = state
            h_t = h.squeeze(0).t()
            c_t = c.squeeze(0).t()

        hidden_seq = []
        for t in range(seq_size):
            # (input_size, batch) slice for time step t
            x_t = inputs[:, t, :].t()

            # input gate
            i_t = torch.sigmoid(self.W_i @ x_t + self.U_i @ h_t + self.b_i)
            # forget gate
            f_t = torch.sigmoid(self.W_f @ x_t + self.U_f @ h_t + self.b_f)
            # cell candidate
            g_t = torch.tanh(self.W_g @ x_t + self.U_g @ h_t + self.b_g)
            # output gate
            o_t = torch.sigmoid(self.W_o @ x_t + self.U_o @ h_t + self.b_o)

            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)

            hidden_seq.append(h_t.t())

        if hidden_seq:
            # stack along a new time axis -> (batch, seq, hidden_size)
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # zero-length sequence: nothing to stack, return an empty output
            hidden_seq = inputs.new_zeros(batch_size, 0, self.hidden_size)

        return hidden_seq, (h_t.t().unsqueeze(0), c_t.t().unsqueeze(0))

## Naive LSTM model

In [None]:
# without batch
class Naive_LSTM(nn.Module):
    """Language model for one sentence at a time: embedding -> LSTM -> logits.

    ``n_lstm_layers`` is accepted but not used by this class.
    """

    def __init__(self, total_num_words, emb_size=100, lstm_hidden_size=200, n_lstm_layers=1):
        super().__init__()
        self.embeddings = nn.Embedding(total_num_words, emb_size)
        self.lstm = Naive_LSTM_cell(input_size=emb_size, hidden_size=lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, inp):
        """Return one row of vocabulary logits per id in the 1-D tensor ``inp``."""
        # add a leading batch axis of size one for the LSTM cell
        batch_of_one = self.embeddings(inp).unsqueeze(0)
        hidden_states, _ = self.lstm(batch_of_one)
        # project to the vocabulary and drop the batch axis again
        return self.output_proj(hidden_states).squeeze(0)


In [None]:

# with batch
class Naive_LSTM_batch(nn.Module):
    """Batched language model: embedding -> LSTM -> vocabulary logits.

    The embedding table has one extra row, at index ``total_num_words``,
    reserved for padding. ``n_lstm_layers`` is accepted but not used.
    """
    def __init__(self, total_num_words, emb_size=100, lstm_hidden_size=200, n_lstm_layers=1):
        super().__init__()
        # The padding id is the first index past the vocabulary. It is taken
        # from the constructor argument (not the notebook-global
        # `train_words`) so the model stays valid for any vocabulary size.
        self.embeddings = nn.Embedding(total_num_words+1, emb_size, padding_idx = total_num_words)
        self.lstm = Naive_LSTM_cell(input_size = emb_size, hidden_size = lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, input_tensor):
        """Map a ``(batch, seq)`` tensor of ids to ``(batch, seq, total_num_words)`` logits."""
        embedding = self.embeddings(input_tensor)
        hidden, _ = self.lstm(embedding)
        output_logics = self.output_proj(hidden)

        return output_logics

## Training model

### without batch

In [None]:
# naive lstm training model without batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

naive_lstm = Naive_LSTM(len(train_words)).to(device)

# Adam, with the learning rate multiplied by 0.9 every 5 epochs
optimizer = torch.optim.Adam(naive_lstm.parameters(), lr = 1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# per-epoch history
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []

n_epochs = 10
for epoch in range(n_epochs):
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0

    # ---- training: one optimiser step per sentence ----
    naive_lstm.train()
    for sentence in train_tensor:
        optimizer.zero_grad()
        # predict token t+1 from tokens up to t
        inputs, targets = sentence[:-1].to(device), sentence[1:].to(device)
        logits = naive_lstm(inputs)
        sentence_loss = F.cross_entropy(logits, targets, reduction="sum")

        # the summed loss is logged, the per-token mean is optimised
        train_epoch_loss += sentence_loss.item()
        loss = sentence_loss / (len(sentence)-1)
        loss.backward()
        optimizer.step()

    # learning-rate schedule advances once per epoch
    scheduler.step()

    # ---- validation ----
    naive_lstm.eval()
    n_valid_perplexity = []

    with torch.no_grad():
        for sentence in valid_tensor:
            inputs, targets = sentence[:-1].to(device), sentence[1:].to(device)
            logits = naive_lstm(inputs)

            valid_epoch_loss += F.cross_entropy(logits, targets, reduction="sum").item()

            # per-sentence perplexity = exp(mean token cross-entropy)
            mean_nll = F.cross_entropy(logits, targets, reduction='mean')
            n_valid_perplexity.append(torch.exp(mean_nll).item())

    # ---- epoch summary (losses are averaged per sentence) ----
    mean_train_loss = train_epoch_loss / len(train_tensor)
    mean_valid_loss = valid_epoch_loss / len(valid_tensor)
    mean_perplexity = np.sum(n_valid_perplexity) / len(n_valid_perplexity)

    all_train_epoch_losses.append(mean_train_loss)
    all_valid_epoch_losses.append(mean_valid_loss)
    all_epoch_perplexity.append(mean_perplexity)

    print(
        epoch, ":\t / loss for train data:\t",
        mean_train_loss,
        "\t / loss for validation data:\t",
        mean_valid_loss,
        "\t / perplexity:\t",
        mean_perplexity,
        flush=True
    )


### with batch

In [None]:
# naive lstm training model with batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _collate_batch(batch_data, padding_idx, device):
    """Pad a list of 1-D id tensors for next-token prediction.

    Returns ``(inputs, labels, mask)``, each ``(len(batch_data), max_len - 1)``:
    inputs hold ``sentence[:-1]`` padded with ``padding_idx``, labels hold
    ``sentence[1:]`` padded with 0, and mask is True on real positions.
    """
    max_len = max(len(sentence) for sentence in batch_data)
    shape = (len(batch_data), max_len - 1)
    # allocate directly on the target device
    inputs = torch.full(shape, padding_idx, dtype=torch.int64, device=device)
    labels = torch.zeros(shape, dtype=torch.int64, device=device)
    mask = torch.zeros(shape, dtype=torch.bool, device=device)
    for row, sentence in enumerate(batch_data):
        n_steps = len(sentence) - 1
        inputs[row, :n_steps] = sentence[:-1]
        labels[row, :n_steps] = sentence[1:]
        mask[row, :n_steps] = True
    return inputs, labels, mask


naive_lstm_batch = Naive_LSTM_batch(len(train_words)).to(device)

# optimizer
optimizer = torch.optim.Adam(naive_lstm_batch.parameters(), lr = 2e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# epochs and batchsize
n_epochs = 20
batch_size = 10

# per-epoch history
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []
all_epoch_accuracies = []

for epoch in range(n_epochs):
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0
    naive_lstm_batch.train()

    # ---- training ----
    for batch in range(0, len(train_tensor), batch_size):
        batch_data = train_tensor[batch:batch+batch_size]
        optimizer.zero_grad()

        batched_input, batch_labels, batched_mask = _collate_batch(
            batch_data, naive_lstm_batch.embeddings.padding_idx, device
        )

        # keep only the logits / labels of real (non-padded) positions
        batch_logits = naive_lstm_batch(batched_input)[batched_mask]
        batch_labels = batch_labels[batched_mask]

        batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
        # one label per real target token, i.e. sum(len(s) - 1) over the batch
        batch_norm = batch_labels.numel()

        train_epoch_loss += batch_loss.item()
        loss = batch_loss / batch_norm
        loss.backward()
        optimizer.step()

    # Adjust the learning rate
    scheduler.step()

    # ---- validation ----
    naive_lstm_batch.eval()
    n_valid_perplexity = []
    accuracy = []

    with torch.no_grad():
        for batch in range(0, len(valid_tensor), batch_size):
            batch_data = valid_tensor[batch:batch+batch_size]

            batched_input, batch_labels, batched_mask = _collate_batch(
                batch_data, naive_lstm_batch.embeddings.padding_idx, device
            )

            batch_logits = naive_lstm_batch(batched_input)[batched_mask]
            batch_labels = batch_labels[batched_mask]

            batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
            valid_epoch_loss += batch_loss.item()

            # Per-batch perplexity = exp(mean token cross-entropy), derived
            # from the summed loss instead of a second cross-entropy pass.
            # NOTE(review): the epoch value below is the mean of per-batch
            # perplexities, not exp(total NLL / total tokens); kept as is so
            # curves stay comparable across the notebook's experiments.
            n_valid_perplexity.append(torch.exp(batch_loss / batch_labels.numel()).item())

            # fraction of target tokens whose arg-max prediction is correct
            accuracy.append((batch_labels == batch_logits.argmax(dim=1)).sum().item() / len(batch_labels))

    # ---- epoch summary (losses are averaged per sentence) ----
    mean_train_loss = train_epoch_loss / len(train_tensor)
    mean_valid_loss = valid_epoch_loss / len(valid_tensor)
    mean_perplexity = np.sum(n_valid_perplexity) / (len(n_valid_perplexity))
    mean_accuracy = np.sum(accuracy) / len(accuracy)

    all_train_epoch_losses.append(mean_train_loss)
    all_valid_epoch_losses.append(mean_valid_loss)
    all_epoch_perplexity.append(mean_perplexity)
    all_epoch_accuracies.append(mean_accuracy)

    print(
        epoch, ":\t / loss for train data:\t",
        mean_train_loss,
        "\t / loss for validation data:\t",
        mean_valid_loss,
        "\t / perplexity:\t",
        mean_perplexity,
        "\t / accuracy:\t",
        mean_accuracy,
        flush = True
    )

In [None]:
# save data
# Freeze the per-epoch histories of the naive LSTM as tensors ...
naive_perplexity = torch.tensor(all_epoch_perplexity)
naive_accuracies = torch.tensor(all_epoch_accuracies)
naive_train_losses = torch.tensor(all_train_epoch_losses)
naive_valid_losses = torch.tensor(all_valid_epoch_losses)

# ... and write each one to its own file.
for metric, target in (
    (naive_perplexity, "./naive_perplexity.pt"),
    (naive_accuracies, "./naive_accuracies.pt"),
    (naive_train_losses, "./naive_train_losses.pt"),
    (naive_valid_losses, "./naive_valid_losses.pt"),
):
    torch.save(metric, target)

## Plot

In [None]:
y1 = all_epoch_perplexity
y2 = all_train_epoch_losses
y3 = all_valid_epoch_losses
y4 = all_epoch_accuracies

# One x value per recorded epoch, taken from each series itself. The global
# `n_epochs` is reassigned by every training cell, so it can disagree with
# the list lengths and make plt.plot fail on mismatched sizes.
x1 = range(len(y1))
x2 = range(len(y2))
x3 = range(len(y3))
x4 = range(len(y4))

# validation perplexity
plt.figure()
plt.plot(x1, y1)
plt.title('Valid perplexity vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Valid perplexity')

# validation accuracy
plt.figure()
plt.plot(x4, y4)
plt.title('Accuracy vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Accuracy')

# train and validation loss on shared axes
plt.figure()
plt.plot(x2, y2,  label='Train loss')
plt.plot(x3, y3, label='Valid loss')
plt.title('Train/Valid loss vs. epoches')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.show()


# Peephole LSTM 

## Peephole LSTM cell

In [None]:
class Peephole_LSTM_cell(nn.Module):
    """Single-layer LSTM with peephole connections.

    Inputs are batch-first, ``(batch, seq, input_size)``; the state is kept
    transposed, ``(hidden_size, batch)``. Each gate additionally sees the cell
    state through element-wise (diagonal) peephole weights ``P_*``: the input
    and forget gates peek at the previous cell state, the output gate at the
    freshly updated one.
    """

    def __init__(self, input_size, hidden_size):
        super(Peephole_LSTM_cell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # input gate
        self.W_i = Parameter(Tensor(hidden_size, input_size))
        self.U_i = Parameter(Tensor(hidden_size, hidden_size))
        self.P_i = Parameter(Tensor(1, hidden_size))
        self.b_i = Parameter(Tensor(hidden_size, 1))

        # forget gate
        self.W_f = Parameter(Tensor(hidden_size, input_size))
        self.U_f = Parameter(Tensor(hidden_size, hidden_size))
        self.P_f = Parameter(Tensor(1, hidden_size))
        self.b_f = Parameter(Tensor(hidden_size, 1))

        # output gate
        self.W_o = Parameter(Tensor(hidden_size, input_size))
        self.U_o = Parameter(Tensor(hidden_size, hidden_size))
        self.P_o = Parameter(Tensor(1, hidden_size))
        self.b_o = Parameter(Tensor(hidden_size, 1))

        # cell candidate
        self.W_g = Parameter(Tensor(hidden_size, input_size))
        self.U_g = Parameter(Tensor(hidden_size, hidden_size))
        self.b_g = Parameter(Tensor(hidden_size, 1))

        # the tensors above are allocated uninitialised; fill them now
        self.initialize_weights()

    def initialize_weights(self):
        """Draw every parameter (biases and peepholes included) from Xavier-normal."""
        for weight in self.parameters():
            init.xavier_normal_(weight)

    def forward(self, inputs, state = None):
        """Run the peephole LSTM over a batch of sequences.

        Args:
            inputs: ``(batch, seq, input_size)`` tensor.
            state: Optional ``(h, c)`` pair, each ``(1, batch, hidden_size)``.
                When omitted, both start at zero.

        Returns:
            ``(hidden_seq, (h, c))`` where ``hidden_seq`` is
            ``(batch, seq, hidden_size)`` and ``h`` / ``c`` are the final
            states, each ``(1, batch, hidden_size)``.
        """
        batch_size, seq_size, _ = inputs.size()

        if state is None:
            # new_zeros inherits device and dtype from the inputs
            h_t = inputs.new_zeros(self.hidden_size, batch_size)
            c_t = inputs.new_zeros(self.hidden_size, batch_size)
        else:
            (h, c) = state
            h_t = h.squeeze(0).t()
            c_t = c.squeeze(0).t()

        # Peephole weights are stored as (1, hidden_size) rows. As
        # (hidden_size, 1) columns they multiply the cell state element-wise,
        # one weight per unit. `P @ c_t` would instead collapse the whole
        # cell state to a single scalar per sample shared by every unit.
        p_i, p_f, p_o = self.P_i.t(), self.P_f.t(), self.P_o.t()

        hidden_seq = []
        for t in range(seq_size):
            # (input_size, batch) slice for time step t
            x_t = inputs[:, t, :].t()

            # input gate, peeking at the previous cell state
            i_t = torch.sigmoid(self.W_i @ x_t + self.U_i @ h_t + p_i * c_t + self.b_i)
            # forget gate, peeking at the previous cell state
            f_t = torch.sigmoid(self.W_f @ x_t + self.U_f @ h_t + p_f * c_t + self.b_f)
            # cell candidate
            g_t = torch.tanh(self.W_g @ x_t + self.U_g @ h_t + self.b_g)

            c_t = f_t * c_t + i_t * g_t

            # output gate, peeking at the updated cell state
            o_t = torch.sigmoid(self.W_o @ x_t + self.U_o @ h_t + p_o * c_t + self.b_o)
            h_t = o_t * torch.tanh(c_t)

            hidden_seq.append(h_t.t())

        if hidden_seq:
            # stack along a new time axis -> (batch, seq, hidden_size)
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # zero-length sequence: nothing to stack, return an empty output
            hidden_seq = inputs.new_zeros(batch_size, 0, self.hidden_size)

        return hidden_seq, (h_t.t().unsqueeze(0), c_t.t().unsqueeze(0))

## Peephole LSTM model

In [None]:
# without batch
class Peephole_LSTM(nn.Module):
    """Language model for one sentence at a time: embedding -> peephole LSTM -> logits.

    ``n_lstm_layers`` is accepted but not used by this class.
    """

    def __init__(self, total_num_words, emb_size=100, lstm_hidden_size=200, n_lstm_layers=1):
        super().__init__()
        self.embeddings = nn.Embedding(total_num_words, emb_size)
        self.lstm = Peephole_LSTM_cell(input_size=emb_size, hidden_size=lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, input_tensor):
        """Return one row of vocabulary logits per id in the 1-D ``input_tensor``."""
        # add a leading batch axis of size one for the LSTM cell
        batch_of_one = self.embeddings(input_tensor).unsqueeze(0)
        hidden_states, _ = self.lstm(batch_of_one)
        # project to the vocabulary and drop the batch axis again
        return self.output_proj(hidden_states).squeeze(0)


In [None]:
# with batch
class Peephole_LSTM_batch(nn.Module):
    """Batched language model: embedding -> peephole LSTM -> vocabulary logits.

    The embedding table has one extra row, at index ``total_num_words``,
    reserved for padding. ``n_lstm_layers`` is accepted but not used.
    """
    def __init__(self, total_num_words, emb_size=100, lstm_hidden_size=200, n_lstm_layers=1):
        super().__init__()
        # The padding id is the first index past the vocabulary. It is taken
        # from the constructor argument (not the notebook-global
        # `train_words`) so the model stays valid for any vocabulary size.
        self.embeddings = nn.Embedding(total_num_words+1, emb_size, padding_idx = total_num_words)
        self.lstm = Peephole_LSTM_cell(input_size = emb_size, hidden_size = lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, input_tensor):
        """Map a ``(batch, seq)`` tensor of ids to ``(batch, seq, total_num_words)`` logits."""
        embedding = self.embeddings(input_tensor)
        hidden, _ = self.lstm(embedding)
        output_logics = self.output_proj(hidden)

        return output_logics

## Training model

### without batch

In [None]:
# peephole lstm training model without batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

peephole_lstm = Peephole_LSTM(len(train_words)).to(device)

# Adam, with the learning rate multiplied by 0.9 every 5 epochs
optimizer = torch.optim.Adam(peephole_lstm.parameters(), lr = 1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# Per-epoch history. These must be the very lists appended to below: the
# original initialised `all_epoch_train_losses` / `all_epoch_valid_losses`
# but appended to `all_train_epoch_losses` / `all_valid_epoch_losses`, i.e.
# to whatever an earlier cell had left behind (or a NameError on a fresh
# kernel).
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []

n_epochs = 10
for epoch in range(n_epochs):
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0

    # ---- training: one optimiser step per sentence ----
    peephole_lstm.train()
    for sentence in train_tensor:
        optimizer.zero_grad() ## zero the parameters of gradients
        logits = peephole_lstm(sentence[:-1].to(device))
        sentence_loss = F.cross_entropy(logits, sentence[1:].to(device), reduction="sum")

        # the summed loss is logged, the per-token mean is optimised
        train_epoch_loss += sentence_loss.item()
        loss = sentence_loss / (len(sentence)-1)

        loss.backward()
        optimizer.step()

    # Adjust the learning rate
    scheduler.step()

    # ---- validation ----
    peephole_lstm.eval()
    n_valid_perplexity = []

    with torch.no_grad():
        for sentence in valid_tensor:
            logits = peephole_lstm(sentence[:-1].to(device))
            sentence_loss = F.cross_entropy(logits, sentence[1:].to(device), reduction="sum")
            valid_epoch_loss += sentence_loss.item()

            # per-sentence perplexity = exp(mean token cross-entropy)
            n_valid_perplexity.append(torch.exp(F.cross_entropy(logits, sentence[1:].to(device),reduction='mean')).item())

    # ---- epoch summary (losses are averaged per sentence) ----
    all_train_epoch_losses.append(train_epoch_loss / len(train_tensor))
    all_valid_epoch_losses.append(valid_epoch_loss / len(valid_tensor))
    all_epoch_perplexity.append(np.sum(n_valid_perplexity) / len(n_valid_perplexity))

    print(
        epoch, ":\t / loss for train data:\t",
        train_epoch_loss / len(train_tensor),
        "\t / loss for validation data:\t",
        valid_epoch_loss / len(valid_tensor),
        "\t / perplexity:\t",
        np.sum(n_valid_perplexity) / len(n_valid_perplexity),
        flush=True
    )

### with batch

In [None]:
# peephole lstm model with batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _collate_batch(batch_data, padding_idx, device):
    """Pad a list of 1-D id tensors for next-token prediction.

    Returns ``(inputs, labels, mask)``, each ``(len(batch_data), max_len - 1)``:
    inputs hold ``sentence[:-1]`` padded with ``padding_idx``, labels hold
    ``sentence[1:]`` padded with 0, and mask is True on real positions.
    """
    max_len = max(len(sentence) for sentence in batch_data)
    shape = (len(batch_data), max_len - 1)
    # allocate directly on the target device
    inputs = torch.full(shape, padding_idx, dtype=torch.int64, device=device)
    labels = torch.zeros(shape, dtype=torch.int64, device=device)
    mask = torch.zeros(shape, dtype=torch.bool, device=device)
    for row, sentence in enumerate(batch_data):
        n_steps = len(sentence) - 1
        inputs[row, :n_steps] = sentence[:-1]
        labels[row, :n_steps] = sentence[1:]
        mask[row, :n_steps] = True
    return inputs, labels, mask


peephole_lstm_batch = Peephole_LSTM_batch(len(train_words)).to(device)

# optimizer
optimizer = torch.optim.Adam(peephole_lstm_batch.parameters(), lr = 1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# epochs and batchsize
n_epochs=20
batch_size=10

# per-epoch history
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []
all_epoch_accuracies = []

for epoch in range(n_epochs):
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0
    peephole_lstm_batch.train()

    # ---- training ----
    for batch in range(0, len(train_tensor), batch_size):
        batch_data = train_tensor[batch:batch+batch_size]
        optimizer.zero_grad()

        batched_input, batch_labels, batched_mask = _collate_batch(
            batch_data, peephole_lstm_batch.embeddings.padding_idx, device
        )

        # keep only the logits / labels of real (non-padded) positions
        batch_logits = peephole_lstm_batch(batched_input)[batched_mask]
        batch_labels = batch_labels[batched_mask]

        batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
        # one label per real target token, i.e. sum(len(s) - 1) over the batch
        batch_norm = batch_labels.numel()

        train_epoch_loss += batch_loss.item()
        loss = batch_loss / batch_norm
        loss.backward()
        optimizer.step()

    # Adjust the learning rate
    scheduler.step()

    # ---- validation ----
    peephole_lstm_batch.eval()
    n_valid_perplexity = []
    accuracy = []

    with torch.no_grad():
        for batch in range(0, len(valid_tensor), batch_size):
            batch_data = valid_tensor[batch:batch+batch_size]

            batched_input, batch_labels, batched_mask = _collate_batch(
                batch_data, peephole_lstm_batch.embeddings.padding_idx, device
            )

            batch_logits = peephole_lstm_batch(batched_input)[batched_mask]
            batch_labels = batch_labels[batched_mask]

            batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
            valid_epoch_loss += batch_loss.item()

            # Per-batch perplexity = exp(mean token cross-entropy), derived
            # from the summed loss instead of a second cross-entropy pass.
            # NOTE(review): the epoch value below is the mean of per-batch
            # perplexities, not exp(total NLL / total tokens); kept as is so
            # curves stay comparable across the notebook's experiments.
            n_valid_perplexity.append(torch.exp(batch_loss / batch_labels.numel()).item())

            # fraction of target tokens whose arg-max prediction is correct
            accuracy.append((batch_labels == batch_logits.argmax(dim=1)).sum().item() / len(batch_labels))

    # ---- epoch summary (losses are averaged per sentence) ----
    mean_train_loss = train_epoch_loss  / len(train_tensor)
    mean_valid_loss = valid_epoch_loss  / len(valid_tensor)
    mean_perplexity = np.sum(n_valid_perplexity) / (len(n_valid_perplexity))
    mean_accuracy = np.sum(accuracy) / len(accuracy)

    all_train_epoch_losses.append(mean_train_loss)
    all_valid_epoch_losses.append(mean_valid_loss)
    all_epoch_perplexity.append(mean_perplexity)
    all_epoch_accuracies.append(mean_accuracy)

    print(
        epoch, ":\t / loss for train data:\t",
        mean_train_loss,
        "\t / loss for validation data:\t",
        mean_valid_loss,
        "\t / perplexity:\t",
        mean_perplexity,
        "\t / accuracy:\t",
        mean_accuracy,
        flush=True
    )


In [None]:
# save data
# Freeze the per-epoch histories of the peephole LSTM as tensors ...
p_perplexity = torch.tensor(all_epoch_perplexity)
p_accuracies = torch.tensor(all_epoch_accuracies)
p_train_losses = torch.tensor(all_train_epoch_losses)
p_valid_losses = torch.tensor(all_valid_epoch_losses)

# ... and write each one to its own file.
for metric, target in (
    (p_perplexity, "./p_perplexity.pt"),
    (p_accuracies, "./p_accuracies.pt"),
    (p_train_losses, "./p_train_losses.pt"),
    (p_valid_losses, "./p_valid_losses.pt"),
):
    torch.save(metric, target)

## Plot

In [None]:
y1 = all_epoch_perplexity
y2 = all_train_epoch_losses
y3 = all_valid_epoch_losses
y4 = all_epoch_accuracies

# One x value per recorded epoch, taken from each series itself. The global
# `n_epochs` is reassigned by every training cell, so it can disagree with
# the list lengths and make plt.plot fail on mismatched sizes.
x1 = range(len(y1))
x2 = range(len(y2))
x3 = range(len(y3))
x4 = range(len(y4))

# validation perplexity
plt.figure()
plt.plot(x1, y1)
plt.title('Valid perplexity vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Valid perplexity')

# validation accuracy
plt.figure()
plt.plot(x4, y4)
plt.title('Accuracy vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Accuracy')

# train and validation loss on shared axes
plt.figure()
plt.plot(x2, y2,  label='Train loss')
plt.plot(x3, y3, label='Valid loss')
plt.title('Train/Valid loss vs. epoches')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.show()

# ON-LSTM

## ON-LSTM cell

In [None]:
class ON_LSTM_cell(nn.Module):
    """Single-layer ordered-neurons LSTM (ON-LSTM) cell.

    On top of the standard LSTM gates, two "master" gates built with a
    cumulative softmax over the hidden units decide which units are
    overwritten and which are kept. Inputs are batch-first,
    ``(batch, seq, input_size)``; the state is kept transposed,
    ``(hidden_size, batch)``.
    """
    def __init__(self, input_size, hidden_size):
        super(ON_LSTM_cell, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size

        # input gate
        self.W_i = Parameter(Tensor(hidden_size, input_size))
        self.U_i = Parameter(Tensor(hidden_size, hidden_size))
        self.b_i = Parameter(Tensor(hidden_size, 1))

        # forget gate
        self.W_f = Parameter(Tensor(hidden_size, input_size))
        self.U_f = Parameter(Tensor(hidden_size, hidden_size))
        self.b_f = Parameter(Tensor(hidden_size, 1))

        # output gate
        self.W_o = Parameter(Tensor(hidden_size, input_size))
        self.U_o = Parameter(Tensor(hidden_size, hidden_size))
        self.b_o = Parameter(Tensor(hidden_size, 1))

        # cell candidate
        self.W_g = Parameter(Tensor(hidden_size, input_size))
        self.U_g = Parameter(Tensor(hidden_size, hidden_size))
        self.b_g = Parameter(Tensor(hidden_size, 1))

        # master input gate
        self.W_mi = Parameter(Tensor(hidden_size, input_size))
        self.U_mi = Parameter(Tensor(hidden_size, hidden_size))
        self.b_mi = Parameter(Tensor(hidden_size, 1))

        # master forget gate
        self.W_mf = Parameter(Tensor(hidden_size, input_size))
        self.U_mf = Parameter(Tensor(hidden_size, hidden_size))
        self.b_mf = Parameter(Tensor(hidden_size, 1))

        # the tensors above are allocated uninitialised; fill them now
        self.initialize_weights()

    def initialize_weights(self):
        """Draw every parameter (biases included) from Xavier-normal."""
        for weight in self.parameters():
            init.xavier_normal_(weight)

    def cumsoftmax(self, x, dim=-1):
        """Return the cumulative sum of ``softmax(x)`` along ``dim``.

        The result rises monotonically to 1 along ``dim``. ``dim`` defaults
        to the last axis, as before.
        """
        return torch.cumsum(F.softmax(x, dim=dim), dim=dim)

    def forward(self, inputs, state = None):
        """Run the ON-LSTM over a batch of sequences.

        Args:
            inputs: ``(batch, seq, input_size)`` tensor.
            state: Optional ``(h, c)`` pair, each ``(1, batch, hidden_size)``.
                When omitted, both start at zero.

        Returns:
            ``(hidden_seq, (h, c))`` where ``hidden_seq`` is
            ``(batch, seq, hidden_size)`` and ``h`` / ``c`` are the final
            states, each ``(1, batch, hidden_size)``.
        """
        batch_size, seq_size, _ = inputs.size()

        if state is None:
            # new_zeros inherits device and dtype from the inputs
            h_t = inputs.new_zeros(self.hidden_size, batch_size)
            c_t = inputs.new_zeros(self.hidden_size, batch_size)
        else:
            (h, c) = state
            h_t = h.squeeze(0).t()
            c_t = c.squeeze(0).t()

        hidden_seq = []
        for t in range(seq_size):
            # (input_size, batch) slice for time step t
            x_t = inputs[:, t, :].t()

            # input gate
            i_t = torch.sigmoid(self.W_i @ x_t + self.U_i @ h_t + self.b_i)
            # forget gate
            f_t = torch.sigmoid(self.W_f @ x_t + self.U_f @ h_t + self.b_f)
            # cell candidate
            g_t = torch.tanh(self.W_g @ x_t + self.U_g @ h_t + self.b_g)
            # output gate
            o_t = torch.sigmoid(self.W_o @ x_t + self.U_o @ h_t + self.b_o)

            # Master gates. Pre-activations are (hidden_size, batch), so the
            # cumulative softmax must run over dim 0, the hidden units. The
            # default last axis is the batch here, which would mix samples
            # and degenerate to constant gates for a batch of one.
            im_t = 1. - self.cumsoftmax(self.W_mi @ x_t + self.U_mi @ h_t + self.b_mi, dim=0)
            fm_t = self.cumsoftmax(self.W_mf @ x_t + self.U_mf @ h_t + self.b_mf, dim=0)

            # Overlap of the two master gates; only inside it do the
            # standard gates decide. Outside it the master gates alone copy
            # the old state or write the candidate.
            w_t = fm_t * im_t
            f_hat = f_t * w_t + (fm_t - w_t)
            i_hat = i_t * w_t + (im_t - w_t)

            c_t = f_hat * c_t + i_hat * g_t
            h_t = o_t * torch.tanh(c_t)

            hidden_seq.append(h_t.t())

        if hidden_seq:
            # stack along a new time axis -> (batch, seq, hidden_size)
            hidden_seq = torch.stack(hidden_seq, dim=1)
        else:
            # zero-length sequence: nothing to stack, return an empty output
            hidden_seq = inputs.new_zeros(batch_size, 0, self.hidden_size)

        return hidden_seq, (h_t.t().unsqueeze(0), c_t.t().unsqueeze(0))
    

## ON-LSTM model

In [None]:
# without batch
class ON_LSTM(nn.Module):
    """Language model for one unbatched sentence built on ``ON_LSTM_cell``.

    Token ids are embedded, passed through the recurrent cell with a leading
    batch dimension of 1, and projected to one logit per vocabulary word.

    Args:
        total_num_words: vocabulary size (embedding rows and output logits).
        emb_size: embedding dimension.
        lstm_hidden_size: hidden size of the recurrent cell.
        n_lstm_layers: accepted but not used by this class.
    """

    def __init__(self, total_num_words, emb_size=100, lstm_hidden_size=200, n_lstm_layers=1):
        super().__init__()
        self.embeddings = nn.Embedding(total_num_words, emb_size)
        self.lstm = ON_LSTM_cell(input_size=emb_size, hidden_size=lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, input_tensor):
        """Return next-word logits for every position of ``input_tensor``.

        A batch dimension is added before the recurrent cell and removed
        again from the projected output.
        """
        # add the batch dimension the recurrent cell unpacks
        batched_embedding = self.embeddings(input_tensor).unsqueeze(0)
        hidden, _final_state = self.lstm(batched_embedding)
        return self.output_proj(hidden).squeeze(0)
    

In [None]:
# with batch
class ON_LSTM_batch(nn.Module):
    """Batched language model built on ``ON_LSTM_cell``.

    The embedding table has one extra row, at index ``total_num_words``,
    reserved for padding. The output projection only scores the
    ``total_num_words`` real words.

    Args:
        total_num_words: vocabulary size, excluding the padding symbol.
        emb_size: embedding dimension.
        lstm_hidden_size: hidden size of the recurrent cell.
        n_lstm_layers: accepted but not used by this class.
    """

    def __init__(self, total_num_words, emb_size=400, lstm_hidden_size=1000, n_lstm_layers=1):
        super().__init__()
        # The padding row is the last one. Derive its index from the
        # constructor argument rather than a notebook global, so the model is
        # valid for any vocabulary size.
        self.embeddings = nn.Embedding(total_num_words + 1, emb_size, padding_idx=total_num_words)
        self.lstm = ON_LSTM_cell(input_size=emb_size, hidden_size=lstm_hidden_size)
        self.output_proj = nn.Linear(lstm_hidden_size, total_num_words)

    def forward(self, input_tensor):
        """Return next-word logits for a padded batch of token ids.

        The embedded batch is passed to the recurrent cell as-is, and the
        projection is applied to every position, padded ones included.
        """
        embedding = self.embeddings(input_tensor)
        hidden, _final_state = self.lstm(embedding)
        return self.output_proj(hidden)

    

## Training model

### without batch

In [None]:
# on lstm model: train sentence by sentence (no batching)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

on_lstm = ON_LSTM(len(train_words)).to(device)

# optimizer: Adam, learning rate multiplied by 0.9 every 5 epochs
optimizer = torch.optim.Adam(on_lstm.parameters(), lr = 1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# Per-epoch metric histories. The names must match the ones appended to at
# the end of every epoch; otherwise the loop raises NameError or silently
# extends lists left over from another cell.
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []

n_epochs = 10
for epoch in range(n_epochs):
    # learning
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0
    on_lstm.train()
    for sentence in train_tensor:
        optimizer.zero_grad()  # reset the accumulated gradients

        # predict token t+1 from tokens up to t
        targets = sentence[1:].to(device)
        logits = on_lstm(sentence[:-1].to(device))

        sentence_loss = F.cross_entropy(logits, targets, reduction="sum")
        train_epoch_loss += sentence_loss.item()

        # optimise the mean per-token loss
        loss = sentence_loss / (len(sentence)-1)
        loss.backward()
        optimizer.step()

    # Adjust the learning rate
    scheduler.step()

    # evaluation
    on_lstm.eval()
    n_valid_perplexity = []

    with torch.no_grad():
        for sentence in valid_tensor:
            targets = sentence[1:].to(device)
            logits = on_lstm(sentence[:-1].to(device))
            sentence_loss = F.cross_entropy(logits, targets, reduction="sum")
            valid_epoch_loss += sentence_loss.item()

            # sentence perplexity = exp(mean per-token cross entropy)
            n_valid_perplexity.append(torch.exp(sentence_loss / targets.numel()).item())

    # epoch summaries: mean loss per sentence, mean of sentence perplexities
    train_loss = train_epoch_loss / len(train_tensor)
    valid_loss = valid_epoch_loss / len(valid_tensor)
    perplexity = np.sum(n_valid_perplexity) / len(n_valid_perplexity)

    all_train_epoch_losses.append(train_loss)
    all_valid_epoch_losses.append(valid_loss)
    all_epoch_perplexity.append(perplexity)

    print(
        epoch, ":\t / loss for train data:\t",
        train_loss,
        "\t / loss for validation data:\t",
        valid_loss,
        "\t / perplexity:\t",
        perplexity,
        flush=True
    )

### with batch

In [None]:
# on lstm model: train with padded mini-batches
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

on_lstm_batch = ON_LSTM_batch(len(train_words)).to(device)

# optimizer: Adam, learning rate multiplied by 0.9 every 5 epochs
optimizer = torch.optim.Adam(on_lstm_batch.parameters(), lr = 1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma=0.9)

# epochs and batchsize
n_epochs=20
batch_size=10

# per-epoch metric histories
all_train_epoch_losses = []
all_valid_epoch_losses = []
all_epoch_perplexity = []
all_epoch_accuracies = []


def _pad_batch(sentences, padding_idx, device):
    """Pad a list of token-id tensors into inputs, validity mask and labels.

    Every sentence contributes its tokens except the last as input and its
    tokens except the first as labels. Positions past a sentence's end hold
    ``padding_idx`` in the inputs, ``False`` in the mask and 0 in the labels.
    """
    width = max(len(sentence) for sentence in sentences) - 1
    inputs = torch.full((len(sentences), width), padding_idx, dtype=torch.int64, device=device)
    mask = torch.zeros(len(sentences), width, dtype=torch.bool, device=device)
    labels = torch.zeros(len(sentences), width, dtype=torch.int64, device=device)
    for row, sentence in enumerate(sentences):
        n_tokens = len(sentence) - 1
        inputs[row, :n_tokens] = sentence[:-1]
        mask[row, :n_tokens] = True
        labels[row, :n_tokens] = sentence[1:]
    return inputs, mask, labels


padding_idx = on_lstm_batch.embeddings.padding_idx

# training
for epoch in range(n_epochs):
    random.shuffle(train_tensor)
    train_epoch_loss = 0
    valid_epoch_loss = 0
    on_lstm_batch.train()

    for start in range(0, len(train_tensor), batch_size):
        batch_data = train_tensor[start:start+batch_size]
        optimizer.zero_grad()

        batched_input, batched_mask, batch_labels = _pad_batch(batch_data, padding_idx, device)

        # keep only the real (non-padded) positions
        batch_logits = on_lstm_batch(batched_input)[batched_mask]
        batch_labels = batch_labels[batched_mask]

        batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
        train_epoch_loss += batch_loss.item()

        # optimise the mean loss per real token of the batch
        loss = batch_loss / batch_labels.numel()
        loss.backward()
        optimizer.step()

    # Adjust the learning rate
    scheduler.step()

    # evaluation
    on_lstm_batch.eval()
    n_valid_perplexity = []
    accuracy = []

    with torch.no_grad():
        for start in range(0, len(valid_tensor), batch_size):
            batch_data = valid_tensor[start:start+batch_size]

            batched_input, batched_mask, batch_labels = _pad_batch(batch_data, padding_idx, device)

            batch_logits = on_lstm_batch(batched_input)[batched_mask]
            batch_labels = batch_labels[batched_mask]

            batch_loss = F.cross_entropy(batch_logits, batch_labels, reduction="sum")
            valid_epoch_loss += batch_loss.item()

            # batch perplexity = exp(mean per-token cross entropy)
            n_valid_perplexity.append(torch.exp(batch_loss / batch_labels.numel()).item())

            # fraction of tokens whose arg-max prediction is correct
            accuracy.append((batch_labels == batch_logits.argmax(dim=1)).sum().item() / len(batch_labels))

    # epoch summaries: mean loss per sentence, mean of the per-batch metrics
    train_loss = train_epoch_loss / len(train_tensor)
    valid_loss = valid_epoch_loss / len(valid_tensor)
    # Each entry is already a per-token perplexity, so only average over the
    # batches; an extra division by batch_size would make the stored history
    # disagree with the value printed below.
    # NOTE(review): histories saved for other models should use this same
    # definition before being compared in one plot - confirm.
    perplexity = np.sum(n_valid_perplexity) / len(n_valid_perplexity)
    mean_accuracy = np.sum(accuracy) / len(accuracy)

    all_train_epoch_losses.append(train_loss)
    all_valid_epoch_losses.append(valid_loss)
    all_epoch_perplexity.append(perplexity)
    all_epoch_accuracies.append(mean_accuracy)

    print(
        epoch, ":\t / loss for train data:\t",
        train_loss,
        "\t / loss for validation data:\t",
        valid_loss,
        "\t / perplexity:\t",
        perplexity,
        "\t / accuracy:\t",
        mean_accuracy,
        flush=True
    )
    
      

In [None]:
# save data
# Turn every per-epoch metric history into a tensor first ...
on_perplexity = torch.tensor(all_epoch_perplexity)
on_accuracies = torch.tensor(all_epoch_accuracies)
on_train_losses = torch.tensor(all_train_epoch_losses)
on_valid_losses = torch.tensor(all_valid_epoch_losses)

# ... then write each one to its own file in the working directory.
for metric, destination in (
    (on_perplexity, "./on_perplexity.pt"),
    (on_accuracies, "./on_accuracies.pt"),
    (on_train_losses, "./on_train_losses.pt"),
    (on_valid_losses, "./on_valid_losses.pt"),
):
    torch.save(metric, destination)

## Plot

In [None]:

# All curves share the same x axis: the epoch indices 0 .. n_epochs - 1.
epochs = range(0, n_epochs)

# validation perplexity per epoch
plt.figure()
plt.plot(epochs, all_epoch_perplexity)
plt.title('Valid perplexity vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Valid perplexity')

# validation accuracy per epoch
plt.figure()
plt.plot(epochs, all_epoch_accuracies)
plt.title('Accuracy vs. epoches')
plt.xlabel('epoch')
plt.ylabel('Accuracy')

# train and validation loss in one figure
plt.figure()
for history, legend_label in (
    (all_train_epoch_losses, 'Train loss'),
    (all_valid_epoch_losses, 'Valid loss'),
):
    plt.plot(epochs, history, label=legend_label)
plt.title('Train/Valid loss vs. epoches')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.show()

In [None]:
# Plot accuracy in one figure
# One shared x axis for all three models.
epochs = range(0, n_epochs)

# Load the accuracy histories saved by each model's training run.
on_acc, p_acc, naive_acc = (
    torch.load(path)
    for path in ("./on_accuracies.pt", "./p_accuracies.pt", "./naive_accuracies.pt")
)

plt.figure()
for history, legend_label in (
    (naive_acc, 'naive LSTM'),
    (p_acc, 'peephole LSTM'),
    (on_acc, 'on LSTM'),
):
    plt.plot(epochs, history, label=legend_label)
plt.title('Accuracy vs. epoches')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()

In [None]:
# Plot perplexity in one figure
# One shared x axis for all three models.
epochs = range(0, n_epochs)

# Load the perplexity histories saved by each model's training run.
on_p, p_p, naive_p = (
    torch.load(path)
    for path in ("./on_perplexity.pt", "./p_perplexity.pt", "./naive_perplexity.pt")
)

plt.figure()
for history, legend_label in (
    (naive_p, 'naive LSTM'),
    (p_p, 'peephole LSTM'),
    (on_p, 'on LSTM'),
):
    plt.plot(epochs, history, label=legend_label)
plt.title('Perplexity vs. epoches')
plt.xlabel('epoch')
plt.ylabel('perplexity')
plt.legend()

In [None]:
# Plot train and validation losses for 3 LSTM models
# One shared x axis for every curve.
epochs = range(0, n_epochs)

# validation-loss histories
on_v_loss = torch.load("./on_valid_losses.pt")
p_v_loss = torch.load("./p_valid_losses.pt")
naive_v_loss = torch.load("./naive_valid_losses.pt")

# train-loss histories
on_t_loss = torch.load("./on_train_losses.pt")
p_t_loss = torch.load("./p_train_losses.pt")
naive_t_loss = torch.load("./naive_train_losses.pt")

# One figure per model, each showing validation and train loss together.
for figure_title, valid_history, train_history in (
    ('Losses from naive LSTM vs. epoches', naive_v_loss, naive_t_loss),
    ('Losses from Peephole LSTM vs. epoches', p_v_loss, p_t_loss),
    ('Losses from On LSTM vs. epoches', on_v_loss, on_t_loss),
):
    plt.figure()
    plt.plot(epochs, valid_history, label='Validation loss')
    plt.plot(epochs, train_history, label='Train loss')

    plt.title(figure_title)
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()

