# Peephole LSTM Test & Performance Comparison (Speed & Memory)

* [Imports](#Importing-necessary-modules)
* [Load & Definition](#Loading-and-defining-modules)
    * [Autograd Functions](#Autograd-Functions)
    * [Module Classes](#Module-classes-(C++,-CUDA,-PyTorch))
* [Models](#Defining-models)
    * [Definition](#Definition)
    * [Instantiation](#Instantiation)
    * [Parameter Synchronization](#Parameter-Synchronization)
* [Fake Dataset](#Creating-a-fake-dataset)
* [Sanity Check](#Sanity-check:-output-comparison)
    * [Forward Outputs](#Forward-Outputs)
    * [Backward Gradients](#Backward-Gradients)
* [Forward Performance](#Forward-time-comparison)
* [+Backward Performance](#+Backward-time-comparison)

---

## Importing necessary modules
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [5]:
import torch
from torch import nn
from torch.utils.cpp_extension import load
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

import math
from collections import OrderedDict
from time import sleep

torch.__version__

'0.4.1'

---

## Loading and defining modules
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Autograd Functions
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [6]:
# Build and import the C++ extension from ./bn_peephole_lstm_layer.cpp (the path is relative to
# the notebook's working directory). BNPeepholeLSTMFunction calls its `forward` and `backward`
# entry points.
_bn_peephole_lstm_layer_cpp = load('bn_peephole_lstm_layer', ['./bn_peephole_lstm_layer.cpp'])

########################################################################################################################

class BNPeepholeLSTMFunction(torch.autograd.Function):
    """Autograd wrapper around the compiled batch-normalised peephole LSTM layer.

    Both passes delegate to ``_bn_peephole_lstm_layer_cpp``; this class only routes arguments and
    results between autograd and the extension.
    """

    @staticmethod
    def forward(ctx, input,
                weight_ih, weight_hh, weight_ch, bias,
                gamma_ih, gamma_hh, gamma_ch, gamma_tanh_cell, beta_tanh_cell,
                running_mean_ih, running_mean_hh, running_mean_ch, running_mean_tanh_cell,
                running_var_ih, running_var_hh, running_var_ch, running_var_tanh_cell,
                old_h, old_cell,
                momentum, epsilon, dropout_p, training):
        """Run the extension's forward pass and return ``(out, new_h, new_cell)``.

        The extension returns a flat sequence: items 0-2 are the values returned here, items 3-10
        are written back into the eight running-statistic tensors (in argument order), and the
        remaining items are stashed for ``backward``.
        """
        running_stats = (running_mean_ih, running_mean_hh, running_mean_ch, running_mean_tanh_cell,
                         running_var_ih, running_var_hh, running_var_ch, running_var_tanh_cell)

        results = _bn_peephole_lstm_layer_cpp.forward(
            input, weight_ih, weight_hh, weight_ch, bias,
            gamma_ih, gamma_hh, gamma_ch, gamma_tanh_cell, beta_tanh_cell,
            *running_stats,
            old_h, old_cell,
            momentum, epsilon, dropout_p, training)

        out, new_h, new_cell = results[:3]

        # Refresh the running statistics in place so the owning module's buffers see the update.
        for stat, refreshed in zip(running_stats, results[3:11]):
            stat.data = refreshed

        # save_for_backward only accepts tensors, so the boolean flag lives directly on ctx.
        ctx.training = training
        ctx.save_for_backward(*results[11:],
                              weight_ih, weight_hh, weight_ch,
                              gamma_ih, gamma_hh, gamma_ch, gamma_tanh_cell)

        return out, new_h, new_cell

    @staticmethod
    def backward(ctx, grad_output, grad_h, grad_cell):
        """Run the extension's backward pass and map its results onto ``forward``'s 24 inputs.

        Gradients are returned for the input, the weights, the bias, the gammas/beta and the two
        initial states; the running statistics and the four scalar settings get ``None``.
        """
        incoming = [grad.contiguous() for grad in (grad_output, grad_h, grad_cell)]
        grads = _bn_peephole_lstm_layer_cpp.backward(*incoming, *ctx.saved_tensors, ctx.training)

        # The extension lists the state gradients first, followed by d_input, the three weight
        # gradients, d_bias, the four gamma gradients and d_beta_tanh_cell.
        d_old_h, d_old_cell, *d_input_and_params = grads

        no_grad_running_stats = (None,) * 8   # running_mean_* and running_var_*
        no_grad_settings = (None,) * 4        # momentum, epsilon, dropout_p, training

        return (*d_input_and_params, *no_grad_running_stats, d_old_h, d_old_cell, *no_grad_settings)
   

### Module classes (C++, CUDA, PyTorch)
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [7]:
class BNPeepholeLSTMTorch(nn.Module):
    """Single-layer peephole LSTM with batch normalisation, written with plain PyTorch ops.

    Batch normalisation (scale only, no shift) is applied separately to the input-to-hidden,
    hidden-to-hidden and cell-to-gate (peephole) projections; a fourth batch norm with scale and
    shift is applied to ``tanh(cell)`` before the output gate.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the hidden and cell states.
        dropout: dropout probability in [0, 1], applied to the hidden state fed into every step
            and to the stacked outputs.
        momentum: running-statistics momentum in [0, 1] passed to ``F.batch_norm``.
        eps: numerical-stability constant passed to ``F.batch_norm``.

    Raises:
        ValueError: if ``dropout`` or ``momentum`` lies outside [0, 1].
    """

    def __init__(self, input_size, hidden_size, dropout=0., momentum=0.1, eps=1e-05):
        if not 0 <= dropout <= 1:
            raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
        if not 0 <= momentum <= 1:
            raise ValueError(f"Invalid momentum value : {momentum} momentum must be in range [0, 1].")
        super(BNPeepholeLSTMTorch, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = float(dropout)
        self.momentum = float(momentum)
        self.eps = eps

        # Projections. The ih/hh weights feed four gate chunks; the peephole weight only feeds the
        # three sigmoid chunks.
        self.register_parameter('weight_ih', nn.Parameter(torch.empty(4 * hidden_size, input_size)))
        self.register_parameter('weight_hh', nn.Parameter(torch.empty(4 * hidden_size, hidden_size)))
        self.register_parameter('weight_ch', nn.Parameter(torch.empty(3 * hidden_size, hidden_size)))
        self.register_parameter('bias', nn.Parameter(torch.empty(4 * hidden_size)))

        # Batch-norm scales (gamma) per projection, plus scale and shift for tanh(cell).
        self.register_parameter('gamma_ih', nn.Parameter(torch.empty(4 * hidden_size)))
        self.register_parameter('gamma_hh', nn.Parameter(torch.empty(4 * hidden_size)))
        self.register_parameter('gamma_ch', nn.Parameter(torch.empty(3 * hidden_size)))
        self.register_parameter('gamma_tanh_cell', nn.Parameter(torch.empty(hidden_size)))
        self.register_parameter('beta_tanh_cell', nn.Parameter(torch.empty(hidden_size)))

        # Batch-norm running statistics (buffers, not trained).
        self.register_buffer('running_mean_ih', torch.empty(4 * hidden_size))
        self.register_buffer('running_mean_hh', torch.empty(4 * hidden_size))
        self.register_buffer('running_mean_ch', torch.empty(3 * hidden_size))
        self.register_buffer('running_mean_tanh_cell', torch.empty(hidden_size))
        self.register_buffer('running_var_ih', torch.empty(4 * hidden_size))
        self.register_buffer('running_var_hh', torch.empty(4 * hidden_size))
        self.register_buffer('running_var_ch', torch.empty(3 * hidden_size))
        self.register_buffer('running_var_tanh_cell', torch.empty(hidden_size))

        self.reset_parameters()
        self.reset_running_stats()

    def reset_parameters(self):
        """Re-initialise all trainable parameters in place."""
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        self.weight_ih.data.uniform_(-stdv, +stdv)
        self.weight_hh.data.uniform_(-stdv, +stdv)
        self.weight_ch.data.uniform_(-stdv, +stdv)

        # Zero bias, except 1 for the first gate chunk (the one that multiplies the previous cell
        # state in forward()).
        self.bias.data.zero_()
        self.bias.data[:self.hidden_size].fill_(1.)

        # uniform_() without arguments samples from [0, 1).
        self.gamma_ih.data.uniform_()
        self.gamma_hh.data.uniform_()
        self.gamma_ch.data.uniform_()
        self.gamma_tanh_cell.data.uniform_()
        self.beta_tanh_cell.data.zero_()

    def reset_running_stats(self):
        """Reset every running mean to 0 and every running variance to 1."""
        self.running_mean_ih.data.zero_()
        self.running_mean_hh.data.zero_()
        self.running_mean_ch.data.zero_()
        self.running_mean_tanh_cell.data.zero_()
        self.running_var_ih.data.fill_(1.)
        self.running_var_hh.data.fill_(1.)
        self.running_var_ch.data.fill_(1.)
        self.running_var_tanh_cell.data.fill_(1.)

    def _batch_norm(self, x, running_mean, running_var, weight, bias=None):
        """Apply ``F.batch_norm`` with this layer's training flag, momentum and eps."""
        return F.batch_norm(x, running_mean, running_var, weight=weight, bias=bias,
                            training=self.training, momentum=self.momentum, eps=self.eps)

    def forward(self, input, states):
        """Run the layer over a whole sequence.

        Args:
            input: tensor of shape (batch, seq_len, input_size).
            states: pair ``(h, c)`` of tensors of shape (batch, hidden_size); they are cloned, so
                the caller's tensors are not modified.

        Returns:
            ``(outputs, (h, c))`` where ``outputs`` has shape (batch, seq_len, hidden_size) and
            ``h``/``c`` are the states after the last time step.
        """
        assert input.dim() == 3
        outputs = input.new_empty((input.size(0), input.size(1), self.hidden_size))

        h = states[0].clone()
        c = states[1].clone()

        weight_ih = self.weight_ih.t()
        weight_hh = self.weight_hh.t()
        weight_ch = self.weight_ch.t()

        # The input projection does not depend on the recurrence, so compute it for every time
        # step at once; the result is time-major: (seq_len, batch, 4 * hidden_size).
        ih = torch.matmul(input.transpose(0, 1), weight_ih)

        sigmoid_width = 3 * self.hidden_size

        for i in range(input.size(1)):
            h = F.dropout(h, p=self.dropout, training=self.training)

            gates = (self._batch_norm(ih[i], self.running_mean_ih, self.running_var_ih, self.gamma_ih)
                     + self._batch_norm(torch.mm(h, weight_hh), self.running_mean_hh, self.running_var_hh, self.gamma_hh)
                     + self.bias)

            # The peephole term (from the previous cell state) is added to the three sigmoid
            # chunks only; the last chunk goes through tanh.
            peephole = self._batch_norm(torch.mm(c, weight_ch), self.running_mean_ch, self.running_var_ch, self.gamma_ch)
            gates = torch.cat((gates[:, :sigmoid_width].add(peephole).sigmoid(),
                               gates[:, sigmoid_width:].tanh()), dim=1).chunk(chunks=4, dim=1)

            # c_new = gates[1] * gates[3] + c * gates[0]
            c = torch.addcmul(gates[1] * gates[3], c, gates[0])
            h = gates[2] * self._batch_norm(c.tanh(), self.running_mean_tanh_cell, self.running_var_tanh_cell,
                                            self.gamma_tanh_cell, bias=self.beta_tanh_cell)

            outputs[:, i] = h

        outputs = torch.nn.functional.dropout(outputs, p=self.dropout, training=self.training)

        return outputs, (h, c)

    def __repr__(self):
        # Report the real class name; the previous hard-coded name did not match this class.
        return f"{type(self).__name__}(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout}, momentum={self.momentum}, eps={self.eps})"

########################################################################################################################
    
# class LNPeepholeLSTMTorch(nn.Module):
#     def __init__(self, input_size, hidden_size, dropout=0., momentum=0.1, eps=1e-05):
#         if not 0 <= dropout <= 1:
#             raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
#         if not 0 <= momentum <= 1:
#             raise ValueError(f"Invalid momentum value : {momentum} momentum must be in range [0, 1].")
#         super(PeepholeLayerNormLSTMTorch, self).__init__()
        
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.dropout = float(dropout)
#         self.momentum = float(momentum) #Not Used
#         self.eps = eps
        
#         self.register_parameter('weight_ih', nn.Parameter(torch.empty(4 * hidden_size, input_size)))
#         self.register_parameter('weight_hh', nn.Parameter(torch.empty(4 * hidden_size, hidden_size)))
#         self.register_parameter('weight_ch', nn.Parameter(torch.empty(3 * hidden_size, hidden_size)))
#         self.register_parameter('bias', nn.Parameter(torch.empty(4 * hidden_size)))
        
#         self.register_parameter('gamma_ih', nn.Parameter(torch.empty(4 * hidden_size)))
#         self.register_parameter('gamma_hh', nn.Parameter(torch.empty(4 * hidden_size)))
#         self.register_parameter('gamma_ch', nn.Parameter(torch.empty(3 * hidden_size)))
#         self.register_parameter('gamma_tanh_cell', nn.Parameter(torch.empty(hidden_size)))
#         self.register_parameter('beta_tanh_cell', nn.Parameter(torch.empty(hidden_size)))
        
#         self.reset_parameters()

#     def reset_parameters(self):
#         stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
#         self.weight_ih.data.uniform_(-stdv, +stdv)
#         self.weight_hh.data.uniform_(-stdv, +stdv)
#         self.weight_ch.data.uniform_(-stdv, +stdv)
        
#         self.bias.data.zero_()
#         self.bias.data[:self.hidden_size].fill_(1.)
        
#         self.gamma_ih.data.uniform_()
#         self.gamma_hh.data.uniform_()
#         self.gamma_ch.data.uniform_()
#         self.gamma_tanh_cell.data.uniform_()
#         self.beta_tanh_cell.data.zero_()
    
#     def forward(self, input, states):
#         assert input.dim() == 3
#         outputs = input.new_empty((input.size(0), input.size(1), self.hidden_size))
        
#         h = states[0].clone()
#         c = states[1].clone()
        
#         hidden_size_3 = 3 * self.hidden_size
#         gate_size = hidden_size_3 + self.hidden_size
        
#         weight_ih = self.weight_ih.t()
#         weight_hh = self.weight_hh.t()
#         weight_ch = self.weight_ch.t()
        
#         ih = torch.matmul(input.transpose(0, 1), weight_ih)
        
#         for i in range(input.size(1)):
#             h = F.dropout(h, p=self.dropout, training=self.training)
            
#             gates = (F.layer_norm(ih[i], [gate_size], weight=self.gamma_ih, bias=None, eps=self.eps)
#                      + F.layer_norm(torch.mm(h, weight_hh), [gate_size], weight=self.gamma_hh, bias=None, eps=self.eps)
#                      + self.bias)
    
#             gates = torch.cat((gates[:, :3 * self.hidden_size].add(F.layer_norm(torch.mm(c, weight_ch), [hidden_size_3], weight=self.gamma_ch, bias=None, eps=self.eps)).sigmoid(), gates[:, 3 * self.hidden_size:].tanh()), dim=1).chunk(chunks=4, dim=1)
#             gates = torch.cat((gates[:, :3 * self.hidden_size].sigmoid(), gates[:, 3 * self.hidden_size:].tanh()), dim=1).chunk(chunks=4, dim=1)
            
#             c = torch.addcmul(gates[1] * gates[3], c, gates[0])
#             h = gates[2] * F.layer_norm(c.tanh(), [self.hidden_size], weight=self.gamma_tanh_cell, bias=self.beta_tanh_cell, eps=self.eps)
            
#             outputs[:, i] = h
        
#         outputs = torch.nn.functional.dropout(outputs, p=self.dropout, training=self.training)
    
#         return outputs, (h, c)
    
#     def __repr__(self):
#         return f"PeepholeLayerNormLSTMTorch(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout}, eps={self.eps})"

########################################################################################################################

class BNPeepholeLSTMCPP(nn.Module):
    """Single-layer batch-normalised peephole LSTM backed by the compiled extension.

    Holds the same parameters and buffers (names and shapes) as ``BNPeepholeLSTMTorch`` and
    forwards the actual computation to ``BNPeepholeLSTMFunction``.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the hidden and cell states.
        dropout: dropout probability in [0, 1], handed to the extension.
        momentum: running-statistics momentum in [0, 1], handed to the extension.
        eps: numerical-stability constant, handed to the extension.

    Raises:
        ValueError: if ``dropout`` or ``momentum`` lies outside [0, 1].
    """

    def __init__(self, input_size, hidden_size, dropout=0., momentum=0.1, eps=1e-05):
        if not 0 <= dropout <= 1:
            raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
        if not 0 <= momentum <= 1:
            raise ValueError(f"Invalid momentum value : {momentum} momentum must be in range [0, 1].")

        super(BNPeepholeLSTMCPP, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = float(dropout)
        self.momentum = float(momentum)
        self.eps = eps

        # Projection weights and bias.
        self.register_parameter('weight_ih', nn.Parameter(torch.empty(4 * hidden_size, input_size)))
        self.register_parameter('weight_hh', nn.Parameter(torch.empty(4 * hidden_size, hidden_size)))
        self.register_parameter('weight_ch', nn.Parameter(torch.empty(3 * hidden_size, hidden_size)))
        self.register_parameter('bias', nn.Parameter(torch.empty(4 * hidden_size)))

        # Batch-norm scales (gamma) per projection, plus scale and shift for tanh(cell).
        self.register_parameter('gamma_ih', nn.Parameter(torch.empty(4 * hidden_size)))
        self.register_parameter('gamma_hh', nn.Parameter(torch.empty(4 * hidden_size)))
        self.register_parameter('gamma_ch', nn.Parameter(torch.empty(3 * hidden_size)))
        self.register_parameter('gamma_tanh_cell', nn.Parameter(torch.empty(hidden_size)))
        self.register_parameter('beta_tanh_cell', nn.Parameter(torch.empty(hidden_size)))

        # Batch-norm running statistics (buffers, not trained).
        self.register_buffer('running_mean_ih', torch.empty(4 * hidden_size))
        self.register_buffer('running_mean_hh', torch.empty(4 * hidden_size))
        self.register_buffer('running_mean_ch', torch.empty(3 * hidden_size))
        self.register_buffer('running_mean_tanh_cell', torch.empty(hidden_size))
        self.register_buffer('running_var_ih', torch.empty(4 * hidden_size))
        self.register_buffer('running_var_hh', torch.empty(4 * hidden_size))
        self.register_buffer('running_var_ch', torch.empty(3 * hidden_size))
        self.register_buffer('running_var_tanh_cell', torch.empty(hidden_size))

        self.reset_parameters()
        self.reset_running_stats()

    def reset_parameters(self):
        """Re-initialise all trainable parameters in place."""
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        self.weight_ih.data.uniform_(-stdv, +stdv)
        self.weight_hh.data.uniform_(-stdv, +stdv)
        self.weight_ch.data.uniform_(-stdv, +stdv)

        # Zero bias, except 1 for the first hidden_size entries.
        self.bias.data.zero_()
        self.bias.data[:self.hidden_size].fill_(1.)

        # uniform_() without arguments samples from [0, 1).
        self.gamma_ih.data.uniform_()
        self.gamma_hh.data.uniform_()
        self.gamma_ch.data.uniform_()
        self.gamma_tanh_cell.data.uniform_()
        self.beta_tanh_cell.data.zero_()

    def reset_running_stats(self):
        """Reset every running mean to 0 and every running variance to 1."""
        self.running_mean_ih.data.zero_()
        self.running_mean_hh.data.zero_()
        self.running_mean_ch.data.zero_()
        self.running_mean_tanh_cell.data.zero_()
        self.running_var_ih.data.fill_(1.)
        self.running_var_hh.data.fill_(1.)
        self.running_var_ch.data.fill_(1.)
        self.running_var_tanh_cell.data.fill_(1.)

    def forward(self, input, state):
        """Run the extension over a whole sequence.

        Args:
            input: 3-D tensor; dimensions 0 and 1 are swapped before the extension call and the
                output is swapped back afterwards (presumably (batch, seq_len, input_size), as in
                BNPeepholeLSTMTorch -- TODO confirm against the C++ source).
            state: pair ``(h, c)`` of initial states.

        Returns:
            ``(output, (new_h, new_cell))`` as produced by ``BNPeepholeLSTMFunction``.
        """
        input = input.transpose(0, 1).contiguous()

        output, new_h, new_cell = BNPeepholeLSTMFunction.apply(
            input,
            self.weight_ih, self.weight_hh, self.weight_ch, self.bias,
            self.gamma_ih, self.gamma_hh, self.gamma_ch, self.gamma_tanh_cell, self.beta_tanh_cell,
            self.running_mean_ih, self.running_mean_hh, self.running_mean_ch, self.running_mean_tanh_cell,
            self.running_var_ih, self.running_var_hh, self.running_var_ch, self.running_var_tanh_cell,
            state[0], state[1],
            self.momentum, self.eps, self.dropout, self.training)

        return output.transpose(0, 1).contiguous(), (new_h, new_cell)

    def __repr__(self):
        # Report the real class name; the previous hard-coded name did not match this class.
        return f"{type(self).__name__}(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout}, momentum={self.momentum}, eps={self.eps})"

---

## Defining models
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Definition
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [8]:
class BNPeepholeTorch(nn.Module):
    """Three stacked ``BNPeepholeLSTMTorch`` layers followed by a linear read-out.

    Args:
        input_size: feature size of the first layer's input.
        hidden_size: hidden size shared by all three LSTM layers.
        output_size: number of outputs of the final linear layer.
        dropout, momentum, eps: forwarded unchanged to every LSTM layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0, momentum=0.1, eps=1e-05):
        super().__init__()
        self.lstm0 = BNPeepholeLSTMTorch(input_size, hidden_size, dropout, momentum, eps)
        self.lstm1 = BNPeepholeLSTMTorch(hidden_size, hidden_size, dropout, momentum, eps)
        self.lstm2 = BNPeepholeLSTMTorch(hidden_size, hidden_size, dropout, momentum, eps)
        self.fc = nn.Linear(hidden_size, output_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

    def forward(self, x, states):
        """Run the stack.

        Args:
            x: input batch, passed to the first LSTM layer.
            states: pair ``(h, c)``, each indexed by layer along dimension 0; that dimension
                must have exactly three entries (one per layer).

        Returns:
            ``(y, (h, c))`` where ``y`` is the linear layer applied to the last LSTM layer's
            outputs and ``h``/``c`` stack the per-layer final states along a new leading
            dimension, mirroring the layout of ``states``.
        """
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        x, hc1 = self.lstm0(x, hc1)
        x, hc2 = self.lstm1(x, hc2)
        x, hc3 = self.lstm2(x, hc3)
        x = self.fc(x)
        # stack (not cat): cat would merge the layer and batch dimensions, so the result could
        # not be passed back in as `states` on the next call.
        new_states = (torch.stack((hc1[0], hc2[0], hc3[0])), torch.stack((hc1[1], hc2[1], hc3[1])))
        return x, new_states

########################################################################################################################
    
class BNPeepholeCPP(nn.Module):
    """Three stacked ``BNPeepholeLSTMCPP`` layers followed by a linear read-out.

    Args:
        input_size: feature size of the first layer's input.
        hidden_size: hidden size shared by all three LSTM layers.
        output_size: number of outputs of the final linear layer.
        dropout, momentum, eps: forwarded unchanged to every LSTM layer.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout=0, momentum=0.1, eps=1e-05):
        super().__init__()
        self.lstm0 = BNPeepholeLSTMCPP(input_size, hidden_size, dropout, momentum, eps)
        self.lstm1 = BNPeepholeLSTMCPP(hidden_size, hidden_size, dropout, momentum, eps)
        self.lstm2 = BNPeepholeLSTMCPP(hidden_size, hidden_size, dropout, momentum, eps)
        self.fc = nn.Linear(hidden_size, output_size)

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

    def forward(self, x, states):
        """Run the stack.

        Args:
            x: input batch, passed to the first LSTM layer.
            states: pair ``(h, c)``, each indexed by layer along dimension 0; that dimension
                must have exactly three entries (one per layer).

        Returns:
            ``(y, (h, c))`` where ``y`` is the linear layer applied to the last LSTM layer's
            outputs and ``h``/``c`` stack the per-layer final states along a new leading
            dimension.
        """
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        x, hc1 = self.lstm0(x, hc1)
        x, hc2 = self.lstm1(x, hc2)
        x, hc3 = self.lstm2(x, hc3)
        x = self.fc(x)
        # stack (not cat) so the result has the same per-layer layout as `states` and can be fed
        # back in. Assumes each layer returns states shaped like the ones it was given --
        # TODO confirm against the C++ extension.
        new_states = (torch.stack((hc1[0], hc2[0], hc3[0])), torch.stack((hc1[1], hc2[1], hc3[1])))
        return x, new_states

### Instantiation
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [9]:
# Benchmark on the GPU when one is present; otherwise fall back to the CPU instead of crashing
# on the first `.to(device)`. Set this to 'cpu' by hand to force a CPU run.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters shared by both implementations so their parameters have identical shapes.
input_size = 5
hidden_size = 20
output_size = 10
dropout = 0.
momentum = 0.1
eps = 1e-05

model_torch = BNPeepholeTorch(input_size, hidden_size, output_size, dropout, momentum, eps)
model_cpp = BNPeepholeCPP(input_size, hidden_size, output_size, dropout, momentum, eps)

model_torch.to(device)
model_cpp.to(device)

# Order matters to the later cells: index 0 is the reference (pure PyTorch) model.
models = (model_torch, model_cpp)

### Parameter Synchronization
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [10]:
# Make every model use the reference model's parameter values so outputs and gradients can be
# compared one-to-one. Index 0 (model_torch) is the reference.
named_parameter_dicts = [
    dict(model_torch.named_parameters()),
    dict(model_cpp.named_parameters())
]

reference_params = named_parameter_dicts[0]
all_param_names = set.union(*(set(npd.keys()) for npd in named_parameter_dicts))
common_param_names = set.intersection(*(set(npd.keys()) for npd in named_parameter_dicts))

print("Synchronized Parameters:\n")
# sorted() keeps the printed list stable between runs (set iteration order is arbitrary).
for common_param_name in sorted(common_param_names):
    print("\t{}".format(common_param_name))
    reference = reference_params[common_param_name]
    for i in range(1, len(named_parameter_dicts)):
        candidate = named_parameter_dicts[i][common_param_name]
        if candidate.size() != reference.size():
            # The model index is passed positionally: a "{i}" field would need a keyword
            # argument and would otherwise raise KeyError instead of this RuntimeError.
            raise RuntimeError("Size mismatch\n0:{}\n{}:{}".format(reference.size(), i, candidate.size()))
        # Rebinding .data makes both parameters refer to the same tensor, so the models share
        # these values from here on (no copy is made).
        candidate.data = reference.data
print()
print("Exclusive Parameters (Not Synchronized):\n")
for exclusive_param_name in sorted(all_param_names - common_param_names):
    print("\t{}".format(exclusive_param_name))

Synchronized Parameters:

	lstm2.weight_ch
	lstm1.weight_ih
	lstm0.gamma_ch
	lstm0.bias
	fc.bias
	lstm0.gamma_ih
	lstm0.weight_ch
	lstm0.gamma_hh
	lstm2.gamma_tanh_cell
	lstm2.weight_ih
	lstm1.gamma_ih
	lstm1.beta_tanh_cell
	lstm0.beta_tanh_cell
	fc.weight
	lstm2.bias
	lstm2.gamma_ih
	lstm2.weight_hh
	lstm1.weight_ch
	lstm0.weight_ih
	lstm1.gamma_hh
	lstm1.gamma_ch
	lstm1.gamma_tanh_cell
	lstm2.beta_tanh_cell
	lstm0.gamma_tanh_cell
	lstm2.gamma_ch
	lstm1.weight_hh
	lstm2.gamma_hh
	lstm0.weight_hh
	lstm1.bias

Exclusive Parameters (Not Synchronized):



---

## Creating a fake dataset
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [11]:
def create_fake_loader(dataset_size, sequence_length, batch_size, drop_last=True):
    """Return a DataLoader over random inputs and random integer class targets.

    Inputs are standard-normal with shape (dataset_size, sequence_length, input_size); targets
    are int64 values in [0, output_size) with shape (dataset_size, sequence_length).
    ``input_size`` and ``output_size`` are read from the notebook's globals.

    Args:
        dataset_size: number of sequences to generate.
        sequence_length: number of time steps per sequence.
        batch_size: batch size of the returned loader.
        drop_last: whether the loader drops a trailing incomplete batch.
    """
    per_step_shape = (dataset_size, sequence_length)

    features = torch.randn(*per_step_shape, input_size)
    labels = torch.randint(high=output_size, size=per_step_shape, dtype=torch.int64)

    return DataLoader(TensorDataset(features, labels), batch_size=batch_size, drop_last=drop_last)

In [12]:
# Size of the synthetic dataset used by the sanity checks.
dataset_size, sequence_length, batch_size = 1000, 20, 32

fake_loader = create_fake_loader(dataset_size, sequence_length, batch_size)

# Show the shapes of one batch: inputs first, then targets.
first_inputs, first_targets = next(iter(fake_loader))
print(first_inputs.size(), first_targets.size())

torch.Size([32, 20, 5]) torch.Size([32, 20])


---

## Sanity check: output comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

### Forward Outputs
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [13]:
# Zero initial (h, c) states: one slice per LSTM layer along dimension 0.
hidden = tuple(torch.zeros(3, batch_size, hidden_size, device=device) for _ in range(2))

first_batch = next(iter(fake_loader))
inputs = first_batch[0].to(device)

# Training mode, so the batch-norm running statistics are updated by the forward pass below.
# Use model.eval() here instead to compare inference-mode outputs.
for model in models:
    model.train()

# Print a slice of each model's output and of the first layer's running statistics so the two
# implementations can be compared by eye.
with torch.no_grad():
    for position, (label, net) in enumerate((("[model_torch]", model_torch), ("[model_cpp]", model_cpp))):
        if position:
            print("\n")
        print(label)
        print("\n{partial output}")
        print(net(inputs, hidden)[0][:4, -2:, :7])
        print("\n{partial running stat}")
        print(net.lstm0.running_mean_ih[:16])
        print(net.lstm0.running_var_ih[:16])

[model_torch]

{partial output}
tensor([[[-0.4186,  0.1219,  0.1689,  0.4250, -0.2592,  0.1225,  0.1900],
         [-0.4184,  0.0997,  0.1880,  0.4353, -0.2315,  0.0688,  0.1375]],

        [[-0.1806, -0.0894, -0.0936,  0.3823, -0.0102,  0.0758,  0.0004],
         [-0.1453, -0.1031, -0.0836,  0.3456, -0.0135,  0.1084,  0.0478]],

        [[-0.1865, -0.2088,  0.1236,  0.0781,  0.2323, -0.1874, -0.2513],
         [-0.1975, -0.2112,  0.1484,  0.1027,  0.2236, -0.2123, -0.2658]],

        [[-0.4544,  0.1695, -0.4159,  0.3630, -0.0180,  0.0625, -0.0914],
         [-0.4745,  0.1880, -0.4048,  0.3721, -0.0601,  0.1053, -0.0624]]],
       device='cuda:0')

{partial running stat}
tensor([-0.0028, -0.0017,  0.0017, -0.0072, -0.0075, -0.0043, -0.0027, -0.0031,
         0.0086, -0.0080, -0.0092,  0.0068,  0.0033,  0.0023,  0.0023, -0.0087],
       device='cuda:0')
tensor([0.1430, 0.1473, 0.1475, 0.1458, 0.1405, 0.1657, 0.1702, 0.1516, 0.1603,
        0.1660, 0.1695, 0.1576, 0.1355, 0.1605, 0.1540,

### Backward Gradients
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [15]:
criterion = nn.CrossEntropyLoss()

# Zero initial (h, c) states: one slice per LSTM layer along dimension 0.
hidden = (torch.zeros(3, batch_size, hidden_size, device=device), torch.zeros(3, batch_size, hidden_size, device=device))

inputs, targets = next(iter(fake_loader))
inputs = inputs.to(device)
targets = targets.to(device)

# Back-propagate the same batch through both models. Evaluation mode is used here; switch to
# model.train() to compare training-mode gradients instead.
for model in models:
#     model.train()
    model.eval()
    model.zero_grad()
    # Flatten (batch, seq) into one dimension so every time step is one classification sample.
    criterion(model(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

print("model_torch")
print(model_torch.lstm2.weight_ih.grad[:4, :7])
print(model_torch.lstm2.bias.grad[:25])
print("\n")
print("model_cpp")
print(model_cpp.lstm2.weight_ih.grad[:4, :7])
# Show the C++ model's own bias gradient (this line used to print model_torch's again).
print(model_cpp.lstm2.bias.grad[:25])
print("\n")
# Sum of absolute differences between the two weight gradients; ~0 means they agree.
print(model_torch.lstm2.weight_ih.grad.sub(model_cpp.lstm2.weight_ih.grad).abs().sum())

model_torch
tensor([[ 4.3145e-07,  8.7233e-07, -9.5854e-07,  1.2553e-06, -7.9569e-08,
         -1.2469e-06,  3.8178e-07],
        [ 2.1202e-06, -4.4514e-07, -2.2069e-05,  4.2641e-05, -1.2819e-06,
         -1.5372e-06,  1.7842e-05],
        [ 9.7796e-08, -2.2805e-07, -6.9273e-07,  1.7257e-06, -3.7213e-08,
          9.2676e-08,  5.5854e-07],
        [ 6.5841e-08,  1.0308e-07, -1.3049e-06,  1.0900e-06, -5.3428e-08,
          1.7077e-07,  8.2855e-07]], device='cuda:0')
tensor([-8.8332e-06, -1.3035e-04, -4.3655e-05, -1.2249e-05,  4.2416e-04,
        -5.5785e-04,  3.2699e-05, -1.7135e-04, -1.3789e-05,  2.1965e-04,
         1.3887e-06,  5.9513e-04, -3.4441e-06, -1.5183e-04,  1.0374e-04,
        -2.6997e-05, -1.7569e-04,  4.8382e-05,  1.6407e-05, -2.1368e-05,
        -3.4967e-05, -6.4555e-05, -4.1796e-05, -1.2116e-05,  3.4734e-04],
       device='cuda:0')


model_cpp
tensor([[ 4.3145e-07,  8.7233e-07, -9.5854e-07,  1.2553e-06, -7.9569e-08,
         -1.2469e-06,  3.8178e-07],
        [ 2.1202e-

---

## Forward time comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [20]:
# Workload for the timing cells below.
dataset_size, sequence_length, batch_size = 1000, 20, 32

fake_loader = create_fake_loader(dataset_size, sequence_length, batch_size, drop_last=True)

# Zero initial (h, c) states, reused unchanged for every batch of the benchmark.
hidden = tuple(torch.zeros(3, batch_size, hidden_size, device=device) for _ in range(2))

In [21]:
%%timeit -n 1 -r 10
with torch.no_grad():
    for inputs, _ in fake_loader:
        inputs = inputs.to(device)
        model_torch(inputs, hidden)

3.02 s ± 107 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [22]:
%%timeit -n 1 -r 10
with torch.no_grad():
    for inputs, _ in fake_loader:
        inputs = inputs.to(device)
        model_cpp(inputs, hidden)

4.85 s ± 107 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


---

## +Backward time comparison
[go to top](#Peephole-LSTM-Test-&-Performance-Comparison-(Speed-&-Memory))

In [23]:
# Loss used by the forward+backward timing cells below.
criterion = nn.CrossEntropyLoss()

In [24]:
%%timeit -r 10
for inputs, targets in fake_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    model_torch.zero_grad()
    criterion(model_torch(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

9.31 s ± 103 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


In [25]:
%%timeit -r 10
for inputs, targets in fake_loader:
    inputs = inputs.to(device)
    targets = targets.to(device)
    model_cpp.zero_grad()
    criterion(model_cpp(inputs, hidden)[0].flatten(0, 1), targets.flatten(0, 1)).backward()

8.46 s ± 98.2 ms per loop (mean ± std. dev. of 10 runs, 1 loop each)


---