# Peephole LSTM Test & Performance Comparison (Speed & Memory)

* [Imports](#Importing-necessary-modules)
* [Load & Definition](#Loading-and-defining-modules)
    * [Autograd Functions](#Autograd-Functions)
    * [Module Classes](#Module-classes-(C++,-CUDA,-PyTorch))
* [Models](#Defining-models)
    * [Definition](#Definition)
    * [Instantiation](#Instantiation)
    * [Parameter Synchronization](#Parameter-Synchronization)
* [Fake Dataset](#Creating-a-fake-dataset)
* [Sanity Check](#Sanity-check:-output-comparison)
* [Forward Performance](#Forward-time-comparison)
* [+Backward Performance](#+Backward-time-comparison)

---

## Importing necessary modules

In [1]:
import torch
from torch import nn
from torch.utils.cpp_extension import load
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

import math
from collections import OrderedDict
from time import sleep

torch.__version__

'0.4.1'

---

## Loading and defining modules

### Autograd Functions

In [2]:
_peephole_lstm_cell = load('peephole_lstm_cell', ['./peephole_lstm_cell.cpp'])
_peephole_lstm_cpp = load('peephole_lstm', ['./peephole_lstm.cpp'])
_peephole_lstm_cuda = load('peephole_lstm_cuda', ['./peephole_lstm_cuda.cpp', './peephole_lstm_cuda_kernal.cu'])

########################################################################################################################

class PeepholeLSTMCellFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell):
        outputs = _peephole_lstm_cell.forward(input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = [old_cell] + outputs[2:] + [weight_ih, weight_hh, weight_ch]
        ctx.save_for_backward(*variables)

        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = _peephole_lstm_cell.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_old_cell, d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias = outputs
        return d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias, d_old_h, d_old_cell
    
########################################################################################################################
    
class PeepholeLSTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell, dropout_p, training):
        outputs = _peephole_lstm_cpp.forward(input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell, dropout_p, training)
        out, new_h, new_cell = outputs[:3]
        variables = outputs[3:] + [weight_ih, weight_hh, weight_ch]
        ctx.save_for_backward(*variables)

        return out, new_h, new_cell

    @staticmethod
    def backward(ctx, grad_output, grad_h, grad_cell):
        outputs = _peephole_lstm_cpp.backward(
            grad_output.contiguous(), grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_old_cell, d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias = outputs
        return d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias, d_old_h, d_old_cell, None, None
    
########################################################################################################################
    
class PeepholeLSTMCUDAFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell, dropout_p, training):
        outputs = _peephole_lstm_cuda.forward(input, weight_ih, weight_hh, weight_ch, bias, old_h, old_cell, dropout_p, training)
        out, new_h, new_cell = outputs[:3]
        variables = outputs[3:] + [weight_ih, weight_hh, weight_ch]
        ctx.save_for_backward(*variables)

        return out, new_h, new_cell

    @staticmethod
    def backward(ctx, grad_output, grad_h, grad_cell):
        outputs = _peephole_lstm_cuda.backward(
            grad_output.contiguous(), grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_old_cell, d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias = outputs
        return d_input, d_weight_ih, d_weight_hh, d_weight_ch, d_bias, d_old_h, d_old_cell, None, None



!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Your compiler (c++) may be ABI-incompatible with PyTorch!
Please use a compiler that is ABI-compatible with GCC 4.9 and above.
See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html.

See https://gist.github.com/goldsborough/d466f43e8ffc948ff92de7486c5216d6
for instructions on how to install GCC 4.9 or higher.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!




### Module classes (C++, CUDA, PyTorch)

In [4]:
class PeepholeLSTMCellTorch(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PeepholeLSTMCellTorch, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weight_ih = torch.nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh = torch.nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.weight_ch = torch.nn.Parameter(torch.empty(3 * hidden_size, hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(4 * hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-stdv, +stdv)

    def forward(self, input, states):
        gates = torch.addmm(self.bias, input, self.weight_ih.t())
        gates = gates + torch.mm(states[0], self.weight_hh.t())
        gates[:, :3 * self.hidden_size] = gates[:, :3 * self.hidden_size] + torch.mm(states[1], self.weight_ch.t())
        gates = torch.cat((gates[:, :3*self.hidden_size].sigmoid(), gates[:, 3*self.hidden_size:].tanh()), dim=1).chunk(4, dim=1)
    
        new_cell = ( states[1] * gates[0] ) + ( gates[1] * gates[3] )
        new_hidden = gates[2] * new_cell.tanh()
    
        return new_hidden, new_cell
    
    def __repr__(self):
        return f"PeepholeLSTMCellTorch(input_size={self.input_size}, hidden_size={self.hidden_size})"

########################################################################################################################

class PeepholeLSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(PeepholeLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.weight_ih = torch.nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh = torch.nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.weight_ch = torch.nn.Parameter(torch.empty(3 * hidden_size, hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(4 * hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, states):
        return PeepholeLSTMCellFunction.apply(input, self.weight_ih, self.weight_hh, self.weight_ch, self.bias, *states)
    
    def __repr__(self):
        return f"PeepholeLSTMCell(input_size={self.input_size}, hidden_size={self.hidden_size})"

########################################################################################################################
    
class PeepholeLSTMTorch(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.):
        if not 0 <= dropout <= 1:
            raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
        super(PeepholeLSTMTorch, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.weight_ih = torch.nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh = torch.nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.weight_ch = torch.nn.Parameter(torch.empty(3 * hidden_size, hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(4 * hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        for param in self.parameters():
            param.data.uniform_(-stdv, +stdv)

    def forward(self, input, states):
        assert input.dim() == 3
        outputs = input.new_empty((input.size(0), input.size(1), self.hidden_size))
        
        h = states[0].clone()
        c = states[1].clone()
        
        weight_ih = self.weight_ih.t()
        weight_hh = self.weight_hh.t()
        weight_ch = self.weight_ch.t()
        
        ih = torch.matmul(input.transpose(0, 1), weight_ih)
        
        for i in range(input.size(1)):
            h = F.dropout(h, p=self.dropout, training=self.training)
            
            gates = ih[i] + torch.addmm(self.bias, h, weight_hh)
            gates[:, :3 * self.hidden_size] += torch.mm(c, weight_ch)
            
            gates = torch.cat((gates[:, :3 * self.hidden_size].sigmoid(), gates[:, 3 * self.hidden_size:].tanh()), dim=1).chunk(chunks=4, dim=1)
            
            c = torch.addcmul(gates[1] * gates[3], c, gates[0])
            h = gates[2] * c.tanh()
            
            outputs[:, i] = h
        
        outputs = F.dropout(outputs, p=self.dropout, training=self.training)
    
        return outputs, (h, c)
    
    def __repr__(self):
        return f"PeepholeLSTMTorch(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout})"
    
########################################################################################################################
    
class PeepholeLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.):
        if not 0 <= dropout <= 1:
            raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
        super(PeepholeLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = float(dropout)
        self.weight_ih = torch.nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh = torch.nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.weight_ch = torch.nn.Parameter(torch.empty(3 * hidden_size, hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(4 * hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, states):
        dropout = self.dropout if self.training else 0.
        input = input.transpose(0, 1).contiguous()
        output, new_h, new_cell = PeepholeLSTMFunction.apply(input, self.weight_ih, self.weight_hh, self.weight_ch,
                                                             self.bias, *states, dropout, self.training)
        return output.transpose(0, 1).contiguous(), (new_h, new_cell)
    
    def __repr__(self):
        return f"PeepholeLSTM(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout})"
    
########################################################################################################################
    
class PeepholeLSTMCUDA(nn.Module):
    def __init__(self, input_size, hidden_size, dropout=0.):
        if not 0 <= dropout <= 1:
            raise ValueError(f"Invalid dropout value : {dropout} dropout must be in range [0, 1].")
        super(PeepholeLSTMCUDA, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.dropout = float(dropout)
        self.weight_ih = torch.nn.Parameter(torch.empty(4 * hidden_size, input_size))
        self.weight_hh = torch.nn.Parameter(torch.empty(4 * hidden_size, hidden_size))
        self.weight_ch = torch.nn.Parameter(torch.empty(3 * hidden_size, hidden_size))
        self.bias = torch.nn.Parameter(torch.empty(4 * hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.input_size + 2 * self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, states):
        dropout = self.dropout if self.training else 0.
        input = input.transpose(0, 1).contiguous()
        output, new_h, new_cell = PeepholeLSTMCUDAFunction.apply(input, self.weight_ih, self.weight_hh, self.weight_ch,
                                                                 self.bias, *states, dropout, self.training)
        return output.transpose(0, 1).contiguous(), (new_h, new_cell)
    
    def __repr__(self):
        return f"PeepholeLSTMCUDA(input_size={self.input_size}, hidden_size={self.hidden_size}, dropout={self.dropout})"

---

## Defining models

### Definition

In [6]:
class PeepholeTorch(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm0 = PeepholeLSTMCellTorch(input_size, hidden_size)
        self.lstm1 = PeepholeLSTMCellTorch(hidden_size, hidden_size)
        self.lstm2 = PeepholeLSTMCellTorch(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
    def forward(self, x, states):
        output = x.new_empty(x.size(0), x.size(1), self.output_size)
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        for i, seq_batch in enumerate(x.transpose(0, 1)):
            hc1 = self.lstm0(seq_batch, hc1)
            hc2 = self.lstm1(F.dropout(hc1[0], p=0, training=self.training), hc2)
            hc3 = self.lstm2(F.dropout(hc2[0], p=0, training=self.training), hc3)
            output[:, i, :] = self.fc(F.dropout(hc3[0], p=0, training=self.training))
        new_states = (torch.cat((hc1[0], hc2[0], hc3[0])), torch.cat((hc1[1], hc2[1], hc3[1])))
        return output, new_states

########################################################################################################################
    
class PeepholeCPP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm0 = PeepholeLSTMCell(input_size, hidden_size)
        self.lstm1 = PeepholeLSTMCell(hidden_size, hidden_size)
        self.lstm2 = PeepholeLSTMCell(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
    def forward(self, x, states):
        output = x.new_empty(x.size(0), x.size(1), self.output_size)
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        for i, seq_batch in enumerate(x.transpose(0, 1)):
            hc1 = self.lstm0(seq_batch, hc1)
            hc2 = self.lstm1(F.dropout(hc1[0], p=0, training=self.training), hc2)
            hc3 = self.lstm2(F.dropout(hc2[0], p=0, training=self.training), hc3)
            output[:, i, :] = self.fc(F.dropout(hc3[0], p=0, training=self.training))
        new_states = (torch.cat((hc1[0], hc2[0], hc3[0])), torch.cat((hc1[1], hc2[1], hc3[1])))
        return output, new_states
    
########################################################################################################################

class PeepholeLoopTorch(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout):
        super().__init__()
        self.lstm0 = PeepholeLSTMTorch(input_size, hidden_size, dropout)
        self.lstm1 = PeepholeLSTMTorch(hidden_size, hidden_size, dropout)
        self.lstm2 = PeepholeLSTMTorch(hidden_size, hidden_size, dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        
    def forward(self, x, states):
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        x, hc1 = self.lstm0(x, hc1)
        print(x)
        x, hc2 = self.lstm1(x, hc2)
        print(x)
        x, hc3 = self.lstm2(x, hc3)
        print(x)
        x = self.fc(x)
        new_states = (torch.cat((hc1[0], hc2[0], hc3[0])), torch.cat((hc1[1], hc2[1], hc3[1])))
        return x, new_states
    
########################################################################################################################

class PeepholeLoopCPP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout):
        super().__init__()
        self.lstm0 = PeepholeLSTM(input_size, hidden_size, dropout)
        self.lstm1 = PeepholeLSTM(hidden_size, hidden_size, dropout)
        self.lstm2 = PeepholeLSTM(hidden_size, hidden_size, dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        
    def forward(self, x, states):
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        x, hc1 = self.lstm0(x, hc1)
        print(x)
        x, hc2 = self.lstm1(x, hc2)
        print(x)
        x, hc3 = self.lstm2(x, hc3)
        print(x)
        x = self.fc(x)
        new_states = (torch.cat((hc1[0], hc2[0], hc3[0])), torch.cat((hc1[1], hc2[1], hc3[1])))
        return x, new_states

########################################################################################################################

class PeepholeLoopCUDA(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout):
        super().__init__()
        self.lstm0 = PeepholeLSTMCUDA(input_size, hidden_size, dropout)
        self.lstm1 = PeepholeLSTMCUDA(hidden_size, hidden_size, dropout)
        self.lstm2 = PeepholeLSTMCUDA(hidden_size, hidden_size, dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout = dropout
        
    def forward(self, x, states):
        hc1, hc2, hc3 = ((states[0][i], states[1][i]) for i in range(states[0].size(0)))
        x, hc1 = self.lstm0(x, hc1)
        x, hc2 = self.lstm1(x, hc2)
        x, hc3 = self.lstm2(x, hc3)
        x = self.fc(x)
        new_states = (torch.cat((hc1[0], hc2[0], hc3[0])), torch.cat((hc1[1], hc2[1], hc3[1])))
        return x, new_states

### Instantiation

In [7]:
device = ('cpu', 'cuda')[1]

input_size = 3
hidden_size = 5
output_size = 2
dropout = 0.

model_cell_torch = PeepholeTorch(input_size, hidden_size, output_size)
model_cell_cpp = PeepholeCPP(input_size, hidden_size, output_size)

model_loop_torch = PeepholeLoopTorch(input_size, hidden_size, output_size, dropout)
model_loop_cpp = PeepholeLoopCPP(input_size, hidden_size, output_size, dropout)
model_loop_cuda = PeepholeLoopCUDA(input_size, hidden_size, output_size, dropout)

model_cell_torch.to(device)
model_cell_cpp.to(device)
model_loop_torch.to(device)
model_loop_cpp.to(device)
model_loop_cuda.to(device)

models = (model_cell_torch, model_cell_cpp, model_loop_torch, model_loop_cpp, model_loop_cuda)

### Parameter Synchronization

In [8]:
named_parameter_dicts = [
    dict(model_cell_torch.named_parameters()),
    dict(model_cell_cpp.named_parameters()),
    dict(model_loop_torch.named_parameters()),
    dict(model_loop_cpp.named_parameters()),
    dict(model_loop_cuda.named_parameters()),
]

print("Synchronized Parameters:\n")
for common_param_name in set.intersection(*(set(npd.keys()) for npd in named_parameter_dicts)):
    print("\t{}".format(common_param_name))
    for i in range(1, len(named_parameter_dicts)):
        if named_parameter_dicts[i][common_param_name].size() == named_parameter_dicts[0][common_param_name].size():
            named_parameter_dicts[i][common_param_name].data = named_parameter_dicts[0][common_param_name].data
        else:
            raise RuntimeError("Size mismatch\n0:{}\n{i}:{}".format(named_parameter_dicts[0][common_param_name].size(),
                                                                    named_parameter_dicts[i][common_param_name].size()))
print()
print("Exclusive Parameters (Not Synchronized):\n")
for exclusive_param_name in set.union(*(set(npd.keys()) for npd in named_parameter_dicts)) - set.intersection(*(set(npd.keys()) for npd in named_parameter_dicts)):
    print("\t{}".format(exclusive_param_name))

Synchronized Parameters:

	lstm2.bias
	lstm1.weight_ih
	lstm0.weight_hh
	fc.bias
	lstm0.bias
	lstm2.weight_ch
	lstm1.weight_ch
	lstm2.weight_hh
	fc.weight
	lstm0.weight_ch
	lstm2.weight_ih
	lstm1.bias
	lstm1.weight_hh
	lstm0.weight_ih

Exclusive Parameters (Not Synchronized):



---

## Creating a fake dataset

In [9]:
dataset_size = 1000
sequence_length = 2

batch_size = 2

fake_inputs = torch.randn(dataset_size, sequence_length, input_size)
fake_targets = torch.randint(high=output_size, size=(dataset_size, sequence_length), dtype=torch.int64)

fake_dataset = TensorDataset(fake_inputs, fake_targets)

fake_loader = DataLoader(fake_dataset, batch_size=batch_size)

print(next(iter(fake_loader))[0].size(), next(iter(fake_loader))[1].size())

torch.Size([2, 2, 3]) torch.Size([2, 2])


---

## Sanity check: output comparison

In [10]:
hidden = (torch.zeros(3, batch_size, hidden_size, device=device), torch.zeros(3, batch_size, hidden_size, device=device))

inputs, targets = next(iter(fake_loader))
inputs = inputs.to(device)
targets = targets.to(device)

# Output or Hidden
i = {"output": 0, "hidden": 1}["output"]

for model in models:
    model.train()
#     model.eval()

with torch.no_grad():
#     print("model_cell_torch")
#     print(model_cell_torch(inputs, hidden)[i])
#     print("\n")
#     print("model_cell_cpp")
#     print(model_cell_cpp(inputs, hidden)[i])
#     print("\n")
    print("model_loop_torch")
    print(model_loop_torch(inputs, hidden)[i])
    print("\n")
    print("model_loop_cpp")
    print(model_loop_cpp(inputs, hidden)[i])
    print("\n")
#     print("model_loop_cuda")
#     print(model_loop_cuda(inputs, hidden)[i])
#     print("\n")

model_loop_torch
tensor([[[ 0.0227,  0.0768, -0.1173, -0.0336, -0.0224],
         [-0.0090,  0.0725, -0.1446, -0.0626, -0.0410]],

        [[ 0.0840, -0.0619, -0.0597,  0.1108, -0.0773],
         [ 0.0905,  0.0711, -0.1619,  0.0355, -0.1084]]], device='cuda:0')
tensor([[[ 0.0283,  0.0022,  0.0476,  0.0344, -0.0383],
         [ 0.0462, -0.0004,  0.0721,  0.0518, -0.0589]],

        [[ 0.0101,  0.0005,  0.0478,  0.0239, -0.0297],
         [ 0.0233, -0.0092,  0.0814,  0.0407, -0.0466]]], device='cuda:0')
tensor([[[-0.0061, -0.0423, -0.0019,  0.0481, -0.0220],
         [-0.0140, -0.0626, -0.0015,  0.0762, -0.0364]],

        [[-0.0064, -0.0419, -0.0012,  0.0490, -0.0224],
         [-0.0150, -0.0619, -0.0001,  0.0780, -0.0369]]], device='cuda:0')
tensor([[[ 0.4308, -0.1655],
         [ 0.4319, -0.1565]],

        [[ 0.4309, -0.1655],
         [ 0.4321, -0.1563]]], device='cuda:0')


model_loop_cpp
tensor([[[ 0.0355,  0.1086, -0.1633, -0.0517, -0.0333],
         [-0.0180,  0.0554, -0.1177, -

---

## Forward time comparison

---

## +Backward time comparison