## Unidirectional LSTM with batch_first=True

In [47]:
import torch
import torch.nn as nn

# Seed first so both the random input and the LSTMCell weights are reproducible.
torch.manual_seed(13)
lstm_input = torch.randn(2, 3, 10)  # (batch_size, sequence_length, input_dim)

# Read the sizes off the tensor instead of repeating them as magic numbers.
batch_size, sequence_length, input_dim = lstm_input.shape

# nn.LSTMCell consumes one timestep at a time, so move the time axis to the front.
reshaped_input = lstm_input.permute(1, 0, 2)  # (sequence_length, batch_size, input_dim)

# Define the LSTM cell
hidden_dim = 20  # Number of hidden units in the LSTM cell
lstm_cell = nn.LSTMCell(input_dim, hidden_dim)

# Initialize the hidden state and cell state with zeros
hx = torch.zeros(batch_size, hidden_dim)
cx = torch.zeros(batch_size, hidden_dim)

# Unroll the cell over time, keeping the hidden state produced at every timestep.
step_outputs = []
for step_input in reshaped_input:  # each slice is (batch_size, input_dim)
    hx, cx = lstm_cell(step_input, (hx, cx))
    step_outputs.append(hx)

# Stack into one tensor of shape (sequence_length, batch_size, hidden_dim);
# output[-1] holds the same values as the final hx.
output = torch.stack(step_outputs)

print(output)


tensor([[[-0.1735,  0.0038,  0.1194,  0.1309, -0.0380, -0.0148, -0.0819,
          -0.0277,  0.0496,  0.0536,  0.1000, -0.0668,  0.1009,  0.0773,
           0.0688, -0.0478,  0.1195, -0.0588,  0.0189, -0.0175],
         [-0.1130,  0.0776,  0.0244,  0.0391,  0.0204, -0.0470, -0.0093,
          -0.1353,  0.0512,  0.1718,  0.2299,  0.1731,  0.0400, -0.0017,
          -0.0147,  0.0133,  0.3188, -0.0916, -0.0131, -0.0774]],

        [[ 0.0812,  0.0455,  0.0047,  0.1378,  0.0083,  0.0119, -0.1008,
           0.1060,  0.2122, -0.0591,  0.2257, -0.1863,  0.0897, -0.0348,
           0.0083,  0.1045,  0.1123, -0.0628,  0.0725,  0.0522],
         [ 0.1395,  0.1696, -0.1172, -0.0369,  0.0646,  0.1543,  0.0258,
           0.1252,  0.2361,  0.1940,  0.0922, -0.0354,  0.0168,  0.0974,
          -0.1142, -0.1370,  0.0984,  0.0826, -0.0957,  0.1414]],

        [[ 0.0641,  0.1497, -0.0913, -0.0074,  0.0141, -0.0551, -0.0394,
           0.1378,  0.0189,  0.0119,  0.0033, -0.0797,  0.0029, -0.1228,
      

In [52]:
hx  # hidden state after the last timestep; same values as output[-1]

tensor([[ 0.0641,  0.1497, -0.0913, -0.0074,  0.0141, -0.0551, -0.0394,  0.1378,
          0.0189,  0.0119,  0.0033, -0.0797,  0.0029, -0.1228, -0.1744, -0.0622,
          0.1366, -0.0503,  0.0833, -0.1936],
        [ 0.0835,  0.1460, -0.1632, -0.0009, -0.1080,  0.0853,  0.1047,  0.1883,
          0.0468,  0.2327,  0.1032, -0.0724,  0.0418,  0.1980, -0.1232, -0.1628,
          0.1094,  0.0797, -0.0712,  0.1553]], grad_fn=<MulBackward0>)

In [48]:
# Show the per-timestep slices that were fed to the LSTM cell. Iterating the
# tensor walks dim 0 (time), so the sequence length is not hard-coded.
for step_input in reshaped_input:
    print(step_input)

tensor([[ 0.4372,  0.3701,  1.5816, -0.1556,  0.1511, -1.3495, -0.7089, -0.2434,
         -0.0389,  1.0810],
        [ 0.2593,  1.8514,  1.3406,  1.7659,  0.5640, -0.6749,  0.0914,  0.3948,
          1.5457, -0.3610]])
tensor([[ 0.9088,  0.0789, -0.0895,  0.1714, -0.1575,  1.9800,  0.7573, -0.4274,
         -1.5918, -0.0736],
        [-0.2723,  0.6279, -2.7544,  0.4208,  0.2516,  0.9675, -0.6870,  0.9042,
          0.3286, -0.0742]])
tensor([[-2.5141,  0.1140,  0.9822,  0.0681, -0.0996,  0.8033,  1.0441, -0.5201,
          0.8059,  1.0867],
        [ 0.1414, -1.2538, -0.3456, -0.2211, -0.7043,  0.3368,  0.0064,  0.2326,
          0.9527, -0.4139]])


In [49]:
lstm_input  # original batch-first tensor: (batch=2, seq=3, features=10)

tensor([[[ 0.4372,  0.3701,  1.5816, -0.1556,  0.1511, -1.3495, -0.7089,
          -0.2434, -0.0389,  1.0810],
         [ 0.9088,  0.0789, -0.0895,  0.1714, -0.1575,  1.9800,  0.7573,
          -0.4274, -1.5918, -0.0736],
         [-2.5141,  0.1140,  0.9822,  0.0681, -0.0996,  0.8033,  1.0441,
          -0.5201,  0.8059,  1.0867]],

        [[ 0.2593,  1.8514,  1.3406,  1.7659,  0.5640, -0.6749,  0.0914,
           0.3948,  1.5457, -0.3610],
         [-0.2723,  0.6279, -2.7544,  0.4208,  0.2516,  0.9675, -0.6870,
           0.9042,  0.3286, -0.0742],
         [ 0.1414, -1.2538, -0.3456, -0.2211, -0.7043,  0.3368,  0.0064,
           0.2326,  0.9527, -0.4139]]])

In [50]:
reshaped_input  # lstm_input after permute(1, 0, 2): (seq=3, batch=2, features=10)

tensor([[[ 0.4372,  0.3701,  1.5816, -0.1556,  0.1511, -1.3495, -0.7089,
          -0.2434, -0.0389,  1.0810],
         [ 0.2593,  1.8514,  1.3406,  1.7659,  0.5640, -0.6749,  0.0914,
           0.3948,  1.5457, -0.3610]],

        [[ 0.9088,  0.0789, -0.0895,  0.1714, -0.1575,  1.9800,  0.7573,
          -0.4274, -1.5918, -0.0736],
         [-0.2723,  0.6279, -2.7544,  0.4208,  0.2516,  0.9675, -0.6870,
           0.9042,  0.3286, -0.0742]],

        [[-2.5141,  0.1140,  0.9822,  0.0681, -0.0996,  0.8033,  1.0441,
          -0.5201,  0.8059,  1.0867],
         [ 0.1414, -1.2538, -0.3456, -0.2211, -0.7043,  0.3368,  0.0064,
           0.2326,  0.9527, -0.4139]]])

In [99]:
# Single-layer, unidirectional LSTM: 4 input features -> 10 hidden units.
# batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
lstm = nn.LSTM(4, 10, num_layers=1, bidirectional=False, batch_first=True)

In [100]:
lstm  # the repr below does not list num_layers or bidirectional

LSTM(4, 10, batch_first=True)

In [101]:
# One batch holding 2 sequences of 3 tokens; each token is a 4-dim vector.
# Layout is (batch, seq, feature).
x = torch.randn(2, 3, 4)
x

tensor([[[ 0.0169, -0.1896,  0.1056,  1.3294],
         [ 1.7862, -0.3860, -1.0516,  1.4761],
         [ 0.7651, -0.0073,  0.7714, -1.4906]],

        [[ 0.4931, -0.5488,  0.2820, -0.2617],
         [ 1.5349,  0.2515, -0.1117,  0.5740],
         [ 1.6791,  1.6109, -0.5240, -0.4526]]])

In [102]:
# Run the batch through the LSTM. This rebinds output, hx and cx:
# output holds a hidden state per timestep, hx / cx are the final states.
output, (hx, cx) = lstm(x)

In [121]:
output 
# the batch holds 2 sequences (batch size is 2)
# each sequence has 3 tokens
# for every token, output stores the hidden state produced at that timestep
# with an LSTM we usually keep only the last hidden state of each sequence
# since the batch holds two sequences, that gives two final hidden states


tensor([[[ 0.1855, -0.1394, -0.0590,  0.0404,  0.1128,  0.1281,  0.0590,
          -0.0274, -0.0022, -0.0581],
         [ 0.1200, -0.3064, -0.0430,  0.2128,  0.1870,  0.1861,  0.0894,
          -0.0406,  0.0517,  0.0938],
         [ 0.1611, -0.1037, -0.0089,  0.2342,  0.0320,  0.1737, -0.0627,
           0.0546,  0.1800,  0.1769]],

        [[ 0.1492, -0.0735,  0.0058,  0.0727,  0.0539,  0.1093, -0.0485,
           0.0389,  0.0908,  0.0830],
         [ 0.1247, -0.1943, -0.0582,  0.2144,  0.0857,  0.1623, -0.0242,
           0.0601,  0.1523,  0.1715],
         [ 0.0302, -0.1695, -0.0821,  0.3260,  0.0545,  0.0489,  0.0143,
           0.0434,  0.2671,  0.2833]]], grad_fn=<TransposeBackward0>)

In [122]:
# Last-timestep hidden state of every sequence in the batch.
# Index the time axis with -1: batch_size (2) is not a timestep index and only
# happened to equal the last valid position of this 3-step sequence.
output[:, -1, :]

tensor([[ 0.1611, -0.1037, -0.0089,  0.2342,  0.0320,  0.1737, -0.0627,  0.0546,
          0.1800,  0.1769],
        [ 0.0302, -0.1695, -0.0821,  0.3260,  0.0545,  0.0489,  0.0143,  0.0434,
          0.2671,  0.2833]], grad_fn=<SliceBackward0>)

In [123]:
hx.shape  # (num_layers * num_directions, batch, hidden_size) = (1, 2, 10)

torch.Size([1, 2, 10])

In [124]:
output.shape  # (batch, seq, hidden_size) = (2, 3, 10) because batch_first=True

torch.Size([2, 3, 10])

In [125]:
# All timesteps of the last sequence in the batch: this indexes the batch
# axis (dim 0), not the time axis.
output[-1]

tensor([[ 0.1492, -0.0735,  0.0058,  0.0727,  0.0539,  0.1093, -0.0485,  0.0389,
          0.0908,  0.0830],
        [ 0.1247, -0.1943, -0.0582,  0.2144,  0.0857,  0.1623, -0.0242,  0.0601,
          0.1523,  0.1715],
        [ 0.0302, -0.1695, -0.0821,  0.3260,  0.0545,  0.0489,  0.0143,  0.0434,
          0.2671,  0.2833]], grad_fn=<SelectBackward0>)

In [126]:
hx  # final hidden state; its rows match the last-timestep rows of output shown above

tensor([[[ 0.1611, -0.1037, -0.0089,  0.2342,  0.0320,  0.1737, -0.0627,
           0.0546,  0.1800,  0.1769],
         [ 0.0302, -0.1695, -0.0821,  0.3260,  0.0545,  0.0489,  0.0143,
           0.0434,  0.2671,  0.2833]]], grad_fn=<StackBackward0>)

In [127]:
cx  # final cell state; same shape as hx

tensor([[[ 0.3917, -0.2877, -0.0293,  0.3249,  0.0995,  0.3470, -0.1808,
           0.0948,  0.3159,  0.3612],
         [ 0.1171, -0.3221, -0.1907,  0.6064,  0.1275,  0.1403,  0.0414,
           0.1372,  0.5179,  0.4662]]], grad_fn=<StackBackward0>)

## What if the LSTM is bidirectional?

In [141]:
# Single-layer bidirectional LSTM: 4 input features, 5 hidden units per direction.
# batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
bi_lstm = nn.LSTM(4, 5, num_layers=1, bidirectional=True, batch_first=True)

In [142]:
# One batch holding 2 sequences of 3 tokens; each token is a 4-dim vector.
x = torch.randn(2, 3, 4)

In [143]:
# Run the batch through the bidirectional LSTM; rebinds output, hx and cx.
output, (hx, cx) = bi_lstm(x)

In [144]:
output
# output contains the hidden states of both directions
# forward direction: a 5-dimensional hidden state at every timestep
# backward direction: a 5-dimensional hidden state at every timestep


# the batch size is 2 here
# each sequence's slice of output has shape (3, 10): 3 timesteps x (2 directions * 5)
# the first 5 values along the last dim are the forward-direction hidden state
# the last 5 values along the last dim are the backward-direction hidden state
# (the backward pass ends at timestep 0, so its final state is found there)


tensor([[[ 0.0728,  0.0608,  0.1383,  0.1270, -0.0046, -0.2140,  0.0967,
           0.2819,  0.0730, -0.2082],
         [ 0.1710, -0.1659,  0.1097,  0.1377, -0.1140, -0.2165, -0.0859,
           0.1174,  0.0380, -0.1644],
         [ 0.3054, -0.2081,  0.0070,  0.0931, -0.1055, -0.1350, -0.1162,
          -0.0984,  0.0561, -0.0354]],

        [[-0.0525, -0.0849,  0.0346,  0.0158, -0.0629, -0.3748,  0.0809,
           0.2623,  0.2009, -0.2269],
         [ 0.1740, -0.2876,  0.0826,  0.1080, -0.1008, -0.1487,  0.0726,
           0.1754,  0.1618, -0.1111],
         [-0.0571,  0.1055,  0.1906,  0.2258,  0.0537, -0.3654,  0.3338,
           0.3916,  0.0974,  0.1387]]], grad_fn=<TransposeBackward0>)

In [145]:
output.shape
# note that the last dimension is 10 here: 2 directions * hidden_size 5

torch.Size([2, 3, 10])

In [168]:
hx.shape  # (num_layers * num_directions, batch, hidden_size) = (1 * 2, 2, 5)

torch.Size([2, 2, 5])

## Stacking multiple LSTMs: the output of one LSTM is passed into the next

In [156]:
import torch
import torch.nn as nn

# Define custom model class
class StackedLSTM(nn.Module):
    """Chain of single-layer, batch-first LSTMs applied one after another.

    The first layer maps ``input_size`` features to ``hidden_size``; every
    following layer maps ``hidden_size`` to ``hidden_size``.

    Args:
        input_size: number of features per timestep of the input.
        hidden_size: number of hidden units in every layer.
        num_layers: how many LSTM layers to chain.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.lstm_layers = nn.ModuleList()
        self.num_layers = num_layers

        # Only the first layer sees the raw input width.
        layer_input_sizes = [
            input_size if depth == 0 else hidden_size for depth in range(num_layers)
        ]
        for layer_input_size in layer_input_sizes:
            self.lstm_layers.append(
                nn.LSTM(layer_input_size, hidden_size, batch_first=True)
            )

    def forward(self, x):
        """Feed ``x`` through each layer in order and return the last layer's output.

        Every layer's output sequence is printed so the intermediate
        activations can be inspected; the per-layer final states are discarded.
        """
        layer_output = x
        for layer in self.lstm_layers:
            layer_output, _ = layer(layer_output)
            print(layer_output)
        return layer_output

# Example usage: a 2-layer stack mapping 4 input features to 5 hidden units.
input_size, hidden_size, num_layers = 4, 5, 2

# Create the model before sampling data so the RNG is consumed in the same order.
model = StackedLSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

# Random input batch: 3 sequences of 7 timesteps each.
batch_size, sequence_length = 3, 7
input_data = torch.randn(batch_size, sequence_length, input_size)

# Forward pass; the model prints every layer's output as it runs.
output = model(input_data)

tensor([[[ 0.0910,  0.0985,  0.1572,  0.1337,  0.0287],
         [ 0.1093,  0.1988,  0.1030,  0.0624,  0.0398],
         [ 0.2978,  0.2197,  0.1769,  0.0870,  0.0433],
         [ 0.1856,  0.1130,  0.0361,  0.0512, -0.0525],
         [-0.0332, -0.1232, -0.0547,  0.0411,  0.0160],
         [ 0.0165,  0.0615, -0.0280,  0.0487,  0.0673],
         [-0.0366, -0.1633, -0.1418,  0.2091, -0.0403]],

        [[-0.0157, -0.0070,  0.0154,  0.1137,  0.0315],
         [-0.0470, -0.1371, -0.0197, -0.0110,  0.0973],
         [-0.0984, -0.1972, -0.2542,  0.0096,  0.0954],
         [-0.0386, -0.0604, -0.1443,  0.0551,  0.0697],
         [ 0.0350,  0.0932, -0.0109,  0.0044,  0.1797],
         [-0.0344,  0.2124,  0.0099, -0.0873,  0.1756],
         [ 0.0861,  0.2185,  0.0178, -0.0197,  0.1479]],

        [[ 0.1495,  0.1175,  0.1209,  0.1191, -0.0998],
         [ 0.2267,  0.1225,  0.1779,  0.2057, -0.1728],
         [ 0.0863,  0.1031,  0.0155,  0.0363, -0.0161],
         [ 0.1642,  0.0873,  0.0762,  0.0059

In [157]:
output 
# the batch holds 3 sequences
# each sequence has 7 vectors, one per token (sequence_length = 7)
# each such vector is 5-dimensional (hidden_size = 5)

tensor([[[0.2007, 0.0366, 0.0289, 0.0787, 0.1340],
         [0.2973, 0.0443, 0.0480, 0.1434, 0.1573],
         [0.3787, 0.0513, 0.0513, 0.1946, 0.1609],
         [0.4035, 0.0440, 0.0683, 0.2001, 0.1514],
         [0.3895, 0.0239, 0.0851, 0.1805, 0.1474],
         [0.3849, 0.0208, 0.0874, 0.1960, 0.1581],
         [0.3700, 0.0134, 0.0906, 0.1821, 0.1730]],

        [[0.1718, 0.0287, 0.0332, 0.0648, 0.1356],
         [0.2520, 0.0229, 0.0511, 0.0980, 0.1503],
         [0.2649, 0.0105, 0.0646, 0.1131, 0.1635],
         [0.2942, 0.0112, 0.0704, 0.1407, 0.1740],
         [0.3306, 0.0200, 0.0681, 0.1803, 0.1713],
         [0.3554, 0.0195, 0.0784, 0.2044, 0.1535],
         [0.3878, 0.0271, 0.0780, 0.2245, 0.1584]],

        [[0.2056, 0.0350, 0.0323, 0.0701, 0.1277],
         [0.3168, 0.0452, 0.0510, 0.1298, 0.1568],
         [0.3464, 0.0384, 0.0677, 0.1618, 0.1535],
         [0.3890, 0.0354, 0.0722, 0.1835, 0.1484],
         [0.4240, 0.0358, 0.0811, 0.2130, 0.1457],
         [0.4055, 0.0192, 0

In [169]:
output.shape  # NOTE(review): the StackedLSTM output above is (3, 7, 5); the recorded (2, 7, 8) looks stale — re-run to confirm

torch.Size([2, 7, 8])

In [182]:
import torch
import torch.nn as nn

# Built-in stacking: nn.LSTM builds all 4 bidirectional layers itself.
input_size, hidden_size, num_layers = 3, 4, 4

# Create a stacked LSTM model
model = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True,
)

# Random input batch: 2 sequences of 7 timesteps each.
batch_size, sequence_length = 2, 7
input_data = torch.randn(batch_size, sequence_length, input_size)

# output holds per-timestep features; hidden / cell hold the final states.
output, (hidden, cell) = model(input_data)

# Print the output shape
print(output.shape)

torch.Size([2, 7, 8])


In [183]:
output  # (batch=2, seq=7, 2 * hidden_size=8): both directions concatenated on the last dim

tensor([[[ 0.0810,  0.0551,  0.0185,  0.0093,  0.0895,  0.2262, -0.2059,
           0.1461],
         [ 0.1209,  0.0776,  0.0230, -0.0005,  0.0773,  0.2246, -0.2079,
           0.1392],
         [ 0.1406,  0.0873,  0.0222, -0.0168,  0.0677,  0.2208, -0.2060,
           0.1294],
         [ 0.1501,  0.0901,  0.0174, -0.0378,  0.0600,  0.2118, -0.2012,
           0.1197],
         [ 0.1519,  0.0899,  0.0121, -0.0579,  0.0521,  0.1936, -0.1910,
           0.1078],
         [ 0.1511,  0.0864,  0.0046, -0.0797,  0.0437,  0.1626, -0.1692,
           0.0932],
         [ 0.1509,  0.0809, -0.0058, -0.1077,  0.0306,  0.1054, -0.1215,
           0.0665]],

        [[ 0.0808,  0.0559,  0.0189,  0.0096,  0.0954,  0.2307, -0.2055,
           0.1486],
         [ 0.1231,  0.0798,  0.0229, -0.0032,  0.0858,  0.2319, -0.2070,
           0.1453],
         [ 0.1446,  0.0883,  0.0194, -0.0246,  0.0754,  0.2270, -0.2053,
           0.1362],
         [ 0.1554,  0.0892,  0.0114, -0.0519,  0.0671,  0.2175, -0.2

In [186]:
hidden.shape  # (num_layers * num_directions, batch_size, hidden_size) = (4 * 2, 2, 4)

torch.Size([8, 2, 4])

In [185]:
output.shape  # (batch_size, sequence_length, 2 * hidden_size) = (2, 7, 8)

torch.Size([2, 7, 8])