In [3]:
import torch

# Creates a 3D tensor with random numbers from normal distribution (mean=0, std=1)
x = torch.randn(1, 3, 6)

# Dimensions breakdown:
# 1   -> Number of sequences/batches (singleton dimension)
# 3  -> Batch size (number of samples)
# 6 -> Hidden dimension/feature size

# Shape components:
dimensions = {
    'dim 0 (sequences)': 1,    # Often used for sequence length or layers
    'dim 1 (batch)': 3,      # Number of samples processed together
    'dim 2 (features)': 6   # Size of hidden state/embedding
}

# Common uses:
# - Neural network initialization
# - Creating dummy data for testing
# - Initializing hidden states in RNN/LSTM

# The tensor x is a 3D tensor with shape (1, 3, 6)
x

tensor([[[-1.4043, -0.4011,  1.6967,  2.4041, -0.2251,  1.0248],
         [-0.1186,  0.1278, -0.7602,  0.8845, -1.2966,  0.7585],
         [ 0.0783, -0.4681,  0.0772,  0.3696,  0.8051, -0.7423]]])

In [4]:
# Squeeze - removes dimensions of size 1
# Because the number of batches is 1, we can remove the first dimension. It will result into a 2D tensor
squeezed = x.squeeze(0) 
squeezed

tensor([[-1.4043, -0.4011,  1.6967,  2.4041, -0.2251,  1.0248],
        [-0.1186,  0.1278, -0.7602,  0.8845, -1.2966,  0.7585],
        [ 0.0783, -0.4681,  0.0772,  0.3696,  0.8051, -0.7423]])

In [5]:
# Removes all dimensions of size 1. Not just the first dimension
squeezed = x.squeeze()       
squeezed

tensor([[-1.4043, -0.4011,  1.6967,  2.4041, -0.2251,  1.0248],
        [-0.1186,  0.1278, -0.7602,  0.8845, -1.2966,  0.7585],
        [ 0.0783, -0.4681,  0.0772,  0.3696,  0.8051, -0.7423]])

In [6]:

# Unsqueeze - adds dimension of size 1
unsqueezed = x.unsqueeze(1)  # Shape: [1, 1, 3, 6]

# Result will be a 4D tensor with shape (1, 1, 3, 6)
unsqueezed

tensor([[[[-1.4043, -0.4011,  1.6967,  2.4041, -0.2251,  1.0248],
          [-0.1186,  0.1278, -0.7602,  0.8845, -1.2966,  0.7585],
          [ 0.0783, -0.4681,  0.0772,  0.3696,  0.8051, -0.7423]]]])

In [9]:
# Example 2. Adding sequence dimension

# Shape: [batch, hidden]
hidden = torch.randn(2, 4)            

# Shape: [batch, 1, hidden]
sequence = hidden.unsqueeze(1)           
sequence

tensor([[[-0.1988,  0.1294, -0.3280, -0.1420]],

        [[-1.3942, -1.5514, -0.7374, -0.6113]]])

In [6]:
import torch
# Example: 
# Initial shape: [sequence_length=2, batch_size=3, vocab_size=5]
output = torch.randn(2, 3, 5)
output






tensor([[[-0.2523, -0.7688,  0.4002, -0.4287,  0.6104],
         [ 0.7961, -0.5849,  0.4927, -0.3052, -0.8675],
         [ 0.7898, -1.1308,  0.0450, -1.0427, -1.2169]],

        [[-0.5405,  1.3746,  0.4098, -1.1473, -0.0319],
         [-1.5157,  0.0699,  0.3763, -1.1690, -0.3738],
         [-0.1964,  0.9854,  0.8022, -0.2907, -1.3828]]])

In [7]:
output_dim = output.shape[-1]  # 5
output_dim

5

In [None]:
# 1. Slice first timestep (column) for all batches
sliced = output[1:]
sliced

tensor([[[-0.5405,  1.3746,  0.4098, -1.1473, -0.0319],
         [-1.5157,  0.0699,  0.3763, -1.1690, -0.3738],
         [-0.1964,  0.9854,  0.8022, -0.2907, -1.3828]]])

In [12]:
sliced.shape

torch.Size([1, 3, 5])

In [10]:
sliced.type()

'torch.FloatTensor'

In [None]:
# 2. Reshape from 3D to 2D
# -1 means compute the size to maintain total elements
# output_dim specifies the size of the last dimension
reshaped = sliced.view(-1, output_dim)
reshaped

tensor([[-0.5405,  1.3746,  0.4098, -1.1473, -0.0319],
        [-1.5157,  0.0699,  0.3763, -1.1690, -0.3738],
        [-0.1964,  0.9854,  0.8022, -0.2907, -1.3828]])

In [13]:
reshaped.shape

torch.Size([3, 5])

Here, we can experiment pytorch `squeeze` and `unsqueeze`

In [2]:
import torch
# Examples of handling torch tensors
# Example 1: Simple sequence
ids = [1, 2, 3, 4]  # sequence of token ids
tensor = torch.LongTensor(ids)
print("Original:", tensor.shape)  # Shape: [4]
print(tensor)

tensor = tensor.unsqueeze(-1)
print("\nAfter unsqueeze:", tensor.shape)  # Shape: [4, 1]
print(tensor)

# Example 2: Sentence tokens
sentence_ids = [5, 2, 8, 1, 9]  # Example token IDs for "Hello world"
tensor = torch.LongTensor(sentence_ids)
print("\nOriginal sentence:", tensor.shape)  # Shape: [5]
print(tensor)

# Add a new dimension at the end
tensor = tensor.unsqueeze(-1)
print("\nAfter unsqueeze:", tensor.shape)  # Shape: [5, 1]
print(tensor)

# Example 3: Alternative ways to unsqueeze
tensor = torch.LongTensor([1, 2, 3])
print("\nDifferent unsqueeze positions:")
print("Original:", tensor.shape)  # [3]
print("unsqueeze(-1):", tensor.unsqueeze(-1).shape)  # [3, 1]
print("unsqueeze(0):", tensor.unsqueeze(0).shape)    # [1, 3]
print("unsqueeze(1):", tensor.unsqueeze(1).shape)    # [3, 1]

Original: torch.Size([4])
tensor([1, 2, 3, 4])

After unsqueeze: torch.Size([4, 1])
tensor([[1],
        [2],
        [3],
        [4]])

Original sentence: torch.Size([5])
tensor([5, 2, 8, 1, 9])

After unsqueeze: torch.Size([5, 1])
tensor([[5],
        [2],
        [8],
        [1],
        [9]])

Different unsqueeze positions:
Original: torch.Size([3])
unsqueeze(-1): torch.Size([3, 1])
unsqueeze(0): torch.Size([1, 3])
unsqueeze(1): torch.Size([3, 1])


In [1]:
import torch

# Small example dimensions
batch_size = 2
hidden_dim = 4
src_length = 6

# Create sample hidden state with actual values
hidden = torch.tensor([
    [1, 2, 3, 4],    # batch item 1
    [5, 6, 7, 8]     # batch item 2
])  # Shape: [2, 4]

print("1. Original hidden state:")
print(hidden)
print(f"Shape: {hidden.shape}\n")

# Step 1: Add dimension at position 1
hidden_unsqueezed = hidden.unsqueeze(1)
print("2. After unsqueeze(1):")
print(hidden_unsqueezed)
print(f"Shape: {hidden_unsqueezed.shape}\n")

# Step 2: Repeat along sequence dimension,
# repeat 1 time on dimension 1 and dimension 3
# 6 times on dimension 2
hidden_repeated = hidden_unsqueezed.repeat(1, src_length, 1)
print("3. After repeat(1, 6, 1):")
print(hidden_repeated)
print(f"Shape: {hidden_repeated.shape}")

1. Original hidden state:
tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
Shape: torch.Size([2, 4])

2. After unsqueeze(1):
tensor([[[1, 2, 3, 4]],

        [[5, 6, 7, 8]]])
Shape: torch.Size([2, 1, 4])

3. After repeat(1, 6, 1):
tensor([[[1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4]],

        [[5, 6, 7, 8],
         [5, 6, 7, 8],
         [5, 6, 7, 8],
         [5, 6, 7, 8],
         [5, 6, 7, 8],
         [5, 6, 7, 8]]])
Shape: torch.Size([2, 6, 4])


In [10]:
import torch

# Create sample encoder outputs [seq_len, batch, hidden]
encoder_outputs = torch.tensor([
    # seq1
    [[1, 2, 3],  # batch1
     [4, 5, 6]], # batch2
    
    # seq2
    [[7, 8, 9],  # batch1
     [10,11,12]] # batch2
])  # Shape: [2, 2, 3]

print("Original shape:", encoder_outputs.shape)  # [2, 2, 3]
print("Original tensor:\n", encoder_outputs)

# Permute dimensions: [seq_len, batch, hidden] → [batch, seq_len, hidden]
# Dimension mapping: [0,1,2] → [1,0,2]
# 0 (seq_len=2) → 1  # Move to middle
# 1 (batch=2)  → 0  # Move to front
# 2 (hidden=3) → 2  # Stay at end
permuted = encoder_outputs.permute(1, 0, 2)
print("\nPermuted shape:", permuted.shape)  # [2, 2, 3]
print("Permuted tensor:\n", permuted)

Original shape: torch.Size([2, 2, 3])
Original tensor:
 tensor([[[ 1,  2,  3],
         [ 4,  5,  6]],

        [[ 7,  8,  9],
         [10, 11, 12]]])

Permuted shape: torch.Size([2, 2, 3])
Permuted tensor:
 tensor([[[ 1,  2,  3],
         [ 7,  8,  9]],

        [[ 4,  5,  6],
         [10, 11, 12]]])


Attention Linear Layer

In [9]:
import torch
import torch.nn as nn

# Example dimensions
batch_size = 2
seq_length = 3
hidden_dim = 4

# Sample tensors
hidden = torch.tensor([[[1, 2, 3, 4],
                       [5, 6, 7, 8]]])  # [1, 2, 4]

print(f"Hidden shape: {hidden.shape}")  # [1, 2, 4]

# Step 1: Remove first dimension of 1
hidden = hidden.squeeze(0)  # [2, 4]

# Step 2: Add new dimension at position 1
hidden = hidden.unsqueeze(1)  # [2, 1, 4]

# Step 3: Repeat middle dimension to match encoder_outputs seq_length
hidden = hidden.repeat(1, 3, 1)  # [2, 3, 4]
   
encoder_outputs = torch.tensor([
    [[0.1, 0.2, 0.3, 0.4],   # seq1
     [0.5, 0.6, 0.7, 0.8],   # seq2
     [0.9, 1.0, 1.1, 1.2]],  # seq3
    [[1.3, 1.4, 1.5, 1.6],   # batch2-seq1
     [1.7, 1.8, 1.9, 2.0],   # batch2-seq2
     [2.1, 2.2, 2.3, 2.4]]   # batch2-seq3
])  # [2, 3, 4]

# Shape transformations:
# 1. Concatenate: [batch_size, seq_len, hidden_dim*2]
concat = torch.cat((hidden, encoder_outputs), dim=2)  # [2, 3, 8]

# 2. Linear layer: hidden_dim*2 → 1
attn_fc = nn.Linear(hidden_dim*2, 1)

# 3. Apply tanh for stable gradients
energy = torch.tanh(attn_fc(concat))  # [2, 3, 1]

print(f"Energy shape: {energy.shape}")  # [2, 3, 1]

Hidden shape: torch.Size([1, 2, 4])
Energy shape: torch.Size([2, 3, 1])


`tqdm` is a progress bar library in Python to shows execution progress of interations
Key Features: 
- Progress bar visualization
- ETA (Estimated Time of Arrival)
- Processing speed (iterations/second)
- Memory usage
- Customizable display options


In [7]:
import tqdm

# Basic usage
for i in tqdm.tqdm(range(500000000)):
    # do something
    pass
# Output: 100%|██████████| 100/100 [00:00<00:00, 2000.00it/s]

# With custom description
for i in tqdm.tqdm(range(100), desc="Processing"):
    pass
# Output: Processing: 100%|██████████| 100/100 [00:00<00:00, 2000.00it/s]

100%|██████████| 500000000/500000000 [00:25<00:00, 19805178.39it/s]
Processing: 100%|██████████| 100/100 [00:00<00:00, 2424453.18it/s]


In [13]:
# With nested loops
for i in tqdm.tqdm(range(100), desc='Outer'):
    for j in tqdm.tqdm(range(10000000), desc='Inner', leave=False):
        pass

Outer:   0%|          | 0/100 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
Outer:   1%|          | 1/100 [00:00<00:57,  1.71it/s]
[A
[A
[A
[A
[A
[A
Outer:   2%|▏         | 2/100 [00:01<00:57,  1.70it/s]
[A
[A
[A
[A
[A
[A
Outer:   3%|▎         | 3/100 [00:01<00:56,  1.71it/s]
[A
[A
[A
[A
[A
[A
Outer:   4%|▍         | 4/100 [00:02<00:55,  1.73it/s]
[A
[A
[A
[A
[A
[A
Outer:   5%|▌         | 5/100 [00:02<00:54,  1.73it/s]
[A
[A
[A
[A
[A
[A
Outer:   6%|▌         | 6/100 [00:03<00:54,  1.72it/s]
[A
[A
[A
[A
[A
[A
Outer:   7%|▋         | 7/100 [00:04<00:54,  1.72it/s]
[A
[A
[A
[A
[A
[A
Outer:   8%|▊         | 8/100 [00:04<00:53,  1.73it/s]
[A
[A
[A
[A
[A
[A
Outer:   9%|▉         | 9/100 [00:05<00:52,  1.74it/s]
[A
[A
[A
[A
[A
[A
Outer:  10%|█         | 10/100 [00:05<00:52,  1.72it/s]
[A
[A
[A
[A
[A
[A
Outer:  11%|█         | 11/100 [00:06<00:51,  1.72it/s]
[A
[A
[A
[A
[A
[A
Outer:  12%|█▏        | 12/100 [00:06<00:50,  1.73it/s]
[