# NLP - Exploring Padding and Packing Sequences

By [Akshaj Verma](https://akshajverma.com)

This notebook takes you through the basics of padding and packing sequences for RNNs in PyTorch.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [2]:
# Fix the global RNG seed so the randomly initialised embedding/GRU weights
# created later in the notebook (and their printed outputs) are reproducible.
SEED = 7
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fe1440a7ed0>

## Padding and Packing Sequences

In [3]:
# Toy "sentences": variable-length sequences of integer token ids.
# torch.tensor infers int64 from the Python ints, i.e. the same dtype the
# legacy torch.LongTensor constructor produced.
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5])
c = torch.tensor([7, 8, 9, 10])
d = torch.tensor([11, 12, 23, 14, 15, 16])
e = torch.tensor([17, 18, 19, 20])
f = torch.tensor([21, 22])

# Listed longest-first on purpose: pack_padded_sequence (used below with its
# default enforce_sorted=True) requires lengths in decreasing order.
X = [d, e, c, a, b, f]

X

[tensor([11, 12, 23, 14, 15, 16]),
 tensor([17, 18, 19, 20]),
 tensor([ 7,  8,  9, 10]),
 tensor([1, 2, 3]),
 tensor([4, 5]),
 tensor([21, 22])]

### Padding

In [4]:
# Right-pad every sequence in X with 0 up to the length of the longest one.
# With batch_first=True the result has one row per sequence.
X_padded = pad_sequence(
    X,
    batch_first=True,
    padding_value=0,
)
X_padded

tensor([[11, 12, 23, 14, 15, 16],
        [17, 18, 19, 20,  0,  0],
        [ 7,  8,  9, 10,  0,  0],
        [ 1,  2,  3,  0,  0,  0],
        [ 4,  5,  0,  0,  0,  0],
        [21, 22,  0,  0,  0,  0]])

### Packing

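Packing flattens the padded batch into a single `data` tensor laid out time step by time step (with the padding dropped), together with `batch_sizes`: the number of sequences that are still active at each time step.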

In [5]:
# Derive the true (unpadded) lengths from X instead of hard-coding them, so
# this cell stays correct if the sequences above are edited.
# The default enforce_sorted=True needs the lengths in decreasing order,
# which holds because X is listed longest-first.
X_packed = pack_padded_sequence(
    X_padded,
    lengths=[len(seq) for seq in X],
    batch_first=True,
)
X_packed

PackedSequence(data=tensor([11, 17,  7,  1,  4, 21, 12, 18,  8,  2,  5, 22, 23, 19,  9,  3, 14, 20,
        10, 15, 16]), batch_sizes=tensor([6, 6, 4, 3, 1, 1]), sorted_indices=None, unsorted_indices=None)

### Padding packed sequence

In [6]:
# Inverse of packing: rebuild the zero-padded batch. The call returns a
# (padded_tensor, lengths) tuple, which is what gets displayed below.
pad_packed_sequence(
    X_packed,
    batch_first=True,
)

(tensor([[11, 12, 23, 14, 15, 16],
         [17, 18, 19, 20,  0,  0],
         [ 7,  8,  9, 10,  0,  0],
         [ 1,  2,  3,  0,  0,  0],
         [ 4,  5,  0,  0,  0,  0],
         [21, 22,  0,  0,  0,  0]]), tensor([6, 4, 4, 3, 2, 2]))

## Minimal Example #1 [Dataloader]

Input data.

In [7]:
# Re-display the toy sequences defined at the top of the notebook.
X

[tensor([11, 12, 23, 14, 15, 16]),
 tensor([17, 18, 19, 20]),
 tensor([ 7,  8,  9, 10]),
 tensor([1, 2, 3]),
 tensor([4, 5]),
 tensor([21, 22])]

In [8]:
# Number of sequences the DataLoader groups into each mini-batch.
BATCH_SIZE = 2

In [9]:
class InputData(Dataset):
    """Minimal map-style dataset wrapping an indexable collection of sequences.

    Items are handed back exactly as stored (no padding or conversion), so
    individual samples may have different lengths.
    """

    def __init__(self, X_data):
        # Keep a reference to the caller's collection; nothing is copied.
        self.X_data = X_data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Return the sample stored at position ``index``."""
        return self.X_data[index]

In [10]:
train_data = InputData(X)

# The sequences differ in length, so they cannot be stacked into a single
# tensor by the default collate. An identity collate_fn hands each batch back
# as a plain list of tensors, which is padded afterwards.
train_loader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda samples: samples,
)

In [11]:
# For every mini-batch, show the full round trip:
# list of tensors -> padded tensor -> PackedSequence -> padded tensor again.
for i, batch in enumerate(train_loader):
    # True (unpadded) length of each sequence; needed to pack the batch.
    len_list = [len(seq) for seq in batch]

    padded_seq = pad_sequence(batch, batch_first=True)
    packed_seq = pack_padded_sequence(padded_seq, len_list, batch_first=True)
    # pad_packed_sequence returns a (padded_tensor, lengths) tuple.
    padded_seq_1 = pad_packed_sequence(packed_seq, batch_first=True)

    print(f"Batch Number: {i}")
    print("-" * 50, "\n")
    print("Original Batch:", batch, sep="\n")
    print("\nPadded Batch:", padded_seq, sep="\n")
    print("\nPack padded seq:", packed_seq, sep="\n")
    print("\nPad packed seq:", padded_seq_1, sep="\n")
    print("=" * 100)

Batch Number: 0
-------------------------------------------------- 

Original Batch:
[tensor([11, 12, 23, 14, 15, 16]), tensor([17, 18, 19, 20])]

Padded Batch:
tensor([[11, 12, 23, 14, 15, 16],
        [17, 18, 19, 20,  0,  0]])

Pack padded seq:
PackedSequence(data=tensor([11, 17, 12, 18, 23, 19, 14, 20, 15, 16]), batch_sizes=tensor([2, 2, 2, 2, 1, 1]), sorted_indices=None, unsorted_indices=None)

Pad packed seq:
(tensor([[11, 12, 23, 14, 15, 16],
        [17, 18, 19, 20,  0,  0]]), tensor([6, 4]))
Batch Number: 1
-------------------------------------------------- 

Original Batch:
[tensor([ 7,  8,  9, 10]), tensor([1, 2, 3])]

Padded Batch:
tensor([[ 7,  8,  9, 10],
        [ 1,  2,  3,  0]])

Pack padded seq:
PackedSequence(data=tensor([ 7,  1,  8,  2,  9,  3, 10]), batch_sizes=tensor([2, 2, 2, 1]), sorted_indices=None, unsorted_indices=None)

Pad packed seq:
(tensor([[ 7,  8,  9, 10],
        [ 1,  2,  3,  0]]), tensor([4, 3]))
Batch Number: 2
-------------------------------------

## Minimal Example #2 [Dataloader + RNN]

Input data.

In [12]:
# Re-display the toy sequences defined at the top of the notebook.
X

[tensor([11, 12, 23, 14, 15, 16]),
 tensor([17, 18, 19, 20]),
 tensor([ 7,  8,  9, 10]),
 tensor([1, 2, 3]),
 tensor([4, 5]),
 tensor([21, 22])]

In [13]:
# NOTE: same definition as the InputData class in Minimal Example #1; it is
# repeated here so this example can be read on its own.
class InputData(Dataset):
    """Thin Dataset wrapper around an indexable collection of sequences.

    Samples are returned untouched, so they may vary in length.
    """

    def __init__(self, X_data):
        # Store the collection by reference; no copy is made.
        self.X_data = X_data

    def __len__(self):
        """Number of samples in the wrapped collection."""
        return len(self.X_data)

    def __getitem__(self, index):
        """Sample at position ``index`` of the wrapped collection."""
        return self.X_data[index]

In [14]:
train_data = InputData(X)

# Identity collate_fn: keep each batch as a list of variable-length tensors;
# padding happens inside the model's forward().
# NOTE: BATCH_SIZE used here is the value set in Minimal Example #1; the
# constants cell that follows assigns the same value again.
train_loader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=lambda samples: samples,
)

In [15]:
# Hyper-parameters for the toy model in this example.
# Sequences per mini-batch (same value as in Minimal Example #1).
BATCH_SIZE = 2
# Dimensionality of each token embedding (also the GRU input size).
EMBEDDING_SIZE = 5
VOCAB_SIZE = 24 # token ids 1 to 23, plus index 0 which pad_sequence uses as the padding id
# Dimensionality of the GRU hidden state.
HIDDEN_SIZE = 2

In [16]:
class ModelRNN(nn.Module):
    """Embedding + GRU demo that pads and packs a batch of variable-length sequences.

    Every intermediate value is printed so the effect of each padding/packing
    step can be inspected.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size):
        """Build the embedding table and the GRU.

        Args:
            embedding_size: Dimension of each token embedding (GRU input size).
            vocab_size: Number of rows in the embedding table; must exceed the
                largest token id. Index 0 is what ``pad_sequence`` pads with.
            hidden_size: Dimension of the GRU hidden state.
        """
        super().__init__()

        self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.rnn = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, batch_first=True)

    def forward(self, batch):
        """Embed, pack, and run a batch of token-id sequences through the GRU.

        Args:
            batch: List of 1-D integer tensors of token ids. Lengths may
                differ and the sequences may appear in any order.

        Returns:
            A ``(padded_rnn_out, rnn_hidden)`` pair. ``padded_rnn_out`` is the
            ``(output, lengths)`` tuple produced by ``pad_packed_sequence``;
            ``rnn_hidden`` is the GRU's final hidden state, in the same
            sequence order as ``batch``.
        """
        len_list = [len(seq) for seq in batch]
        print("Original Batch:")
        print(batch)
        print("\nList of tensor lengths per batch", len_list)

        # Zero-pad to the longest sequence so the batch is one rectangular tensor.
        padded_batch = pad_sequence(batch, batch_first=True)
        print("\nPadded Batch:")
        print(padded_batch)

        embeds = self.word_embeddings(padded_batch)
        print("\nEmbeddings: ")
        print(embeds)

        # enforce_sorted=False lets PyTorch sort by length internally (and
        # restore the original order afterwards). With the default
        # enforce_sorted=True, a batch that is not already in decreasing
        # length order raises a RuntimeError.
        pack_embeds = pack_padded_sequence(
            embeds, lengths=len_list, batch_first=True, enforce_sorted=False
        )
        print("\nPacked Embeddings:")
        print(pack_embeds)

        print("\nRNN Output")
        # Packed input means the GRU never steps over padding positions.
        rnn_out, rnn_hidden = self.rnn(pack_embeds)
        print("\n RNN Out: ")
        print(rnn_out)
        print("\nRNN Hidden:")
        print(rnn_hidden)

        print("\nPadded RNN Output:")
        # NOTE: this is a (padded_output, lengths) tuple, not a bare tensor.
        padded_rnn_out = pad_packed_sequence(rnn_out, batch_first=True)
        print(padded_rnn_out)

        return padded_rnn_out, rnn_hidden

In [17]:
# Instantiate the demo model from the constants defined above; the bare name
# on the last line displays the module summary.
model_rnn = ModelRNN(
    embedding_size=EMBEDDING_SIZE,
    vocab_size=VOCAB_SIZE,
    hidden_size=HIDDEN_SIZE,
)
model_rnn

ModelRNN(
  (word_embeddings): Embedding(24, 5)
  (rnn): GRU(5, 2, batch_first=True)
)

In [18]:
# Feed each mini-batch through the model. forward() computes the sequence
# lengths itself and prints every intermediate step, so nothing else needs
# to be computed here.
for i, batch in enumerate(train_loader):
    print(f"Batch Number: {i}")
    print("-" * 50, "\n")
    model_rnn(batch)
    print("=" * 100)


Batch Number: 0
-------------------------------------------------- 

Original Batch:
[tensor([11, 12, 23, 14, 15, 16]), tensor([17, 18, 19, 20])]

List of tensor lengths per batch [6, 4]

Padded Batch:
tensor([[11, 12, 23, 14, 15, 16],
        [17, 18, 19, 20,  0,  0]])

Embeddings: 
tensor([[[-1.1669, -0.4375, -2.1085,  1.1450, -0.3822],
         [-0.3553,  0.7542,  0.6901, -0.1443, -0.5146],
         [-0.9336, -0.1527, -0.5300,  1.4535, -1.5414],
         [ 1.5496,  0.5989,  0.4675, -0.1439, -0.8120],
         [-0.3866, -1.0370,  0.5920, -0.7557,  0.3917],
         [ 0.5722,  0.3078, -0.1259, -0.9578,  1.7518]],

        [[ 0.9796,  0.4105,  1.7675,  0.7569,  0.9862],
         [-0.8253,  0.1633,  0.5013,  1.4206,  1.1542],
         [-1.5366,  1.0571, -1.1047,  0.1274, -0.0189],
         [-0.4073,  0.5317, -0.7420, -0.6375,  0.6794],
         [ 0.8989, -1.3884, -0.1670,  0.2851, -0.6411],
         [ 0.8989, -1.3884, -0.1670,  0.2851, -0.6411]]],
       grad_fn=<EmbeddingBackward>)

Pa