# library

In [None]:
import torch
import numpy as np

from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

# data

In [None]:
# Toy corpus: five strings of different lengths, used as character-level sequences.
data = [
    'hello world',
    'midnight',
    'calculation',
    'path',
    'short circuit',
]

In [None]:
# Build the character vocabulary. '<pad>' is placed first so that its index is 0,
# which is also the default fill value used by pad_sequence.
# The unique characters are sorted: iterating a set of strings yields a different
# order on each interpreter start (string hash randomization), which would make
# every index derived from this vocabulary change between kernel restarts.
char_set = ['<pad>'] + sorted(set(char for seq in data for char in seq))

# Map each character to its position in char_set.
char2idx = {char: idx for idx, char in enumerate(char_set)}

print('char_set:', char_set)
print('char_set length:', len(char_set))

char_set: ['<pad>', ' ', 'c', 'o', 'm', 'i', 'a', 'n', 'p', 'u', 'r', 'd', 't', 's', 'e', 'h', 'l', 'w', 'g']
char_set length: 19


# padding

In [None]:
# Encode each string as a 1-D LongTensor holding one vocabulary index per character.
x = [
    torch.LongTensor([char2idx[symbol] for symbol in text])
    for text in data
]

# Print the encoded sequences one per line; they differ in length.
for encoded in x:
    print(encoded)

tensor([15, 14, 16, 16,  3,  1, 17,  3, 10, 16, 11])
tensor([ 4,  5, 11,  7,  5, 18, 15, 12])
tensor([ 2,  6, 16,  2,  9, 16,  6, 12,  5,  3,  7])
tensor([ 8,  6, 12, 15])
tensor([13, 15,  3, 10, 12,  1,  2,  5, 10,  2,  9,  5, 12])


In [None]:
# Number of characters in each encoded sequence, in the same order as x.
lengths = list(map(len, x))

print('lengths:', lengths)

lengths: [11, 8, 11, 4, 13]


In [None]:
# Right-pad every sequence with 0 (the '<pad>' index) up to the longest length and
# stack them into a single tensor whose first dimension is the batch.
padded_sequence = pad_sequence(x, batch_first=True, padding_value=0)

print(padded_sequence, padded_sequence.shape, sep='\n')

tensor([[15, 14, 16, 16,  3,  1, 17,  3, 10, 16, 11,  0,  0],
        [ 4,  5, 11,  7,  5, 18, 15, 12,  0,  0,  0,  0,  0],
        [ 2,  6, 16,  2,  9, 16,  6, 12,  5,  3,  7,  0,  0],
        [ 8,  6, 12, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [13, 15,  3, 10, 12,  1,  2,  5, 10,  2,  9,  5, 12]])
torch.Size([5, 13])


# packing

In [None]:
# Positions of the sequences ordered from longest to shortest. The sort is stable,
# so sequences of equal length keep their original relative order.
sorted_idx = sorted(
    range(len(lengths)),
    key=lambda position: lengths[position],
    reverse=True,
)

# The encoded sequences rearranged into that longest-first order.
sorted_x = [x[position] for position in sorted_idx]

for ordered in sorted_x:
    print(ordered)

tensor([13, 15,  3, 10, 12,  1,  2,  5, 10,  2,  9,  5, 12])
tensor([15, 14, 16, 16,  3,  1, 17,  3, 10, 16, 11])
tensor([ 2,  6, 16,  2,  9, 16,  6, 12,  5,  3,  7])
tensor([ 4,  5, 11,  7,  5, 18, 15, 12])
tensor([ 8,  6, 12, 15])


In [None]:
# Pack the longest-first sequences without padding. In the resulting PackedSequence,
# `data` stores the elements time step by time step, and `batch_sizes[t]` is the
# number of sequences that still have an element at step t.
# enforce_sorted=True is the default and requires the decreasing-length order.
packed_sequence = pack_sequence(sorted_x, enforce_sorted=True)

print(packed_sequence)

PackedSequence(data=tensor([13, 15,  2,  4,  8, 15, 14,  6,  5,  6,  3, 16, 16, 11, 12, 10, 16,  2,
         7, 15, 12,  3,  9,  5,  1,  1, 16, 18,  2, 17,  6, 15,  5,  3, 12, 12,
        10, 10,  5,  2, 16,  3,  9, 11,  7,  5, 12]), batch_sizes=tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1]), sorted_indices=None, unsorted_indices=None)


# RNN output

In [None]:
# One-hot "embedding" table: row i of the identity matrix is the one-hot vector
# for vocabulary index i.
eye = torch.eye(len(char_set))

# Look up one row per index in the padded batch. Padded positions hold index 0,
# so they become the one-hot vector of '<pad>' rather than an all-zero row.
embedded_tensor = eye[padded_sequence]
print(embedded_tensor.size())

torch.Size([5, 13, 19])


In [None]:
# One-hot encode each sequence on its own (longest first) and pack the results,
# so `data` holds exactly one row per real character and no padding rows.
embedded_packed_seq = pack_sequence(
    [eye[x[position]] for position in sorted_idx],
    enforce_sorted=True,
)
print(embedded_packed_seq.data.size())

torch.Size([47, 19])


In [None]:
# Single-layer RNN whose input width equals the vocabulary size (one-hot inputs)
# and whose hidden state has 30 units; batch_first=True makes padded inputs and
# outputs use the batch as their first dimension.
rnn = torch.nn.RNN(
    input_size=len(char_set),
    hidden_size=30,
    batch_first=True,
)

In [None]:
# Run the RNN on the padded one-hot batch. Every row is processed for the full
# padded length, so the padded positions are fed through the network as well.
rnn_output, hidden = rnn(embedded_tensor)

# rnn_output has one hidden vector per position; hidden has one per sequence.
print(rnn_output.size(), hidden.size(), sep='\n')

torch.Size([5, 13, 30])
torch.Size([1, 5, 30])


In [None]:
# Run the same RNN on the packed input. This rebinds rnn_output and hidden:
# rnn_output now carries its values in `.data`, one row per real (unpadded)
# character, while hidden is still a plain tensor.
rnn_output, hidden = rnn(embedded_packed_seq)
print(rnn_output.data.size())
print(hidden.size())

torch.Size([47, 30])
torch.Size([1, 5, 30])


# unpacking

In [None]:
# Convert the packed one-hot input back into a padded batch-first tensor.
# Positions beyond each sequence's length are filled with 0.0 (the default), and
# the second return value gives the true length of every sequence in the batch.
unpacked_sequence, seq_lengths = pad_packed_sequence(
    embedded_packed_seq,
    batch_first=True,
    padding_value=0.0,
)
print(unpacked_sequence.size())
print(seq_lengths)

torch.Size([5, 13, 19])
tensor([13, 11, 11,  8,  4])


In [None]:
# Pad the longest-first index sequences with 0 and one-hot encode the padded batch
# in a single lookup, giving a batch-first tensor of one-hot vectors.
embedded_padded_sequence = eye[
    pad_sequence(sorted_x, batch_first=True, padding_value=0)
]
print(embedded_padded_sequence.size())

torch.Size([5, 13, 19])


In [None]:
# Sequence lengths in decreasing order, matching the row order of
# embedded_padded_sequence (which was built from the longest-first sequences).
sorted_lengths = list(reversed(sorted(lengths)))

# Pack the already-padded batch: only the first sorted_lengths[i] positions of
# row i are kept. enforce_sorted=True is the default and requires this order.
new_packed_sequence = pack_padded_sequence(
    embedded_padded_sequence,
    sorted_lengths,
    batch_first=True,
    enforce_sorted=True,
)

print(new_packed_sequence.data.size())
print(new_packed_sequence.batch_sizes)

torch.Size([47, 19])
tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1])
