## Install

In [0]:
!pip3 install torch torchvision numpy

## Imports

In [0]:
from pprint import pprint

from matplotlib import pyplot as plt
import numpy as np

import torch as th
from torch import nn
import torchvision
from torchvision import transforms

## Config

In [213]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
print(f'Using {device}')


# Number of timesteps per example; also used as the bit width of the integers
# (values are drawn from [0, 2 ** sequence_length)).
sequence_length = 10
# Features per timestep fed to the LSTM (one scalar per position).
input_size = 1
# Width of the LSTM hidden state and number of stacked LSTM layers.
hidden_size = 128
num_layers = 3
# Class scores emitted per timestep: one for bit 0 and one for bit 1.
num_classes = 2

# Number of optimisation steps and examples per step.
num_steps = 5000
batch_size = 256

# Initial Adam learning rate; the StepLR schedule multiplies it by lr_gamma
# every lr_decay_step scheduler steps.
learning_rate = 0.005
lr_decay_step = 100
lr_gamma = 0.9

Using cuda


## Dataset: integers in decimal and binary

In [202]:
# Filler value written into every sequence position that carries no input.
EMPTY_INPUT = -1

def dec_padded(decimal_batch, width=sequence_length):
  """Lays a batch of integers out as fixed-width input sequences.

  Args:
    decimal_batch: array shaped [batch] holding one integer per example.
    width: length of each output sequence.

  Returns:
    np.float32 array shaped [batch, width]. Column 0 holds the integer and
    every remaining column holds EMPTY_INPUT.
  """
  num_rows = decimal_batch.shape[0]
  padded = np.full((num_rows, width), EMPTY_INPUT)
  padded[:, 0] = decimal_batch
  return padded.astype(np.float32)
  
def dec_to_bin_padded(decimal_batch, width=sequence_length):
  """Converts a 1-D batch of non-negative integers to fixed-width binary digits.

  The conversion is done with vectorised bit shifts instead of a per-row
  string round-trip, so it also works for an empty batch.

  Args:
    decimal_batch: integer array shaped [batch]; every value must lie in
      [0, 2 ** width). Values are processed as int64.
    width: number of binary digits produced per value.

  Returns:
    np.int64 array shaped [batch, width] holding the bits of each value, most
    significant bit first (i.e. left-padded with 0s).

  Raises:
    ValueError: if any value is negative or does not fit in `width` bits.
  """
  decimal_batch = np.asarray(decimal_batch, dtype=np.int64)
  if decimal_batch.size and (
      decimal_batch.min() < 0 or decimal_batch.max() >= 2 ** width):
    raise ValueError(
        f'All values must be in [0, {2 ** width}) to fit in {width} bits.')
  # Shift amounts that bring bit (width - 1), ..., bit 0 into the lowest
  # position; broadcasting against the batch yields one column per bit.
  shifts = np.arange(width - 1, -1, -1, dtype=np.int64)
  return (decimal_batch[:, np.newaxis] >> shifts) & 1

def create_batch(batch_size):
  """Samples a random batch of integers in decimal and binary form.

  Integers are drawn uniformly from [0, 2 ** sequence_length).

  Returns:
    A pair (decimal, binary) of tensors placed on `device`:
      * decimal is shaped [batch, seq_len]; the integer sits in the first
        position and the rest of the sequence dim is padded with -1s.
      * binary is shaped [batch, seq_len]; the binary repr of the integer,
        padded on the left with 0s along the sequence dim.
  """
  samples = np.random.randint(2 ** sequence_length, size=(batch_size,))

  decimal_tensor = th.from_numpy(dec_padded(samples))
  binary_tensor = th.from_numpy(dec_to_bin_padded(samples))
  return decimal_tensor.to(device), binary_tensor.to(device)

# Smoke test: draw a batch of two integers and show both encodings.
x, y = create_batch(2)
print(x.shape, y.shape)
print(x)
print(y)

torch.Size([2, 10]) torch.Size([2, 10])
tensor([[ 732.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,
           -1.],
        [ 325.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,
           -1.]], device='cuda:0')
tensor([[ 1,  0,  1,  1,  0,  1,  1,  1,  0,  0],
        [ 0,  1,  0,  1,  0,  0,  0,  1,  0,  1]], device='cuda:0')


In [0]:
class DecimalBinaryDataset(th.utils.data.Dataset):
  """Map-style dataset holding every integer in [0, 2 ** num_bits).

  Each item is a (decimal, binary) pair of rows produced by `dec_padded` and
  `dec_to_bin_padded`. The whole table is built once in the constructor and
  moved to `device`.
  """

  def __init__(self, num_bits):
    """Builds and caches both encodings for all num_bits-wide integers."""
    super(DecimalBinaryDataset, self).__init__()
    self._num_bits = num_bits
    self._max_val = 2 ** num_bits

    # Materialise every value up front so item lookup is plain indexing.
    all_values = np.arange(self._max_val)
    decimal_table = dec_padded(all_values, num_bits)
    binary_table = dec_to_bin_padded(all_values, num_bits)
    self._decimal = th.from_numpy(decimal_table).to(device)
    self._binary = th.from_numpy(binary_table).to(device)

  def __len__(self):
    """Returns the number of integers held by the dataset."""
    return self._max_val

  def __getitem__(self, index):
    """Returns the (decimal, binary) rows stored at `index`."""
    decimal_row = self._decimal[index, :]
    binary_row = self._binary[index, :]
    return decimal_row, binary_row

# Exhaustive dataset covering every sequence_length-bit integer.
dataset = DecimalBinaryDataset(sequence_length)

# Both loaders read the same dataset; only the iteration order differs
# (shuffled for train_loader, fixed for test_loader).
# NOTE(review): the training loop in this notebook draws its batches with
# create_batch() rather than train_loader — confirm whether train_loader is
# still needed.
train_loader = th.utils.data.DataLoader(
    dataset=dataset, batch_size=batch_size, shuffle=True)
test_loader = th.utils.data.DataLoader(
    dataset=dataset, batch_size=batch_size, shuffle=False)

## Model (RNN)

In [0]:
class RNN(nn.Module):
  """Stacked LSTM followed by a linear classifier applied at every timestep."""

  def __init__(self, input_size, hidden_size, num_layers, num_classes):
    """
    Args:
      input_size: number of features per timestep fed to the LSTM. `forward`
        adds a single trailing feature dim, so it expects this to be 1.
      hidden_size: width of the LSTM hidden state.
      num_layers: number of stacked LSTM layers.
      num_classes: number of class scores emitted per timestep.
    """
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """
    Args:
      x: shape [batch, seq_len]

    Returns:
      pred: class scores, shaped [batch, seq_len, num_classes]
    """
    x = x.unsqueeze(-1)  # Convert to [batch, seq_len, 1]

    # Zero initial hidden and cell states. They are derived from the input
    # tensor so they always share its device and dtype, instead of relying on
    # a notebook-level `device` global.
    state_shape = (self.num_layers, x.size(0), self.hidden_size)
    h0 = x.new_zeros(state_shape)
    c0 = x.new_zeros(state_shape)

    # Forward propagate LSTM.
    out, _ = self.lstm(x, (h0, c0))  # out shape (batch_size, seq_len, hid_size)

    # Decode the hidden state at each timestep.
    return self.fc(out)

# Build the network from the config values and move its parameters to `device`.
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

## Train

In [217]:
# Loss and optimizer.
# Cross-entropy over num_classes scores per bit position; Adam with a StepLR
# schedule that multiplies the learning rate by lr_gamma every lr_decay_step
# scheduler steps.
loss_fn = nn.CrossEntropyLoss()
optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)
lr_schedule = th.optim.lr_scheduler.StepLR(optimizer, lr_decay_step, lr_gamma)


for step in range(num_steps):
  # Each step trains on a freshly sampled random batch.
  inputs, labels = create_batch(batch_size)

  # Forward
  outputs = model(inputs)
  # Flatten [batch, seq_len, num_classes] -> [batch * seq_len, num_classes]
  # (and the labels to match) so every bit position is scored as its own
  # classification example.
  outputs = outputs.view(-1, num_classes)
  labels = labels.view(-1)
  loss = loss_fn(outputs, labels)

  # Backward
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  # The schedule is advanced once per optimisation step, after the update.
  lr_schedule.step()

  # Report the current learning rate and the last batch loss every 200 steps.
  if (step + 1) % 200 == 0:
    print(f'Step [{step+1}/{num_steps}], '
          f'LR: {optimizer.param_groups[0]["lr"]:.4}, '
          f'Loss: {loss.item():.4}')

Step [200/5000], LR: 0.0045, Loss: 0.5602
Step [400/5000], LR: 0.003645, Loss: 0.5327
Step [600/5000], LR: 0.002952, Loss: 0.49
Step [800/5000], LR: 0.002391, Loss: 0.461
Step [1000/5000], LR: 0.001937, Loss: 0.5068
Step [1200/5000], LR: 0.001569, Loss: 0.4617
Step [1400/5000], LR: 0.001271, Loss: 0.4236
Step [1600/5000], LR: 0.001029, Loss: 0.4509
Step [1800/5000], LR: 0.0008339, Loss: 0.4348
Step [2000/5000], LR: 0.0006754, Loss: 0.39
Step [2200/5000], LR: 0.0005471, Loss: 0.4025
Step [2400/5000], LR: 0.0004431, Loss: 0.3484
Step [2600/5000], LR: 0.0003589, Loss: 0.3503
Step [2800/5000], LR: 0.0002907, Loss: 0.3521
Step [3000/5000], LR: 0.0002355, Loss: 0.3317
Step [3200/5000], LR: 0.0001908, Loss: 0.3109
Step [3400/5000], LR: 0.0001545, Loss: 0.3135
Step [3600/5000], LR: 0.0001252, Loss: 0.2938
Step [3800/5000], LR: 0.0001014, Loss: 0.2999
Step [4000/5000], LR: 8.212e-05, Loss: 0.311
Step [4200/5000], LR: 6.651e-05, Loss: 0.2934
Step [4400/5000], LR: 5.388e-05, Loss: 0.2852
Step [46

In [174]:
# Inspect the raw class scores the model produces for the sample batch (x, y).
# NOTE(review): the recorded output of this cell is 8 columns wide although
# sequence_length is 10 in the config cell — it appears to come from an
# earlier run; re-run the notebook top to bottom to refresh it.
print(x)
print(y)
model(x)


tensor([[  62.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.],
        [ 122.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.,   -1.]], device='cuda:0')
tensor([[ 0,  0,  1,  1,  1,  1,  1,  0],
        [ 0,  1,  1,  1,  1,  0,  1,  0]], device='cuda:0')


tensor([[[ 10.2906,  -9.7800],
         [  0.0232,  -0.2042],
         [ -1.9191,   0.8894],
         [ -3.0977,   2.8689],
         [ -4.0217,   4.0933],
         [ -3.2193,   3.4450],
         [ -1.0313,   1.3374],
         [  1.1034,  -1.0046]],

        [[  0.7029,  -1.0438],
         [ -2.6161,   1.5917],
         [ -4.1202,   4.0117],
         [ -4.1097,   4.2963],
         [ -1.5814,   1.8960],
         [  2.0017,  -1.9088],
         [ -0.1470,   0.0256],
         [  0.1043,  -0.2213]]], device='cuda:0')

## Test

In [209]:
# Exact-match evaluation: an integer counts as correct only when every one of
# its predicted bits equals the corresponding label bit.
with th.no_grad():
  correct, total = 0, 0
  for (inputs, labels) in test_loader:
    # Forward
    outputs = model(inputs)
    # Highest-scoring class per timestep is the predicted bit: [batch, seq_len].
    _, predicted = th.max(outputs, -1)
    total += labels.size(0)
    eq_mask = predicted == labels
    # One boolean per example: True when all bits match.
    batch_correct = eq_mask.cpu().numpy().all(-1)
    # Print every misclassified integer. The loop variable must not be named
    # `correct`, otherwise it overwrites the running tally of correct examples.
    for i, is_correct in enumerate(batch_correct):
      if not is_correct:
        print(inputs[i][0].cpu(), labels[i].cpu(), predicted[i, :].cpu())
    correct += int(batch_correct.sum())
  accuracy = correct / total
  # Report the number of examples actually evaluated instead of a fixed count.
  print(f'Accuracy of model on {total} test integers: {100 * accuracy:0.2f}%')

tensor(1.) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor(2.) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  1,  0]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor(3.) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  1,  1]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  1,  0])
tensor(4.) tensor([ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  1,  1])
tensor(5.) tensor([ 0,  0,  0,  0,  0,  0,  0,  1,  0,  1]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1])
tensor(6.) tensor([ 0,  0,  0,  0,  0,  0,  0,  1,  1,  0]) tensor([ 0,  0,  0,  0,  0,  1,  0,  0,  0,  1])
tensor(7.) tensor([ 0,  0,  0,  0,  0,  0,  0,  1,  1,  1]) tensor([ 0,  0,  0,  0,  0,  1,  0,  0,  0,  1])
tensor(8.) tensor([ 0,  0,  0,  0,  0,  0,  1,  0,  0,  0]) tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1])
tensor(10.) tensor([ 0,  0,  0,  0,  0,  0,  1,  0,  1,  0]) tensor([ 0,  0,  0,  0,  0,  0,  1,  0,  1,  1])
tensor(12.) tensor

## Save model

In [0]:
# Persist only the learned parameters (the state_dict), not the module object.
# NOTE(review): the file name says "mnist", but this notebook trains a
# decimal-to-binary RNN — consider renaming the checkpoint.
th.save(model.state_dict(), '/tmp/mnist_rnn.ckpt')