In [2]:
import torch
from torch import nn, optim
import numpy as np
import RVQE

In [3]:
# Cap PyTorch's intra-op CPU parallelism at two threads for every cell below.
torch.set_num_threads(2)

Our goal is to create an RNN or LSTM with roughly 837 parameters, or a larger one with roughly 100000 parameters, and to compare it on the DNA long-sequence task implemented in RVQE.
In both tests the batch size is 128.

In [4]:
def dataset_t(length):
    """Build the RVQE "dna" dataset for sentences of the given length (batch size 128)."""
    dna_dataset_cls = RVQE.datasets.all_datasets["dna"]
    return dna_dataset_cls(0, num_shards=0, batch_size=128, sentence_length=length)

In [5]:
def count_parameters(model):
    """Return the total number of scalar entries in all trainable parameters of `model`.

    Parameters with `requires_grad == False` (frozen weights) are not counted.
    """
    total = 0
    for parameter in model.parameters():
        if parameter.requires_grad:
            total += parameter.numel()
    return total
def to_one_hot(labels, num_classes=2**3):
    """One-hot encode integer class labels.

    The result has the shape of `labels` with one extra trailing dimension of
    size `num_classes`; it is a float tensor taken from rows of an identity matrix.
    """
    identity = torch.eye(num_classes)
    # Row k of the identity matrix is exactly the one-hot vector of class k.
    encoded = identity[labels]
    return encoded

In [6]:
# Random seeds, one per independent training run.
SEEDS = [
    9120, 2783, 2057, 6549, 3201, 7063, 5243, 3102, 5303, 5819,
    3693, 4884, 2231, 5514, 8850, 6861, 3106, 2378, 8697, 1821,
    9480, 8483, 1633, 9678, 6596, 4509, 8618, 9765, 6346, 2969,
]
# Sentence lengths the models are evaluated on.
LENGTHS = [5, 10, 20, 50, 100, 200, 500, 1000]

# LSTM

In [13]:
# Hyper-parameters of the small network. The "_837" suffix refers to the parameter
# target stated in the introduction; a SimpleLSTM built from these arguments actually
# has 888 trainable parameters (LSTM: 4 * (8*10 + 10*10 + 10 + 10) = 800, linear: 10*8 + 8 = 88).
HIDDEN_SIZE_837 = 10
NUM_LAYERS_837 = 1
# Positional arguments (hidden_size, num_layers) for SimpleLSTM.
ARGS_837 = (HIDDEN_SIZE_837, NUM_LAYERS_837)

class SimpleLSTM(nn.Module):
    """
        Minimal LSTM baseline: a batch-first nn.LSTM followed by a linear layer
        that maps every hidden state back to the input/output width.
        We found a single layer performs much better than two layers with a
        smaller hidden size.
    """

    def __init__(self, hidden_size: int, num_layers: int, io_size=2**3):
        super().__init__()

        self.rnn = nn.LSTM(
            input_size=io_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lin = nn.Linear(hidden_size, io_size)

    def reset(self):
        """
            Re-initialise all weights: default reset for the linear layer,
            orthogonal hidden-to-hidden blocks, Xavier-uniform input-to-hidden
            weights and zero biases for the LSTM.
        """
        self.lin.reset_parameters()

        with torch.no_grad():
            for name, param in self.rnn.named_parameters():
                if "weight_hh" in name:
                    # the gate matrices are stacked along dim 0;
                    # give each of the four square blocks its own orthogonal start
                    block_rows = param.shape[1]
                    for start in range(0, 4 * block_rows, block_rows):
                        nn.init.orthogonal_(param[start : start + block_rows, :])
                    continue

                if "bias" in name:
                    param.fill_(0)
                    continue

                if "weight_ih" in name:
                    nn.init.xavier_uniform_(param)
                    continue

                raise Exception(f"cannot initialize {name}")

    @property
    def num_parameters(self):
        """Number of trainable parameters of the LSTM and the linear layer combined."""
        return sum(
            p.numel()
            for module in (self.rnn, self.lin)
            for p in module.parameters()
            if p.requires_grad
        )

    def forward(self, sentence):
        """Run the LSTM over `sentence` and apply the linear layer to every time step."""
        hidden_states, _final_state = self.rnn(sentence)
        return self.lin(hidden_states)

In [15]:
# Sanity check: trainable-parameter count of the small configuration.
SimpleLSTM(*ARGS_837).num_parameters

888

In [41]:
def _train_until_converged(model, optimizer, criterion, dataset, length,
                           max_steps, loss_threshold, log_every):
    """Train `model` on TRAIN batches of `dataset` and report how the run ended.

    Returns the step at which the loss first dropped below `loss_threshold`,
    `np.nan` if the loss became NaN, or -1 if neither happened within the
    step budget (`range(1, max_steps)`).
    """
    step = 0
    for step in range(1, max_steps):
        sentence, target = dataset.next_batch(0, RVQE.data.TrainingStage.TRAIN)

        # transform sentence to one-hot as in the qrnn case
        sentence = to_one_hot(RVQE.data.targets_for_loss(sentence))

        optimizer.zero_grad()
        out = model(sentence.float())

        # unlike the qrnn case, we use the entire output as loss
        # this gives the rnn an advantage!
        # CrossEntropyLoss wants the class dimension second, hence the transpose
        # (assumes out is (batch, sequence, classes), as produced by the batch-first LSTM)
        out = out.transpose(1, 2)
        target = RVQE.data.targets_for_loss(target)
        loss = criterion(out, target)

        loss.backward()
        optimizer.step()

        if torch.isnan(loss):
            print("nan")
            return np.nan

        if loss < loss_threshold:
            print(f"length {length} converged after {step} steps.")
            return step

        if log_every and step % log_every == 0:
            print(f"{step:06d} {loss:.2e}")

    print(f"length {length} did not converge after {step} steps.")
    return -1


def run_model(lrs: list, lengths: list, seeds: list, results: dict, model_args: tuple,
              max_steps: int = 100*1000, loss_threshold: float = 0.001, log_every: int = 500):
    """Train one SimpleLSTM per (learning rate, sentence length, seed) and record convergence.

    `results` is updated in place and also returned. Its layout is
    `results[lr][length] = [[seed, outcome], ...]`, where `outcome` is the number of
    steps until the loss fell below `loss_threshold`, `np.nan` if the loss became NaN,
    or -1 if the run did not converge. Seeds already recorded for a given
    (lr, length) are skipped, so an interrupted sweep can be resumed by passing the
    same `results` dict again.

    Args:
        lrs: learning rates to sweep over.
        lengths: sentence lengths to train on.
        seeds: seeds passed to `torch.manual_seed` before each model is created.
        results: dict collecting the outcomes (may already contain earlier runs).
        model_args: positional arguments for `SimpleLSTM`.
        max_steps: exclusive upper bound of the step counter, i.e. at most
            `max_steps - 1` updates per run. The default cap amounts to the same
            number of samples seen as for qrnn.
        loss_threshold: a run counts as converged once the loss is below this value.
        log_every: print the loss every this many steps; 0 disables progress logging.

    Returns:
        The `results` dict that was passed in.
    """
    for lr in lrs:
        results_for_lr = results.setdefault(lr, {})

        for length in lengths:
            dataset = dataset_t(length)
            print(f"created LSTM with {SimpleLSTM(*model_args).num_parameters} parameters")

            criterion = nn.CrossEntropyLoss()

            runs = results_for_lr.setdefault(length, [])
            # seeds that already have an entry are not trained again
            finished_seeds = {s for s, _ in runs}

            for seed in seeds:
                if seed in finished_seeds:
                    continue

                torch.manual_seed(seed)
                model = SimpleLSTM(*model_args)
                model.reset()
                optimizer = optim.Adam(model.parameters(), lr=lr)   # this has been found to converge fastest

                outcome = _train_until_converged(
                    model, optimizer, criterion, dataset, length,
                    max_steps, loss_threshold, log_every,
                )
                runs.append([seed, outcome])
                finished_seeds.add(seed)

    return results

## Small Net

In [23]:
# Keep results that already live in the kernel so finished (lr, length, seed) runs are
# skipped on re-execution, but start from an empty dict on a fresh kernel instead of
# failing with a NameError.
if "lr_results_small" not in globals():
    lr_results_small = {}
run_model([.3, .1, .03, .01], LENGTHS[:4], SEEDS[:5], lr_results_small, ARGS_837)

created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters


{0.3: {5: [[9120, 24], [2783, 25], [2057, 28], [6549, 31], [3201, 33]],
  10: [[9120, 47], [2783, 48], [2057, 71], [6549, 108], [3201, 288]],
  20: [[9120, 97], [2783, 384], [2057, 306], [6549, 311], [3201, 196]],
  50: [[9120, 1011], [2783, 2343], [2057, 1338], [6549, 1052], [3201, 2243]]},
 0.1: {5: [[9120, 36], [2783, 46], [2057, 42], [6549, 36], [3201, 35]],
  10: [[9120, 87], [2783, 120], [2057, 75], [6549, 105], [3201, 211]],
  20: [[9120, 353], [2783, 161], [2057, 136], [6549, 134], [3201, 239]],
  50: [[9120, 463], [2783, 500], [2057, 241], [6549, 360], [3201, 555]],
  100: [[9120, 475], [2783, 328], [2057, 741], [6549, 983], [3201, 517]],
  200: [[9120, 1823], [2783, 462], [2057, 276], [6549, 366], [3201, 1353]],
  500: [[9120, 4323], [2783, 606], [2057, 507], [6549, 298], [3201, 1539]],
  1000: []},
 0.03: {5: [[9120, 289], [2783, 328], [2057, 349], [6549, 312], [3201, 318]],
  10: [[9120, 378], [2783, 489], [2057, 422], [6549, 465], [3201, 485]],
  20: [[9120, 492], [2783, 6

In [32]:
# Display the learning-rate sweep results: {lr: {sentence_length: [[seed, steps], ...]}}.
lr_results_small

{0.3: {5: [[9120, 24], [2783, 25], [2057, 28], [6549, 31], [3201, 33]],
  10: [[9120, 47], [2783, 48], [2057, 71], [6549, 108], [3201, 288]],
  20: [[9120, 97], [2783, 384], [2057, 306], [6549, 311], [3201, 196]],
  50: [[9120, 1011], [2783, 2343], [2057, 1338], [6549, 1052], [3201, 2243]]},
 0.1: {5: [[9120, 36], [2783, 46], [2057, 42], [6549, 36], [3201, 35]],
  10: [[9120, 87], [2783, 120], [2057, 75], [6549, 105], [3201, 211]],
  20: [[9120, 353], [2783, 161], [2057, 136], [6549, 134], [3201, 239]],
  50: [[9120, 463], [2783, 500], [2057, 241], [6549, 360], [3201, 555]]},
 0.03: {5: [[9120, 289], [2783, 328], [2057, 349], [6549, 312], [3201, 318]],
  10: [[9120, 378], [2783, 489], [2057, 422], [6549, 465], [3201, 485]],
  20: [[9120, 492], [2783, 677], [2057, 577], [6549, 591], [3201, 883]],
  50: [[9120, 837], [2783, 855], [2057, 650], [6549, 861], [3201, 1590]]},
 0.01: {5: [[9120, 849], [2783, 893], [2057, 877], [6549, 749], [3201, 880]],
  10: [[9120, 902], [2783, 1373], [2057,

In [33]:
def mean_convergence_steps(results_by_lr):
    """Average number of steps to convergence for every learning rate.

    For each learning rate the steps are first averaged per sentence length and
    the per-length means are then averaged. Runs recorded as -1 (did not converge)
    or NaN (loss diverged) are ignored, and sentence lengths without any converged
    run are left out, so they cannot turn the whole average into NaN. A learning
    rate without a single converged run maps to NaN.
    """
    summary = {}
    for lr, runs_by_length in results_by_lr.items():
        length_means = []
        for runs in runs_by_length.values():
            converged = [step for _seed, step in runs if step != -1 and not np.isnan(step)]
            if converged:
                length_means.append(np.mean(converged))
        summary[lr] = np.mean(length_means) if length_means else np.nan
    return summary


mean_convergence_steps(lr_results_small)

{0.3: 499.20000000000005, 0.1: 196.75, 0.03: 592.4, 0.01: 1414.25}

The best learning rate for the LSTM seems to be 0.1, which has the lowest mean number of steps to convergence above, so we proceed with this setting.

In [None]:
# Seed the full run with the lr = 0.1 results of the learning-rate sweep so those seeds
# are not trained again. The inner lists are copied: run_model appends to them in place,
# and a shallow dict copy would let the new runs leak back into lr_results_small.
# Results already present in the kernel are kept so an interrupted run can be resumed.
if "results_small" not in globals():
    results_small = {
        .1: {length: [list(run) for run in runs] for length, runs in lr_results_small[.1].items()}
    }
run_model([.1], LENGTHS, SEEDS, results_small, ARGS_837)

created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
created LSTM with 888 parameters
length 500 converged after 363 steps.
000500 1.27e+00
001000 1.03e+00
001500 1.00e+00


In [43]:
# Display the results of the full small-network run: {lr: {sentence_length: [[seed, steps], ...]}}.
results_small

{0.1: {5: [[9120, 36],
   [2783, 46],
   [2057, 42],
   [6549, 36],
   [3201, 35],
   [7063, 62],
   [5243, 36],
   [3102, 37],
   [5303, 36],
   [5819, 36],
   [3693, 57],
   [4884, 44],
   [2231, 61],
   [5514, 35],
   [8850, 40],
   [6861, 46],
   [3106, 36],
   [2378, 36],
   [8697, 49],
   [1821, 38],
   [9480, 49],
   [8483, 47],
   [1633, 43],
   [9678, 51],
   [6596, 41],
   [4509, 40],
   [8618, 53],
   [9765, 49],
   [6346, 41],
   [2969, 42]],
  10: [[9120, 87],
   [2783, 120],
   [2057, 75],
   [6549, 105],
   [3201, 211],
   [7063, 86],
   [5243, 132],
   [3102, 116],
   [5303, 112],
   [5819, 155],
   [3693, 125],
   [4884, 124],
   [2231, 256],
   [5514, 78],
   [8850, 69],
   [6861, 130],
   [3106, 82],
   [2378, 157],
   [8697, 161],
   [1821, 99],
   [9480, 130],
   [8483, 194],
   [1633, 169],
   [9678, 150],
   [6596, 107],
   [4509, 104],
   [8618, 219],
   [9765, 203],
   [6346, 241],
   [2969, 138]],
  20: [[9120, 353],
   [2783, 161],
   [2057, 136],
   [6549, 1

In [44]:
import pandas as pd

# One CSV row per (sentence length, seed) run of the lr = 0.1 experiment; the number of
# steps goes into "hparams/epoch" and "hparams/validate_best" is filled with a constant 0.0.
export_columns = ["sentence_length", "seed", "hparams/epoch", "hparams/validate_best"]
export_rows = [
    [key, seed, step, .0]
    for key, runs in results_small[.1].items()
    for seed, step in runs
]
# NOTE(review): index=None is the DataFrame default, so the CSV still gets an index
# column; if the intent was to omit it, pass index=False to to_csv instead (confirm).
export_frame = pd.DataFrame(export_rows, columns=export_columns, index=None)
export_frame.to_csv("~/small-lstm.csv")