In [1]:
from typing import Iterable, List

import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F

# Prefer the GPU when one is available, but fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
def convert_to_binary(n: int, length: int) -> np.ndarray:
    """Encode a non-negative integer as a fixed-width vector of binary digits.

    The most significant bit comes first and the vector is left-padded with
    zeros, e.g. ``convert_to_binary(5, 8)`` gives ``[0, 0, 0, 0, 0, 1, 0, 1]``.

    Args:
        n: Non-negative integer to encode. Any object implementing
            ``__index__`` is accepted (the same contract ``bin()`` has).
        length: Number of entries in the returned vector.

    Returns:
        A ``float32`` NumPy array of shape ``(length,)`` holding 0.0 / 1.0.

    Raises:
        ValueError: If ``n`` is negative or needs more than ``length`` bits.
    """
    import operator

    # Normalise to a plain Python int up front so the checks below and the
    # error messages work for any integer-like input.
    value = operator.index(n)
    if value < 0:
        raise ValueError(f'n must be non-negative, got {value}')

    digits = [int(d) for d in bin(value)[2:]]
    if len(digits) > length:
        raise ValueError(
            f'{value} needs {len(digits)} bits, which does not fit into length={length}'
        )

    result = np.zeros(length, dtype=np.float32)
    # Right-align the digits; everything to the left stays zero-padded.
    result[length - len(digits):] = digits

    return result


class EmbedNumber(nn.Module):
    """Embed integer ids by linearly projecting their standardized binary codes.

    Each id is turned into a ``max_d``-long code by the module-level
    ``convert_to_binary`` function (resolved when ``forward`` runs), every code
    column is standardized across the batch, and the result is passed through
    a single linear layer.
    """

    def __init__(self, max_d: int, emb_size: int):
        """
        Args:
            max_d: Length of the binary code produced for every id; also the
                input width of the linear layer.
            emb_size: Size of the output embedding.
        """
        super().__init__()

        self.max_d = max_d
        self.layer = nn.Linear(max_d, emb_size)

    def forward(self, x: Tensor) -> Tensor:
        """Embed a batch of ids.

        Args:
            x: 1-D tensor of integer ids.

        Returns:
            Tensor with one ``emb_size``-wide row per id, on ``x``'s device.
        """
        # One bulk conversion to Python ints instead of touching the tensor
        # once per element (which forces a device sync each time on GPU).
        codes = np.asarray(
            [convert_to_binary(d, self.max_d) for d in x.tolist()],
            dtype=np.float32,
        )
        x_binary = torch.from_numpy(codes).to(x.device)

        mean = x_binary.mean(dim=0, keepdim=True)
        std = x_binary.std(dim=0, keepdim=True)
        # A column that is constant over the batch has std == 0, and a
        # single-element batch has std == NaN. Dividing by either would fill
        # the output with NaN, so such columns are only centred (divisor 1).
        std = torch.where(std > 0, std, torch.ones_like(std))
        x_binary = (x_binary - mean) / std

        return self.layer(x_binary)

In [6]:
from tqdm import tqdm

# Experiment configuration: regress `ds_size` random `z_dim`-dimensional
# targets from their integer index.
ds_size = 10000
z_dim = 16
max_num_iters = 500
emb_size = 2048
max_d = 32
hid_size = 512

dataset = torch.randn(ds_size, z_dim).to(DEVICE)
numbers = torch.arange(ds_size).to(DEVICE)

# One random 0/1 code per index. The width is tied to `max_d` (previously a
# literal 32) so it always matches the input width of EmbedNumber(max_d, ...).
binary_codes = (torch.rand(ds_size, max_d) > 0.5).float().numpy()


def convert_to_binary(n: int, length: int) -> np.ndarray:
    """Return the pre-generated random binary code stored for index ``n``.

    This intentionally replaces the bit-expansion ``convert_to_binary``
    defined earlier in the notebook: ``EmbedNumber.forward`` looks the name up
    when it runs, so once this cell has executed it uses these random codes.

    Args:
        n: Row index into the module-level ``binary_codes`` table. Any object
            implementing ``__index__`` is accepted.
        length: Expected code length; must equal the width of ``binary_codes``.

    Returns:
        The row of ``binary_codes`` at index ``n``.

    Raises:
        ValueError: If the stored code is not ``length`` entries long.
    """
    import operator

    code = binary_codes[operator.index(n)]
    # `length` used to be ignored; fail loudly here instead of with an opaque
    # shape mismatch further downstream.
    if len(code) != length:
        raise ValueError(
            f'binary_codes rows have {len(code)} entries, but length={length} was requested'
        )

    return code

# Network: number embedding, then four Linear -> BatchNorm -> ReLU groups of
# width `hid_size`, then a linear read-out to `z_dim`.
layers = [EmbedNumber(max_d, emb_size), nn.BatchNorm1d(emb_size), nn.ReLU()]
for in_features in (emb_size, hid_size, hid_size, hid_size):
    layers += [
        nn.Linear(in_features, hid_size),
        nn.BatchNorm1d(hid_size),
        nn.ReLU(),
    ]
layers.append(nn.Linear(hid_size, z_dim))

model = nn.Sequential(*layers).to(DEVICE)

optim = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.2,
    patience=50,
    threshold=0.005,
    verbose=True,
)

# Full-batch training: every step sees all indices and all targets.
for step in tqdm(range(max_num_iters)):
    optim.zero_grad()

    preds = model(numbers)
    loss = F.mse_loss(preds, dataset)
    loss.backward()
    optim.step()

    # The scheduler lowers the learning rate when the training loss plateaus.
    scheduler.step(loss)

    if not step % 100:
        print(f'Loss: {loss.item():.05f}')


  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [00:00<03:45,  2.21it/s][A

Loss: 1.19155



  0%|          | 2/500 [00:00<03:35,  2.31it/s][A
  1%|          | 3/500 [00:01<03:30,  2.36it/s][A
  1%|          | 4/500 [00:01<03:26,  2.40it/s][A
  1%|          | 5/500 [00:02<03:23,  2.43it/s][A
  1%|          | 6/500 [00:02<03:19,  2.47it/s][A
  1%|▏         | 7/500 [00:02<03:16,  2.51it/s][A
  2%|▏         | 8/500 [00:03<03:14,  2.53it/s][A
  2%|▏         | 9/500 [00:03<03:23,  2.41it/s][A
  2%|▏         | 10/500 [00:04<03:21,  2.43it/s][A
  2%|▏         | 11/500 [00:04<03:17,  2.47it/s][A
  2%|▏         | 12/500 [00:04<03:15,  2.50it/s][A
  3%|▎         | 13/500 [00:05<03:12,  2.53it/s][A
  3%|▎         | 14/500 [00:05<03:11,  2.54it/s][A
  3%|▎         | 15/500 [00:06<03:09,  2.55it/s][A
  3%|▎         | 16/500 [00:06<03:08,  2.56it/s][A
  3%|▎         | 17/500 [00:06<03:18,  2.43it/s][A
  4%|▎         | 18/500 [00:07<03:19,  2.41it/s][A
  4%|▍         | 19/500 [00:07<03:15,  2.46it/s][A
  4%|▍         | 20/500 [00:08<03:12,  2.50it/s][A
  4%|▍         | 21

KeyboardInterrupt: 

In [3]:
from tqdm import tqdm

# Experiment configuration: a single-layer LSTM is trained to emit a fixed
# random sequence, predicting element t from the elements before it.
ds_size = 10000
z_dim = 32
hid_size = 2048
max_num_iters = 1000
dataset = torch.randn(ds_size, z_dim).to(DEVICE)

model = nn.LSTM(z_dim, hid_size, num_layers=1, batch_first=True)
output = nn.Sequential(nn.Linear(hid_size, z_dim)).to(DEVICE)
model = model.to(DEVICE)

# Both the recurrent core and the read-out head are optimised together.
optim = torch.optim.Adam(
    [*model.parameters(), *output.parameters()],
    lr=1e-3,
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.2,
    patience=50,
    threshold=0.005,
    verbose=True,
)

# Shift by one step: a zero row starts the inputs and ends the targets.
boundary = torch.zeros(1, z_dim).to(DEVICE)
inputs = torch.cat([boundary, dataset])
targets = torch.cat([dataset, boundary])

for step in tqdm(range(max_num_iters)):
    # The whole sequence is processed as one batch element of length ds_size + 1.
    hiddens = model(inputs[None])[0].squeeze(0)
    preds = output(hiddens)
    loss = F.mse_loss(preds, targets)

    optim.zero_grad()
    loss.backward()
    optim.step()

    scheduler.step(loss)

    if not step % 100:
        print(f'Loss: {loss.item()}')

  0%|          | 1/1000 [00:02<40:59,  2.46s/it]

Loss: 0.9995099902153015


 10%|█         | 101/1000 [07:30<1:31:48,  6.13s/it]

Loss: 0.6497169137001038


 20%|██        | 201/1000 [17:56<1:24:49,  6.37s/it]

Loss: 0.11679525673389435


 30%|███       | 301/1000 [29:02<1:01:58,  5.32s/it]

Loss: 0.016076019033789635


 40%|████      | 401/1000 [39:25<1:06:23,  6.65s/it]

Loss: 0.0030346722342073917


 50%|█████     | 501/1000 [50:18<52:05,  6.26s/it]  

Loss: 0.0013863989152014256


 60%|██████    | 601/1000 [1:00:39<38:49,  5.84s/it]

Loss: 0.0006943172193132341


 67%|██████▋   | 672/1000 [1:07:29<30:02,  5.50s/it]

Epoch   671: reducing learning rate of group 0 to 2.0000e-04.


 70%|███████   | 701/1000 [1:10:16<29:23,  5.90s/it]

Loss: 0.00021488837955985218


 80%|████████  | 801/1000 [1:19:45<19:08,  5.77s/it]

Loss: 0.00019937483011744916


 90%|█████████ | 901/1000 [1:29:20<09:14,  5.61s/it]

Loss: 0.00019017573504243046


100%|██████████| 1000/1000 [1:38:50<00:00,  5.93s/it]


In [4]:
from tqdm import tqdm

# Experiment configuration: an MLP autoencoder reconstructs random vectors
# through a `bottleneck_dim`-wide bottleneck.
ds_size = 1024
z_dim = 64
hid_size = 1024
max_num_iters = 5000
bottleneck_dim = 16
dataset = torch.randn(ds_size, z_dim).to(DEVICE)

encoder_layers = [
    nn.Linear(z_dim, hid_size),
    nn.ReLU(),
    nn.BatchNorm1d(hid_size),
    nn.Linear(hid_size, bottleneck_dim),
    nn.ReLU(),
    nn.BatchNorm1d(bottleneck_dim),
]
decoder_layers = [
    nn.Linear(bottleneck_dim, hid_size),
    nn.ReLU(),
    nn.BatchNorm1d(hid_size),
    # NOTE(review): this second ReLU/BatchNorm pair has no Linear in front of
    # it -- possibly a copy-paste leftover. Kept as-is; confirm it is intended.
    nn.ReLU(),
    nn.BatchNorm1d(hid_size),
    nn.Linear(hid_size, z_dim),
]

model = nn.Sequential(*encoder_layers, *decoder_layers).to(DEVICE)

optim = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optim,
    factor=0.2,
    patience=50,
    threshold=0.005,
    verbose=True,
)

# Full-batch reconstruction training: the input is also the target.
for step in range(max_num_iters):
    optim.zero_grad()

    preds = model(dataset)
    loss = F.mse_loss(preds, dataset)
    loss.backward()
    optim.step()

    scheduler.step(loss)

    if not step % 500:
        print(f'Loss: {loss.item():.05f}')

Loss: 1.34892



  3%|▎         | 168/5000 [00:20<02:07, 37.75it/s][A

Loss: 0.01501
Loss: 0.00158
Epoch  1484: reducing learning rate of group 0 to 2.0000e-04.
Loss: 0.00019
Loss: 0.00012
Loss: 0.00007
Loss: 0.00004
Loss: 0.00002
Loss: 0.00001
Epoch  4351: reducing learning rate of group 0 to 4.0000e-05.
Loss: 0.00000


KeyboardInterrupt: 