## Notes

If it stalls around loss 60, you might need a smaller batch size, e.g. 8



## Log

### 20180506
Why is this not working? It stalls around loss=60 and just reconstructs the same mean image each time.

- batchnorm and pad? no lots of other models use them
    - https://github.com/josephsmann/UnsupervisedDeepLearning-Pytorch/blob/jm/udlp/autoencoder/convVAE.py#L19
    - https://github.com/taey16/pix2pixBEGAN.pytorch/blob/master/models/BEGAN.py
- relu, same
- lr - similar to other models
- inner params - similar but larger than other working models
- batch? Maybe it seems better with a lower batch (e.g. 8 instead of 32)
- loss, this seems right
    - except some people use a loss balance https://github.com/AppliedDataSciencePartners/WorldModels/blob/master/vae/arch.py#L97
        where they multuply reconstruction loss by 10 but i've got ~100 vs ~1e-4 so I don't really need to!
        
- ok a lower batch size seemed to get it over that initial hump

In [1]:
import os
os.sys.path.append('.')

In [2]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [3]:
import torch
from torch.nn import functional as F
from torch.autograd import Variable
from torch import nn, optim
import torch.utils.data

# load as dask array
import dask.array as da
import dask
import h5py

import os
import glob
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

  from ._conv import register_converters as _register_converters


In [4]:
from vae import VAE, loss_function
from helpers.summarize import TorchSummarizeDf
from helpers.dataset import NumpyDataset, TQDMDaskProgressBar, load_npzs

In [5]:
env_name='sonic'
cuda= torch.cuda.is_available()
num_epochs=200
batch_size=6
data_cache_file = '/MLDATA/sonic/vae.hdf5'

# Load data

In [6]:
# load as dask array

filenames = sorted(glob.glob('./data/vae/obs_data_' + env_name + '_*.npz'))

if not os.path.isfile(data_cache_file):
    data_train = load_npzs(filenames)
    print(data_train)
    with TQDMDaskProgressBar():
        da.to_hdf5(data_cache_file, '/x', data_train)
       
    # clear mem
    del data_train 
    import gc
    gc.collect()

In [7]:
# load
data = da.from_array(h5py.File(data_cache_file)['x'], chunks=(2000, 128, 128, 3))
data
data_split = int(len(data)*0.8)
data_train = data[:data_split]
data_test = data[data_split:]
data_train, data_test

(dask.array<getitem, shape=(216000, 128, 128, 3), dtype=float32, chunksize=(2000, 128, 128, 3)>,
 dask.array<getitem, shape=(54000, 128, 128, 3), dtype=float32, chunksize=(2000, 128, 128, 3)>)

In [8]:
   
dataset_train = NumpyDataset(data_train)
loader_train = torch.utils.data.DataLoader(dataset_train, pin_memory=True, shuffle=False, batch_size=batch_size)


dataset_test = NumpyDataset(data_test)
loader_test = torch.utils.data.DataLoader(dataset_test, pin_memory=True, shuffle=False, batch_size=batch_size)

dataset_train, loader_train

(<helpers.dataset.NumpyDataset at 0x7f857cf324a8>,
 <torch.utils.data.dataloader.DataLoader at 0x7f857cf215c0>)

# View model

In [9]:
vae = VAE(image_size=128, z_dim=32, conv_dim=64, code_dim=8, k_dim=128).cuda()

In [12]:
state_dict = torch.load('./models/VAE_state_dict.pkl')
vae.load_state_dict(vae.state_dict())

In [13]:
img = np.random.randn(64, 64, 3)
img = np.random.randn(64*2, 64*2, 3)
gpu_img = Variable(torch.from_numpy(img[np.newaxis].transpose(0, 3, 1, 2))).float().cuda()

with TorchSummarizeDf(vae) as tdf:
    x, mu, logvar = vae.forward(gpu_img)
    print(x.size())
    print(loss_function(x, gpu_img, mu, logvar))
    x = x.data.cpu().numpy()[0].transpose(1, 2, 0)
    df = tdf.make_df()
df

torch.Size([1, 3, 128, 128])
Variable containing:
 248.6288
[torch.cuda.FloatTensor of size 1 (GPU 0)]

Total parameters 788736


Unnamed: 0,name,class_name,input_shape,output_shape,nb_params,level
1,encoder.0.conv,Conv2d,"[[(-1, 3, 128, 128)]]","[[(-1, 64, 128, 128)]]",1792,2
2,encoder.0.bn,BatchNorm2d,"[[(-1, 64, 128, 128)]]","[[(-1, 64, 128, 128)]]",128,2
3,encoder.0.act,ReLU,"[[(-1, 64, 128, 128)]]","[[(-1, 64, 128, 128)]]",0,2
4,encoder.0,ConvBlock,"[[(-1, 3, 128, 128)]]","[[(-1, 64, 128, 128)]]",1920,1
5,encoder.1.conv,Conv2d,"[[(-1, 64, 128, 128)]]","[[(-1, 128, 64, 64)]]",131200,2
6,encoder.1.bn,BatchNorm2d,"[[(-1, 128, 64, 64)]]","[[(-1, 128, 64, 64)]]",256,2
7,encoder.1.act,ReLU,"[[(-1, 128, 64, 64)]]","[[(-1, 128, 64, 64)]]",0,2
8,encoder.1,ConvBlock,"[[(-1, 64, 128, 128)]]","[[(-1, 128, 64, 64)]]",131456,1
9,encoder.2.conv,Conv2d,"[[(-1, 128, 64, 64)]]","[[(-1, 192, 32, 32)]]",393408,2
10,encoder.2.bn,BatchNorm2d,"[[(-1, 192, 32, 32)]]","[[(-1, 192, 32, 32)]]",384,2


## Train

In [14]:
import collections

def train(loader, net, optimizer, loss_function, test=False, cuda=True):
    if test:
        net.eval()
    else:
        net.train()
    info = collections.defaultdict(list)
    
    with tqdm(total=len(loader)*loader.batch_size, mininterval=0.5, desc='test' if test else 'training') as prog:
        for i, (batch,) in enumerate(loader):
            x = Variable(batch.transpose(1,3)).cuda() #*255 # FIXME (I divided by 255 once too many during gathering)
            y, mu, logvar = vae.forward(x)
            loss = loss_function(y, x, mu, logvar)
            info['loss'].append(loss.cpu().data.numpy()[0])
            
            if not test:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            prog.update(loader.batch_size)
            prog.desc='loss={:2.4f}'.format(np.mean(info['loss'][-100:]))
            
            if i%(100000//batch_size)==0:
                print('[{}/{}] loss={:2.4f}'.format(i, len(loader), np.mean(info['loss'][-100:])))
        print('[{}/{}] loss={:2.4f}'.format(i, len(loader), np.mean(info['loss'][-100:])))
        prog.close()
    return info

In [15]:
# Plot reconstructions
def plot_results(loader=loader_test, n=2, epoch=0):
    x, = next(iter(loader))

    X = Variable(x).cuda().transpose(1,3).contiguous()
    Y, mu, logvar = vae.forward(X)
    loss = loss_function(Y, X, mu, logvar)

    y=Y.cpu().data.transpose(1,3).numpy()
    for i in range(n):
        plt.subplot(1, 2, 1)
        plt.title('original')
        plt.imshow(x[i].numpy())

        plt.subplot(1, 2, 2)
        plt.imshow(y[i])
        plt.title('reconstructed, loss {:2.4f}'.format(loss.cpu().data.numpy()[0]))

        plt.suptitle('epoch {}, index {}, original'.format(epoch, i))
        plt.show()
        
# plot_results(loader=loader_test, n=2, epoch=epoch)

In [16]:
# Train

optimizer = optim.Adam(vae.parameters(), lr=1e-3)
import torch.optim.lr_scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=6, verbose=True)

In [17]:
infos=[]

In [18]:
for epoch in range(num_epochs): 
    info = train(loader_train, vae, optimizer, loss_function, test=False, cuda=True)
    info_val = train(loader_test, vae, optimizer, loss_function, test=True, cuda=True)
    scheduler.step(np.mean(info_val['loss']))
    
    print('Epoch {}, loss={:2.4f}, loss_val={:2.4f}'.format(epoch, np.mean(info['loss']), np.mean(info_val['loss'])))
    infos.append([info, info_val])
    
    plot_results(loader=loader_test, n=2, epoch=epoch)
    
    torch.save(vae, './models/VAE_{}.pkl'.format(epoch))

Widget Javascript not detected.  It may not be installed or enabled properly.


[0/36000] loss=81.0109



KeyboardInterrupt: 

In [None]:
torch.save(vae, './models/VAE.pkl')
torch.save(vae.state_dict(), './models/VAE_state_dict.pkl')

In [None]:
# plot history
histories = []
for info, info_val in infos:
    history = {k+'_val':np.mean(v) for k,v in info_val.items()}
    history.update({k:np.mean(v) for k,v in info.items()})
    histories.append(history)
histories = pd.DataFrame(histories)
histories.plot()

In [None]:
# Plot reconstructions
x, = next(iter(loader_test))

X = Variable(x).cuda().transpose(1,3).contiguous()
Y, mu, logvar = vae.forward(X)
loss = loss_function(Y, X, mu, logvar)

y=Y.cpu().data.transpose(1,3).numpy()
for i in range(2):
    plt.title('%s original'%i)
    plt.imshow(x[i].numpy())
    plt.show()
    
    plt.imshow(y[i])
    plt.title('%s reconstructed'%i)
    plt.show()

In [None]:
# check the balance of the two losses, in some situations we might want to reblance, e.g. if KLD>>l2_dist and it's stalled

# def loss_function(recon_x, x, mu, logvar):
#     n, c, h, w = recon_x.size()
#     recon_x = recon_x.view(n, -1)
#     x = x.view(n, -1)
#     # L2 distance
#     l2_dist = torch.sqrt(torch.sum(torch.pow(recon_x - x, 2), 1))
#     # see Appendix B from VAE paper:
#     # Kingma and Welling. Auto-Encoding Variational Bayes. ICLR, 2014
#     # https://arxiv.org/abs/1312.6114
#     # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
#     KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), 1)
#     print(l2_dist, KLD)
#     return torch.mean(l2_dist + KLD)
# loss_function(Y, X, mu, logvar)

# End

In [None]:
1