In [1]:
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import torch.sparse as sp
from torch.autograd import Variable

from tqdm import tqdm
import numpy as np
import torchsummary
from datetime import datetime
from sklearn.metrics import confusion_matrix
import scipy
import torch.autograd
from torch.nn.utils import clip_grad_norm_


from sklearn.preprocessing import normalize

In [2]:
def make_coo_to_sparse_tensor(coo_mat):
    values = coo_mat.data
    indices = np.vstack((coo_mat.row, coo_mat.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = coo_mat.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu") 

In [4]:
train_mat = scipy.sparse.load_npz('../data/mat_bin_train.npz')
eval_mat = scipy.sparse.load_npz('../data/mat_bin_validate.npz')

train_mat_normalized = normalize(X=train_mat, norm='l2', axis=1)

In [5]:
def row_sum_dim_1(sparse_tensor, device):
    x = torch.sparse.FloatTensor(
            indices=torch.stack([
                sparse_tensor._indices()[0],
                torch.LongTensor(1).to(device).zero_().expand_as(sparse_tensor._indices()[0]),
            ]),
            values=sparse_tensor._values(),
            size=[sparse_tensor.shape[0], 1]).to(device)

    x.coalesce()
    return x.to_dense().squeeze()

def sparse_dense_mul(s, d):
  i = s._indices()
  v = s._values()
  dv = d[i[0,:], i[1,:]]  # get values from relevant entries of dense matrix
  return torch.sparse.FloatTensor(i, v * dv, s.size()).to_dense()

def sparse_dense_mul_sparse_output(s, d):
      i = s._indices()
      v = s._values()
      dv = d[i[0,:], i[1,:]]  # get values from relevant entries of dense matrix
      return torch.sparse.FloatTensor(i, v * dv, s.size())def DenseLayers(in_channels, out_channels, batch_norm=False, dropout_rate=None, activation=False):
        individual_layers = []
            
        if not dropout_rate is None:
            individual_layers.append(nn.Dropout(p=dropout_rate))
            
        individual_layers.append(nn.Linear(in_channels, out_channels))

        if activation is not None:
            individual_layers.append(nn.Tanh())
            
        if batch_norm is not None:
            individual_layers.append(nn.BatchNorm1d(num_features=out_channels))

        return nn.Sequential(*individual_layers)
    
class Encoder(nn.Module):
    def __init__(self, k, dense_layers, batch_norm, activation, dropout_rate):
        super().__init__()
        self.k  = k
        self.additional_layer = len(dense_layers)
        if self.additional_layer > 2:
            all_layers = [DenseLayers(in_channels=in_channels, out_channels=out_channels, dropout_rate=dropout_rate,
                                                          batch_norm=batch_norm, activation=activation) for (in_channels, out_channels) in zip(dense_layers[1:-1], dense_layers[2:])]
            
            all_layers[0:0] = [DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)]
            self.dense_network = nn.Sequential(*all_layers)
        elif self.additional_layer > 1:
            self.dense_network = DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)
        else:
            pass
        self.out_layer = nn.Linear(dense_layers[-1], 2 * k)
        
    def forward(self, X):
        if self.additional_layer > 1:
            X = self.dense_network(X)
        X = self.out_layer(X)
        mu, log_sigmas = torch.split(X, self.k, dim=1) 
        # cap sigma to value for training stability
        #sigmas = torch.clamp(input=sigmas, min=-np.inf, max=2) 
        return mu, log_sigmas
    
class Decoder(nn.Module):
    def __init__(self, n_items, dense_layers, batch_norm, activation, dropout_rate):
        super().__init__()
        self.n_items = n_items
        self.additional_layer = len(dense_layers)
        if self.additional_layer > 2:
            all_layers = [DenseLayers(in_channels=in_channels, out_channels=out_channels, dropout_rate=dropout_rate,
                                                          batch_norm=batch_norm, activation=activation) for (in_channels, out_channels) in zip(dense_layers[1:-1], dense_layers[2:])]
            
            all_layers[0:0] = [DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)]
            print(all_layers)
            self.dense_network = nn.Sequential(*all_layers)
        elif self.additional_layer > 1:
            self.dense_network = DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)
        else:
            pass
        self.out_layer = nn.Linear(dense_layers[-1], n_items)
        
    def forward(self, X):
        if self.additional_layer > 1:
            X = self.dense_network(X)
        X = self.out_layer(X)
        return F.log_softmax(X, dim=1)

In [6]:
def DenseLayers(in_channels, out_channels, batch_norm=False, dropout_rate=None, activation=False):
        individual_layers = []
            
        if not dropout_rate is None:
            individual_layers.append(nn.Dropout(p=dropout_rate))
            
        individual_layers.append(nn.Linear(in_channels, out_channels))

        if activation is not None:
            individual_layers.append(nn.Tanh())
            
        if batch_norm is not None:
            individual_layers.append(nn.BatchNorm1d(num_features=out_channels))

        return nn.Sequential(*individual_layers)
    
class Encoder(nn.Module):
    def __init__(self, k, dense_layers, batch_norm, activation, dropout_rate):
        super().__init__()
        self.k  = k
        self.additional_layer = len(dense_layers)
        if self.additional_layer > 2:
            all_layers = [DenseLayers(in_channels=in_channels, out_channels=out_channels, dropout_rate=dropout_rate,
                                                          batch_norm=batch_norm, activation=activation) for (in_channels, out_channels) in zip(dense_layers[1:-1], dense_layers[2:])]
            
            all_layers[0:0] = [DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)]
            self.dense_network = nn.Sequential(*all_layers)
        elif self.additional_layer > 1:
            self.dense_network = DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)
        else:
            pass
        self.out_layer = nn.Linear(dense_layers[-1], 2 * k)
        
    def forward(self, X):
        if self.additional_layer > 1:
            X = self.dense_network(X)
        X = self.out_layer(X)
        mu, log_sigmas = torch.split(X, self.k, dim=1) 
        # cap sigma to value for training stability
        #sigmas = torch.clamp(input=sigmas, min=-np.inf, max=2) 
        return mu, log_sigmas
    
class Decoder(nn.Module):
    def __init__(self, n_items, dense_layers, batch_norm, activation, dropout_rate):
        super().__init__()
        self.n_items = n_items
        self.additional_layer = len(dense_layers)
        if self.additional_layer > 2:
            all_layers = [DenseLayers(in_channels=in_channels, out_channels=out_channels, dropout_rate=dropout_rate,
                                                          batch_norm=batch_norm, activation=activation) for (in_channels, out_channels) in zip(dense_layers[1:-1], dense_layers[2:])]
            
            all_layers[0:0] = [DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)]
            print(all_layers)
            self.dense_network = nn.Sequential(*all_layers)
        elif self.additional_layer > 1:
            self.dense_network = DenseLayers(in_channels=dense_layers[0], out_channels=dense_layers[1], batch_norm=batch_norm, dropout_rate=None, activation=activation)
        else:
            pass
        self.out_layer = nn.Linear(dense_layers[-1], n_items)
        
    def forward(self, X):
        if self.additional_layer > 1:
            X = self.dense_network(X)
        X = self.out_layer(X)
        return F.log_softmax(X, dim=1)

In [7]:
class Mult_VAE(nn.Module):
    def __init__(self, device, beta, k, n_items, dense_layers_encoder, dense_layers_decoder, batch_norm_encoder, batch_norm_decoder,
                 dropout_rate_encoder, dropout_rate_decoder, optimizer=None, writer=None):
        super().__init__()
        self.k = k
        self.Encoder = Encoder(k=k, dense_layers=dense_layers_encoder, batch_norm=batch_norm_encoder, activation=True, dropout_rate=dropout_rate_encoder)
        self.Decoder = Decoder(n_items=n_items, dense_layers=dense_layers_decoder, batch_norm=batch_norm_decoder, activation=True, dropout_rate=dropout_rate_decoder)
        self.device = device
        self.beta = beta
        self.optimizer = optimizer
        self.writer = writer
    
    def forward(self, X):
        mu, log_sigmas = self.Encoder(X)
        z = Variable(torch.randn(X.shape[0], self.k,  requires_grad=False)).to(self.device)
        log_pi = self.Decoder(z.mul(torch.sqrt(torch.exp(log_sigmas))).add(mu))
        return mu, log_sigmas, log_pi
    
    def loss(self, X, mu, log_sigmas, log_pi): # be careful don't overwrite some pre-defined loss from the super class.
        """self.writer.add_histogram('mu', mu, global_step=self.epoch)
        self.writer.add_histogram('sigmas', sigmas, global_step=self.epoch)
        self.writer.add_histogram('pi', pi, global_step=self.epoch)"""
        
        #log_likelihood = sparse_dense_mul(X, log_pi).sum(dim=1)
        log_likelihood = torch.sparse.sum(sparse_dense_mul_sparse_output(X, log_pi), dim=1).to_dense().to(self.device)
        
        #log_likelihood = torch.mul(X, pi).sum(dim=1)
        #print(f"log_sigmas.mean(): {log_sigmas.mean()}, log_sigmas.max(): {log_sigmas.max()}, log_sigmas.min(): {log_sigmas.min()}")
        KL = torch.sum(0.5 * (-log_sigmas + torch.exp(log_sigmas) + mu ** 2 - 1), dim=1) * self.beta
        
        """print(f"KL: {KL.sum()}, log_likelihood: {log_likelihood.sum()}")
        print(f"mu: {mu.sum()}, log_sigmas: {log_sigmas.sum()}, log_pi: {log_pi.sum()}")
        
        self.writer.add_histogram('log_likelihood', log_likelihood, global_step=self.epoch)
        self.writer.add_histogram('KL_part1', KL_part1, global_step=self.epoch)
        self.writer.add_histogram('KL_part2', KL_part2, global_step=self.epoch)
        self.writer.add_histogram('KL_part3', KL_part3, global_step=self.epoch)
        self.writer.add_histogram('KL', KL, global_step=self.epoch)"""
        
        #KL = 0.5 * (torch.sum(torch.log(sigmas), dim=1) - self.k + torch.sum((1/(sigmas + 1e-10)), dim=1) + torch.sum(torch.mul(sigmas, torch.mul(mu, mu)), dim=1))
        #print(f"log_likleihood: {log_likelihood}, part_1: {KL_part1}, part_2: {KL_part2}, part_3: {KL_part3}, KL: {KL}")
        #print(f"KL.shape: {KL.shape}, log_likelihood.shape: {log_likelihood.shape}")
        return torch.sum(torch.add(KL, log_likelihood))
    
    def forward_pass(self, X):
        self.train(True)
        self.zero_grad()
        mu, log_sigmas, log_pi = self.__call__(X)
        loss = self.loss(X=X, mu=mu, log_sigmas=log_sigmas, log_pi=log_pi)
        loss.backward()

         #print(f"loss: {loss}")
        """for name, param in self.named_parameters():
            if param.requires_grad:
                print(f"{name}: {param.grad.sum()}")"""
                
        # Gradient clipping
        clip_grad_norm_(self.parameters(), 20)

        self.optimizer.step()
        return loss

    def batch_run(self, X, batch_size, verbose=0):
        avr_loss = 0.0
        
        if verbose > 0:
            for batch_idx in tqdm(range(0, X.shape[0], batch_size)):
                batch_X = make_coo_to_sparse_tensor(X[batch_idx:(batch_idx + batch_size), :].tocoo()).to(self.device)
                cur_loss = self.forward_pass(batch_X)
                avr_loss += cur_loss
        else:
            for batch_idx in range(0, X.shape[0], batch_size):
                batch_X = make_coo_to_sparse_tensor(X[batch_idx:(batch_idx + batch_size), :].tocoo()).to(self.device)
                cur_loss = self.forward_pass(batch_X)
                avr_loss += cur_loss
        
        return avr_loss
    
    def train_model(self, X_train,  batch_size, epochs, verbose=0):          
        if self.optimizer is None:
            raise ValueError("Please assign first an optimizer to the model with model.optimizer = torch.optim.optimizer")
        
        if not self.writer is None:
            # Log the model graph with a small example.
            self.writer.add_graph(self, make_coo_to_sparse_tensor(X_train[:min(X_train.shape[0], 10)].tocoo()).to(self.device))

        for epoch in tqdm(range(epochs)):
            self.epoch=epoch # RM LATER!!
            avr_loss_train  = self.batch_run(X=X_train, batch_size=batch_size, verbose=0)

            if not self.writer is None:
                self.writer.add_scalar('loss_train', avr_loss_train, global_step=epoch)
                """for name, params in self.named_parameters():
                    self.writer.add_histogram(name, params, global_step=epoch)"""

            if verbose > 0:
                print(f" test Performance in epoch {epoch} was {avr_loss_train}.")

In [8]:
test = Mult_VAE(device=device, k = 10, n_items = train_mat.shape[1], dense_layers_encoder=[train_mat.shape[1], 1000,100, 100], dense_layers_decoder=[10, 100, 100, 1000], batch_norm_encoder=True,
                batch_norm_decoder=True, dropout_rate_decoder=0.5, dropout_rate_encoder=0.5, beta=0.2).to(device)

time_stamp = ''.join(str(datetime.now()).split('.')[:-1]).replace(' ', '_').replace(':', '_').replace('-', '_')
test.writer = SummaryWriter('./logs/' + time_stamp)

test.optimizer = optim.Adam(test.parameters(), lr=1e-3)
test.train_model(X_train=train_mat, batch_size=500, epochs=100, verbose=1)

[Sequential(
  (0): Linear(in_features=10, out_features=100, bias=True)
  (1): Tanh()
  (2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
), Sequential(
  (0): Dropout(p=0.5, inplace=False)
  (1): Linear(in_features=100, out_features=100, bias=True)
  (2): Tanh()
  (3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
), Sequential(
  (0): Dropout(p=0.5, inplace=False)
  (1): Linear(in_features=100, out_features=1000, bias=True)
  (2): Tanh()
  (3): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)]


	%140 : Float(10, 10) = aten::randn(%135, %136, %137, %138, %139), scope: Mult_VAE # <ipython-input-7-9d0e8a998e3a>:15:0
This may cause errors in trace checking. To disable trace checking, pass check_trace=False to torch.jit.trace()
  check_tolerance, _force_outplace, True, _module_class)
Not within tolerance rtol=1e-05 atol=1e-05 at input[4, 19463] (-9.855230331420898 vs. -10.529387474060059) and 262010 other locations (99.00%)
  check_tolerance, _force_outplace, True, _module_class)
  1%|          | 1/100 [00:49<1:21:46, 49.56s/it]

 test Performance in epoch 0 was -673633600.0.


  2%|▏         | 2/100 [01:38<1:20:43, 49.42s/it]

 test Performance in epoch 1 was -2401815040.0.


  3%|▎         | 3/100 [02:27<1:19:43, 49.31s/it]

 test Performance in epoch 2 was -4766439936.0.


  4%|▍         | 4/100 [03:16<1:18:49, 49.27s/it]

 test Performance in epoch 3 was -7137920000.0.


  5%|▌         | 5/100 [04:07<1:18:43, 49.72s/it]

 test Performance in epoch 4 was -9834000384.0.


  6%|▌         | 6/100 [04:56<1:17:38, 49.56s/it]

 test Performance in epoch 5 was -12470851584.0.


  7%|▋         | 7/100 [05:45<1:16:37, 49.43s/it]

 test Performance in epoch 6 was -14209414144.0.


  8%|▊         | 8/100 [06:35<1:15:38, 49.34s/it]

 test Performance in epoch 7 was -16338203648.0.


  9%|▉         | 9/100 [07:24<1:14:44, 49.28s/it]

 test Performance in epoch 8 was -19025678336.0.


 10%|█         | 10/100 [08:13<1:13:53, 49.26s/it]

 test Performance in epoch 9 was -22465705984.0.


 11%|█         | 11/100 [09:02<1:13:05, 49.27s/it]

 test Performance in epoch 10 was -26476201984.0.


 12%|█▏        | 12/100 [09:51<1:12:05, 49.15s/it]

 test Performance in epoch 11 was -30945871872.0.


 13%|█▎        | 13/100 [10:40<1:11:06, 49.05s/it]

 test Performance in epoch 12 was -35843248128.0.


 14%|█▍        | 14/100 [11:29<1:10:20, 49.07s/it]

 test Performance in epoch 13 was -40803627008.0.


 15%|█▌        | 15/100 [12:18<1:09:36, 49.14s/it]

 test Performance in epoch 14 was -46130581504.0.


 16%|█▌        | 16/100 [13:08<1:08:48, 49.15s/it]

 test Performance in epoch 15 was -52454580224.0.


 17%|█▋        | 17/100 [13:57<1:08:00, 49.17s/it]

 test Performance in epoch 16 was -59365064704.0.


 18%|█▊        | 18/100 [14:46<1:07:11, 49.17s/it]

 test Performance in epoch 17 was -67157442560.0.


 19%|█▉        | 19/100 [15:35<1:06:23, 49.17s/it]

 test Performance in epoch 18 was -75120451584.0.


 20%|██        | 20/100 [16:24<1:05:34, 49.18s/it]

 test Performance in epoch 19 was -83316875264.0.


 21%|██        | 21/100 [17:14<1:04:53, 49.28s/it]

 test Performance in epoch 20 was -92633710592.0.


 22%|██▏       | 22/100 [18:03<1:04:06, 49.32s/it]

 test Performance in epoch 21 was -103016177664.0.


 23%|██▎       | 23/100 [18:52<1:03:15, 49.29s/it]

 test Performance in epoch 22 was -113826299904.0.


 24%|██▍       | 24/100 [19:43<1:02:59, 49.73s/it]

 test Performance in epoch 23 was -125723746304.0.


 25%|██▌       | 25/100 [20:33<1:02:11, 49.75s/it]

 test Performance in epoch 24 was -138027728896.0.


 26%|██▌       | 26/100 [21:23<1:01:35, 49.94s/it]

 test Performance in epoch 25 was -151447060480.0.


 27%|██▋       | 27/100 [22:13<1:00:42, 49.90s/it]

 test Performance in epoch 26 was -165187698688.0.


 28%|██▊       | 28/100 [23:03<59:50, 49.87s/it]  

 test Performance in epoch 27 was -180930052096.0.


 29%|██▉       | 29/100 [23:53<59:05, 49.94s/it]

 test Performance in epoch 28 was -197942001664.0.


 30%|███       | 30/100 [24:43<58:20, 50.00s/it]

 test Performance in epoch 29 was -215263019008.0.


KeyboardInterrupt: 

In [15]:
x = torch.sparse.FloatTensor(
    indices=torch.LongTensor([[0, 1, 1], [2, 0, 2]]),
    values=torch.FloatTensor([3, 4, 5]),
    size=[2, 3])

In [18]:
x.to_dense()

tensor([[0., 0., 3.],
        [4., 0., 5.]])

In [19]:
x.to_dense().sum(dim=1)

tensor([3., 9.])

In [25]:
torch.sparse.sum(x, dim=1).to_dense()

tensor([3., 9.])

AttributeError: 'Tensor' object has no attribute 'todense'

In [29]:
def row_sum_dim_1(sparse_tensor):
    x = torch.sparse.FloatTensor(
            indices=torch.stack([
                sparse_tensor._indices()[0],
                torch.LongTensor(1).zero_().expand_as(sparse_tensor._indices()[0]),
            ]),
            values=sparse_tensor._values(),
            size=[sparse_tensor.shape[0], 1])

    x.coalesce()
    return x.to_dense().squeeze()

In [30]:
row_sum_dim_1(x)

tensor([3., 9.])

In [43]:
next(test.parameters())

Parameter containing:
tensor([[ 0.0064,     nan, -0.0148,  ...,     nan,     nan,     nan],
        [ 0.0147,     nan, -0.0120,  ...,     nan,     nan,     nan],
        [ 0.0124,     nan,  0.0091,  ...,     nan,     nan,     nan],
        ...,
        [-0.0115,     nan, -0.0006,  ...,     nan,     nan,     nan],
        [ 0.0030,     nan, -0.0130,  ...,     nan,     nan,     nan],
        [-0.0078,     nan,  0.0139,  ...,     nan,     nan,     nan]],
       device='cuda:0', requires_grad=True)

In [37]:
for name, param in test.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.grad.sum()}")

Encoder.dense_network.0.0.weight: nan
Encoder.dense_network.0.0.bias: nan
Encoder.dense_network.1.0.weight: nan
Encoder.dense_network.1.0.bias: nan
Encoder.dense_network.2.0.weight: nan
Encoder.dense_network.2.0.bias: nan
Encoder.out_layer.weight: nan
Encoder.out_layer.bias: nan
Decoder.dense_network.0.0.weight: nan
Decoder.dense_network.0.0.bias: nan
Decoder.dense_network.1.0.weight: nan
Decoder.dense_network.1.0.bias: nan
Decoder.dense_network.2.0.weight: nan
Decoder.dense_network.2.0.bias: nan
Decoder.out_layer.weight: nan
Decoder.out_layer.bias: nan


In [35]:
param.grad

tensor([[0., nan, 0.,  ..., nan, nan, nan],
        [0., nan, 0.,  ..., nan, nan, nan],
        [0., nan, 0.,  ..., nan, nan, nan],
        ...,
        [0., nan, 0.,  ..., nan, nan, nan],
        [0., nan, 0.,  ..., nan, nan, nan],
        [0., nan, 0.,  ..., nan, nan, nan]], device='cuda:0')

In [31]:
name

'Encoder.dense_network.0.0.weight'

In [13]:
list(test.named_parameters())

[('Encoder.dense_network.0.0.weight', Parameter containing:
  tensor([[-0.0074,     nan, -0.0067,  ...,     nan,     nan,     nan],
          [ 0.0101,     nan,  0.0015,  ...,     nan,     nan,     nan],
          [ 0.0149,     nan,  0.0033,  ...,     nan,     nan,     nan],
          ...,
          [ 0.0157,     nan, -0.0033,  ...,     nan,     nan,     nan],
          [ 0.0045,     nan, -0.0097,  ...,     nan,     nan,     nan],
          [-0.0010,     nan,  0.0100,  ...,     nan,     nan,     nan]],
         device='cuda:0', requires_grad=True)),
 ('Encoder.dense_network.0.0.bias', Parameter containing:
  tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
          nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, n