In [2]:
import sys

# Make the parent directory importable so the `mmvae` package imported by the
# cells below can be resolved (presumably it lives one level up - confirm).
# Guarded so that re-running this cell does not keep appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')


In [3]:
"""
Core VAE Architecture Model:

    - Encoder:
        - Input vector of size 60664
        - Output Mean and Variance vectors of size 64 for latent space

    - Decoder:
        - Input latent space vector of size 64
        - Output reconstructed input of size 60664
    
    - Using Xavier weight initialization

    - Using ReLU activation functions throughout

    - Batch normalization after each hidden linear layer (applied before the ReLU activation)

    - Usage:
        - Set model in trainer: model = Arch_Model.VAE()
"""

import torch
import torch.nn as nn
import mmvae.models.utils as utils
import mmvae.models as M

class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps an ``input_dim`` vector through 1024 -> 512 -> 256 -> 64
    units. Two linear heads (``fc_mu`` and ``fc_var``) turn that 64-unit
    representation into the mean and log-variance of the latent Gaussian.
    The decoder maps a ``latent_dim`` vector through 256 -> 512 -> 1024 units
    back to ``input_dim``.

    Every hidden ``Linear`` is followed by ``BatchNorm1d`` and then ``ReLU``;
    the last ``Linear`` of the encoder and of the decoder is followed by
    ``ReLU`` only, so reconstructions are non-negative.

    Args:
        input_dim: Size of each input (and reconstructed) vector.
            Defaults to 60664.
        latent_dim: Size of the latent mean / log-variance vectors.
            Defaults to 64.
    """

    def __init__(self, input_dim=60664, latent_dim=64):
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # NOTE(review): the original code wrote `nn.BatchNorm1d(n, 1)`. The
        # second positional argument of BatchNorm1d is `eps`, so this sets
        # eps=1 instead of the library default of 1e-5. It is kept (now as an
        # explicit keyword) to preserve behaviour - confirm it is intended.

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.BatchNorm1d(1024, eps=1),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512, eps=1),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256, eps=1),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU()
        )

        # Latent space heads: mean and log-variance of q(z|x)
        self.fc_mu = nn.Linear(64, latent_dim)
        self.fc_var = nn.Linear(64, latent_dim)

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.BatchNorm1d(256, eps=1),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.BatchNorm1d(512, eps=1),
            nn.ReLU(),
            nn.Linear(512, 1024),
            nn.BatchNorm1d(1024, eps=1),
            nn.ReLU(),
            nn.Linear(1024, input_dim),
            nn.ReLU(),
        )

        # Weight initialisation via the project helpers.
        # NOTE(review): `utils` must be `mmvae.models.utils` here; a later cell
        # rebinds the name `utils` to `mmvae.trainers.utils`, so instantiate
        # this class before that cell runs (or from the packaged module).
        utils._submodules_init_weights_xavier_uniform_(self.encoder)
        utils._submodules_init_weights_xavier_uniform_(self.decoder)
        utils._submodules_init_weights_xavier_uniform_(self.fc_mu)
        # NOTE(review): the meaning of -1.0 is defined by the helper
        # (presumably a bias fill value for the log-variance head) - confirm.
        utils._xavier_uniform_(self.fc_var, -1.0)

    def encode(self, x):
        """Run the encoder and return the ``(mu, logvar)`` pair for ``x``."""
        x = self.encoder(x)
        return self.fc_mu(x), self.fc_var(x)

    def decode(self, z):
        """Map a latent vector ``z`` back to input space with the decoder."""
        return self.decoder(z)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``.

        ``std`` is derived from the log-variance as ``exp(0.5 * logvar)``.
        Writing the sample this way keeps it differentiable with respect to
        ``mu`` and ``logvar``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        Returns:
            Tuple ``(recon_x, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon_x = self.decode(z)
        return recon_x, mu, logvar

In [4]:
"""
Trainer for data:

    - Sets Hyperparameters
    - Loads data
    - trains data
    
    - Parameters:
        - device, model, batch size, learning rate, epochs, starting kl value, 
            ending kl value, epoch to start annealing kl, number of annealing steps

    - Constructor:
        - Load model to device
        - Set optimizer
        - Initialize hyperparameters
        - Setup tensorboard writer
        - Load data 

    - Loss function:
        - Calculates MSE loss and KL divergence using mean reduction
        - Returns these two values as tuple

    - Training loop:
        - Iterates over train_loader
        - Calculates KL annealing rate
        - Calls loss function 
        - Multiplies annealing rate by kl divergence
        - Adds annealed kl loss and MSE loss for total loss
        - Performs gradient norm clipping
        - Writes to tensorboard

    - Annealing KL loss:
        - Parameters:
            - start_kl: initial weight of KL loss
            - end_kl: end weight that KL loss builds up to
            - annealing_start: epoch to start adding KL loss 
            - annealing_steps: number of epochs for KL loss to grow over
        
        - Training:
            - Check for starting epoch 
            - If current epoch >= annealing_start, anneal kl
            - If current epoch < annealing_start, kl weight = 0
    
    - Usage:
        - Create instance of trainer: trainer = VAETrainer(device)
        - Call training loop function: trainer.train()
"""

import torch
import torch.nn.functional as F
import mmvae.trainers.utils as utils
import torch.nn as nn
import mmvae.models.Arch_Model as Arch_Model
from mmvae.data import configure_singlechunk_dataloaders
import torch.utils.tensorboard as tb

class VAETrainer:
    """Trains a VAE with an MSE reconstruction loss and an annealed KL term.

    Args:
        device: Torch device the model and each batch are moved to.
        model: Model to train. When ``None`` (the default) a fresh
            ``Arch_Model.VAE()`` is created for this trainer.
        batch_size: Batch size passed to the data loader configuration.
        learning_rate: Learning rate of the Adam optimizer.
        num_epochs: Number of passes over the training loader.
        start_kl: KL weight at the epoch where annealing starts.
        end_kl: KL weight that annealing builds up to.
        annealing_start: First epoch in which the KL term gets a weight.
        annealing_steps: Number of epochs over which the weight moves from
            ``start_kl`` to ``end_kl``.
        data_file_path: Path of the ``.npz`` file handed to
            ``configure_singlechunk_dataloaders``.
    """

    def __init__(
        self,
        device,
        model=None,
        batch_size=128,
        learning_rate=0.0001,
        num_epochs=10,
        start_kl=0.0,
        end_kl=1.0,
        annealing_start=0,
        annealing_steps=10,
        data_file_path='/active/debruinz_project/CellCensus_3M_Full/3m_human_full.npz',
    ):
        # Build the default model here rather than in the signature: a default
        # argument is evaluated once at definition time, which would construct
        # the network on import and share one instance between all trainers.
        if model is None:
            model = Arch_Model.VAE()
        # Configure
        self.model = model.to(device)
        self.device = device
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        # Hyperparameters
        self.lr = learning_rate
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.start_kl = start_kl  # Initial weight of KL loss
        self.end_kl = end_kl  # End weight of KL loss
        self.annealing_start = annealing_start  # Epoch at which KL annealing starts
        self.annealing_steps = annealing_steps  # Number of epochs to anneal KL over
        # Tensorboard writer
        self.writer = tb.SummaryWriter()
        # Load data. The function is imported directly at the top of the file;
        # the previous `loaders.` prefix referred to a name that is never defined.
        # presumably the loader yields (x, metadata) pairs, since train()
        # unpacks two items per batch - verify against mmvae.data.
        self.train_loader = configure_singlechunk_dataloaders(
            data_file_path=data_file_path,
            metadata_file_path=None,
            train_ratio=1,
            batch_size=self.batch_size,
            device=None
        )

    def loss_function(self, recon_x, x: torch.Tensor, mu, logvar):
        """Return ``(reconstruction_loss, kl_divergence)`` for one batch.

        The reconstruction loss is the element-wise mean squared error between
        ``recon_x`` and the densified ``x``. The KL divergence is summed over
        the last (latent) dimension and then averaged over the remaining
        dimensions, so it does not grow with the batch size.
        """
        reconstruction_loss = F.mse_loss(recon_x, x.to_dense(), reduction='mean')
        # Sum per sample first; a full-tensor sum followed by .mean() would be
        # a no-op mean on a scalar.
        kl_per_sample = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), dim=-1)
        kl_divergence = kl_per_sample.mean()
        return reconstruction_loss, kl_divergence

    def _kl_weight(self, epoch):
        """Return the KL weight for ``epoch`` under linear annealing.

        The weight is 0 before ``annealing_start``; afterwards it moves
        linearly from ``start_kl`` to ``end_kl`` over ``annealing_steps``
        epochs and then stays at ``end_kl``.
        """
        if epoch < self.annealing_start:
            return 0.0
        if self.annealing_steps <= 0:
            # No ramp requested: jump straight to the final weight.
            progress = 1.0
        else:
            # Clamp so the weight never overshoots end_kl on long runs.
            progress = min(1.0, (epoch - self.annealing_start) / self.annealing_steps)
        return self.start_kl + progress * (self.end_kl - self.start_kl)

    def train(self):
        """Run the training loop and log losses to tensorboard."""
        print("Start Training ....")
        # Running iteration counter; equals epoch * len(loader) + i whenever
        # the loader yields len(loader) batches, without requiring __len__.
        global_step = 0
        for epoch in range(self.num_epochs):
            # The KL weight depends only on the epoch, so compute it once.
            annealing = self._kl_weight(epoch)
            recon_loss = kl_loss = loss = None

            for x, _ in self.train_loader:
                x = x.to(self.device)
                self.optimizer.zero_grad()
                recon_batch, mu, logvar = self.model(x)

                # Combine MSE and weighted KL for the total loss
                # (the weight is 0 before annealing_start).
                recon_loss, kl_loss = self.loss_function(recon_batch, x, mu, logvar)
                loss = recon_loss + kl_loss * annealing

                loss.backward()
                # Gradient norm clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
                self.optimizer.step()

                # Tensorboard total loss over iterations
                self.writer.add_scalar('Loss/Iteration', loss.item(), global_step)
                global_step += 1

            # Per-epoch tensorboard values (losses are those of the last batch).
            self.writer.add_scalar('Annealing Schedule', annealing, epoch)
            if loss is not None:  # skip when the loader produced no batches
                self.writer.add_scalar('Loss/KL', kl_loss.item(), epoch)
                self.writer.add_scalar('Loss/MSE', recon_loss.item(), epoch)
                self.writer.add_scalar('Loss/Total', loss.item(), epoch)

        print("done training")
        self.writer.flush()

In [5]:
"""
Main file to call trainer:

    - Sets device to cuda if available, if not use CPU
    - Creates trainer and calls train function
"""

import torch
from mmvae.trainers.Arch_Trainer import VAETrainer

def main():
    """Select the compute device, report it, then build a trainer and train."""
    # Use the GPU when one is available, otherwise fall back to the CPU.
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(backend)
    print("Device:", device)
    VAETrainer(device).train()


if __name__ == "__main__":
    main()

Device: cpu


FileNotFoundError: [Errno 2] No such file or directory: '/active/debruinz_project/CellCensus_3M_Full/3m_human_full.npz'