## Latent Confounder Finder

This notebook trains a variational autoencoder (VAE) and exports its latent samples as the estimated latent confounders.

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
from scipy.stats import norm
from sklearn import preprocessing

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

The large and small datasets used here are generated by the R script

```
Export_dataset.R
```

Generate the datasets in R first and then import them here, so that the R and Python parts of the experiment work on exactly the same data.



In [None]:
# Load the large dataset from the CSV file on Google Drive into a DataFrame.
ldata = pd.read_csv("/content/drive/MyDrive/Thesis & Project/largedata.csv")

In [None]:
# Replace the DataFrame with its plain NumPy array representation.
ldata = ldata.to_numpy()

In [None]:
# Load the small dataset from the CSV file on Google Drive into a DataFrame.
sdata = pd.read_csv("/content/drive/MyDrive/Thesis & Project/smalldata.csv")

In [None]:
# Replace the DataFrame with its plain NumPy array representation.
sdata = sdata.to_numpy()

In [None]:
# Run on the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("GPU Enabled:", torch.cuda.is_available())

GPU Enabled: True


In [None]:
def setup_data_loaders(amount, batch_size=200, use_cuda=False):
    """Build the full, training and test data loaders for one dataset.

    The CSV file is read from Google Drive, every column is min-max scaled
    (the scaler is fitted on the whole file), and the rows are split in file
    order: the first four fifths become the training set, the rest the test
    set.

    Args:
        amount: ``"L"`` selects ``largedata.csv``; any other value selects
            ``smalldata.csv``.
        batch_size: Mini-batch size of the training and test loaders.
        use_cuda: Unused; kept so that existing callers keep working.

    Returns:
        A ``(data_loader, train_loader, test_loader)`` tuple.
        ``data_loader`` yields the whole scaled dataset as a single batch in
        the original row order, ``train_loader`` shuffles its rows on every
        pass and ``test_loader`` keeps file order.
    """
    if amount == "L":
        frame = pd.read_csv("/content/drive/MyDrive/Thesis & Project/largedata.csv")
    else:
        frame = pd.read_csv("/content/drive/MyDrive/Thesis & Project/smalldata.csv")

    scaled_data = np.asarray(
        preprocessing.MinMaxScaler().fit_transform(frame)
    ).astype(np.float32)

    n_rows = len(scaled_data)
    split = n_rows * 4 // 5  # first 80% of the rows go to the training set
    train_set = scaled_data[:split]
    test_set = scaled_data[split:]

    # The full-data loader must NOT shuffle: its single batch is encoded and
    # exported row by row, so row i of the export has to correspond to row i
    # of the source CSV.
    data_loader = DataLoader(dataset=scaled_data,
                             batch_size=max(n_rows, 1), shuffle=False)
    train_loader = DataLoader(dataset=train_set,
                              batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=batch_size, shuffle=False)
    return data_loader, train_loader, test_loader

## Defining VAE

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder for tabular inputs.

    The encoder maps ``input_dim`` features through 256, 128 and 64 hidden
    units to an 8-unit code, from which the mean and log-variance of the
    ``latent_dim``-dimensional Gaussian posterior are predicted. The decoder
    maps a latent sample back through 8, 16, 64 and 128 units to
    ``input_dim`` outputs squashed into (0, 1) by a sigmoid.

    Args:
        latent_dim: Dimensionality of the latent variable ``z``.
        input_dim: Number of input (and reconstructed) features.
    """

    def __init__(self, latent_dim, input_dim=20):
        super().__init__()
        self.input_dim = input_dim

        self.encoder = nn.Sequential(nn.Linear(input_dim, 256),
                                     nn.Softplus(),
                                     nn.Linear(256, 128),
                                     nn.BatchNorm1d(128),
                                     nn.Softplus(),
                                     nn.Linear(128, 64),
                                     nn.BatchNorm1d(64),
                                     nn.Softplus(),
                                     nn.Linear(64, 8),
                                     )

        # Two heads on top of the shared 8-unit code.
        self.mu = nn.Linear(8, latent_dim)
        self.logvar = nn.Linear(8, latent_dim)

        # Lifts the latent sample back to the 8-unit width the decoder expects.
        self.latent_mapping = nn.Linear(latent_dim, 8)

        self.decoder = nn.Sequential(nn.Linear(8, 16),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(16),
                                     nn.Linear(16, 64),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(64),
                                     nn.Linear(64, 128),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(128),
                                     nn.Linear(128, input_dim))

    def encode(self, x):
        """Return the posterior mean and log-variance for the batch ``x``."""
        hidden = self.encoder(x)
        return self.mu(hidden), self.logvar(hidden)

    def sample_z(self, mu, logvar):
        """Draw ``z = mu + eps * sigma`` with ``eps`` standard normal.

        The noise must come from N(0, 1) (``randn_like``); uniform noise from
        ``rand_like`` would bias every sample upwards and would not match the
        Gaussian posterior assumed by the KL term of the loss.
        """
        eps = torch.randn_like(mu)
        return mu + eps * torch.exp(0.5 * logvar)

    def decode(self, z, x):
        """Reconstruct inputs from the latent sample ``z``.

        ``x`` is accepted for interface compatibility and is not used.
        """
        latent_z = self.latent_mapping(z)
        out = self.decoder(latent_z)
        return torch.sigmoid(out).reshape((-1, self.input_dim))

    def forward(self, x):
        """Encode ``x``, sample a latent vector and return the reconstruction."""
        mu, logvar = self.encode(x)
        z = self.sample_z(mu, logvar)
        return self.decode(z, x)

In [None]:
def elbo_loss(x_generated, x_true, mu, logvar):
    """Compute the VAE training loss and its two components.

    The reconstruction term is the element-wise squared error between
    ``x_generated`` and ``x_true``. The KL term is summed over dimension 1
    of ``mu`` / ``logvar`` and averaged over dimension 0. Note the different
    scaling: reconstruction ends up averaged over every element, whereas the
    KL term is summed over the latent dimensions.

    Returns:
        A ``(loss, mean_reconstruction, mean_kl)`` tuple of scalar tensors.
    """
    squared_error = nn.functional.mse_loss(x_generated, x_true, reduction='none')

    kl_per_row = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), 1)
    kl_divergence = kl_per_row.mean()

    # The scalar KL term is broadcast onto every squared error before averaging.
    total = torch.mean(kl_divergence + squared_error)

    return total, torch.mean(squared_error), torch.mean(kl_divergence)

In [None]:
# Define the training and validation loop for the VAE

In [None]:
def training_function(latent_dimension, train, test, max_epochs=30):
    """Train a fresh VAE and report the per-epoch training and validation loss.

    Args:
        latent_dimension: Size of the VAE latent space.
        train: Loader yielding training batches; it is iterated once per epoch.
        test: Loader yielding held-out batches, used for reporting only.
        max_epochs: Number of passes over ``train``.

    Returns:
        The trained ``VAE``, moved to the notebook-level ``device`` and left
        in evaluation mode.
    """
    vae_net = VAE(latent_dim=latent_dimension).to(device)
    opt = torch.optim.Adam(vae_net.parameters())

    for epoch in range(max_epochs):
        # Training pass: BatchNorm uses batch statistics and updates its
        # running estimates.
        vae_net.train()
        train_loss = 0.0
        for inputs in train:
            inputs = inputs.to(device)

            opt.zero_grad()
            mu, logvar = vae_net.encode(inputs)
            z = vae_net.sample_z(mu, logvar)
            outputs = vae_net.decode(z, inputs)

            loss, _, _ = elbo_loss(outputs, inputs, mu, logvar)
            loss.backward()
            opt.step()

            train_loss += loss.item()

        # Validation pass: evaluation mode keeps held-out data from leaking
        # into the BatchNorm running statistics, and no_grad skips building
        # the autograd graph.
        vae_net.eval()
        test_loss = 0.0
        with torch.no_grad():
            for inputs in test:
                inputs = inputs.to(device)
                mu, logvar = vae_net.encode(inputs)
                z = vae_net.sample_z(mu, logvar)
                outputs = vae_net.decode(z, inputs)

                loss, _, _ = elbo_loss(outputs, inputs, mu, logvar)
                test_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError when a loader has no batches.
        avg_train_loss = train_loss / max(len(train), 1)
        avg_test_loss = test_loss / max(len(test), 1)
        print(f'Epoch {epoch+1} \t\t Training Loss: {avg_train_loss} \t\t Validation Loss: {avg_test_loss}')

    return vae_net


## Training and saving the latent variables

In [None]:
def save_z(amount, num_hidden):
    """Train a VAE on one dataset and export its latent samples as a CSV file.

    Args:
        amount: Dataset selector forwarded to ``setup_data_loaders``.
        num_hidden: Latent dimension of the VAE to train.

    The samples are written to
    ``/content/vae/<amount><num_hidden>file.csv`` with one column per latent
    dimension.
    """
    import os

    data_loader, train_loader, test_loader = setup_data_loaders(amount)
    vae_net = training_function(num_hidden, train_loader, test_loader)

    out_dir = "/content/vae/"
    os.makedirs(out_dir, exist_ok=True)  # do not rely on a separate mkdir cell

    # Export in evaluation mode and without autograd: BatchNorm then uses its
    # learned running statistics and is not modified by the export pass.
    vae_net.eval()
    with torch.no_grad():
        for data in data_loader:
            inputs = data.to(device)
            mu, logvar = vae_net.encode(inputs)
            z = vae_net.sample_z(mu, logvar)
            df = pd.DataFrame(z.cpu().numpy())
            df.to_csv(out_dir + str(amount) + str(num_hidden) + "file.csv", index=False)

In [None]:
!mkdir vae

Train one VAE per latent dimension (1 to 20) for each dataset and save the corresponding latent confounder estimates. The resulting CSV files can then be imported in R; using the same random seed in R guarantees that the experiments run on the same dataset.

In [None]:
# Train and export one VAE per latent dimension (1..20), first for the large
# dataset and then for the small one. A named loop variable is used because
# `_` is IPython's "last result" variable and should not be overwritten.
for amount in ("L", "S"):
    for num_hidden in range(1, 21):
        save_z(amount, num_hidden)

In [None]:
# Recursively archive everything under /content/vae/ into latentvars.zip.
!zip -r latentvars.zip /content/vae/

## Model summary

In [None]:
from torchvision import models
from torchsummary import summary

In [None]:
# Summarise the network for inputs with 20 features.
# NOTE(review): `vae_net` is only assigned inside functions in the visible cells, so it must
# already exist at notebook scope for this call to succeed — confirm which model is meant.
summary(vae_net, input_size=(20, ))