## Preprocessing on UCI Wine Quality dataset

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torchvision.utils import make_grid
from scipy.stats import norm
from sklearn import preprocessing

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

The dataset used here is generated by the R script

```
real_data.R
```

Generate the dataset in R first and then import it here, so that the experiment is reproducible across both languages.



In [None]:
# Location of the wine CSV that is read into the notebook-level frame `df`.
WINE_CSV_PATH = "/content/drive/MyDrive/Thesis & Project/wine.csv"
df = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Keep only the first ten columns (positions 0 through 9).
leading_columns = list(range(0, 10))
df = df.iloc[:, leading_columns]

In [None]:
# Show the frame as this cell's output.
df

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40
...,...,...,...,...,...,...,...,...,...,...
4995,7.0,0.500,0.25,2.0,0.070,3.0,22.0,0.9963,3.25,0.63
4996,7.6,0.900,0.06,2.5,0.079,5.0,10.0,0.9967,3.39,0.56
4997,8.1,0.545,0.18,1.9,0.080,13.0,35.0,0.9972,3.30,0.59
4998,8.3,0.610,0.30,2.1,0.084,11.0,50.0,0.9972,3.40,0.61


In [None]:
# Prefer the first CUDA device when one is visible; otherwise run on the CPU.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda:0" if gpu_available else "cpu")
print("GPU Enabled:", gpu_available)

GPU Enabled: True


In [None]:
def setup_data_loaders(batch_size=100, use_cuda=False, data=None):
  """Min-max scale a data frame and wrap it in three DataLoaders.

  Args:
    batch_size: Mini-batch size for the train and test loaders.
    use_cuda: Unused; retained so existing calls keep working.
    data: Optional DataFrame to scale. When omitted, the notebook-level
      ``df`` is used, which matches the original behaviour.

  Returns:
    A tuple ``(data_loader, train_loader, test_loader)``:
      * ``data_loader`` yields every row in a single batch, in the original
        row order.
      * ``train_loader`` yields shuffled mini-batches of the first 4/5 of rows.
      * ``test_loader`` yields the remaining rows in order.
  """
  frame = df if data is None else data
  n_rows = len(frame)

  # NOTE: the scaler is fitted on all rows, including the held-out ones.
  scaled_data = preprocessing.MinMaxScaler().fit_transform(frame).astype(np.float32)

  # Integer arithmetic avoids a float round-trip when computing the 80% cut.
  split = n_rows * 4 // 5
  train_set = scaled_data[:split]
  test_set = scaled_data[split:]

  # The full-data loader must NOT shuffle: anything computed from it (such as
  # per-row latent estimates) has to stay aligned with the source rows.
  data_loader = DataLoader(dataset=scaled_data, batch_size=n_rows, shuffle=False)
  train_loader = DataLoader(dataset=train_set,
                            batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(dataset=test_set,
                           batch_size=batch_size, shuffle=False)
  return data_loader, train_loader, test_loader

## Defining VAE

In [None]:
class VAE(nn.Module):
    """Fully connected variational autoencoder for tabular data.

    The encoder maps an ``input_dim``-wide row to an 8-dimensional feature,
    from which two linear heads produce the mean and log-variance of a
    diagonal Gaussian over the latent space. The decoder maps a latent sample
    back to ``input_dim`` values squashed into (0, 1) by a sigmoid.

    Args:
        latent_dim: Dimensionality of the latent space.
        input_dim: Number of input features. When omitted, the column count
            of the notebook-level ``df`` is used (the original behaviour).
    """

    def __init__(self, latent_dim, input_dim=None):
        super().__init__()

        # Fall back to the global frame so `VAE(latent_dim)` keeps working.
        self.input_dim = df.shape[1] if input_dim is None else input_dim
        self.encoder = nn.Sequential(nn.Linear(self.input_dim, 256),
                                     nn.Softplus(),
                                     nn.Linear(256, 128),
                                     nn.BatchNorm1d(128),
                                     nn.Softplus(),
                                     nn.Linear(128, 64),
                                     nn.BatchNorm1d(64),
                                     nn.Softplus(),
                                     nn.Linear(64, 8),
                                     )

        # Heads for the parameters of the approximate posterior q(z | x).
        self.mu     = nn.Linear(8, latent_dim)
        self.logvar = nn.Linear(8, latent_dim)

        # Lifts a latent sample back to the 8-d width the decoder expects.
        self.latent_mapping = nn.Linear(latent_dim, 8)

        self.decoder = nn.Sequential(nn.Linear(8, 16),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(16),
                                     nn.Linear(16, 64),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(64),
                                     nn.Linear(64, 128),
                                     nn.Softplus(),
                                     nn.BatchNorm1d(128),
                                     nn.Linear(128, self.input_dim))

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for batch ``x``."""
        hidden = self.encoder(x)
        return self.mu(hidden), self.logvar(hidden)

    def sample_z(self, mu, logvar):
        """Draw ``z = mu + eps * sigma`` with ``eps ~ N(0, I)``.

        The noise must be standard normal (``randn_like``): uniform noise from
        ``rand_like`` would not match the Gaussian posterior that the KL term
        of the loss assumes.
        """
        eps = torch.randn_like(mu)
        return mu + eps * torch.exp(0.5 * logvar)

    def decode(self, z, x=None):
        """Reconstruct inputs from latent samples ``z``.

        ``x`` is not used; it is accepted only so that existing
        ``decode(z, x)`` calls continue to work.
        """
        latent_z = self.latent_mapping(z)
        out = self.decoder(latent_z)
        return torch.sigmoid(out).reshape((-1, self.input_dim))

    def forward(self, x):
        """Encode ``x``, sample a latent, and return only the reconstruction."""
        mu, logvar = self.encode(x)
        z = self.sample_z(mu, logvar)
        return self.decode(z, x)

In [None]:
def elbo_loss(x_generated, x_true, mu, logvar):
    """Negative-ELBO style loss: squared reconstruction error plus KL term.

    Args:
        x_generated: Reconstructed batch.
        x_true: Target batch, same shape as ``x_generated``.
        mu: Posterior means, shape (batch, latent_dim).
        logvar: Posterior log-variances, same shape as ``mu``.

    Returns:
        ``(loss, recon, kld)`` as scalar tensors, where ``recon`` is the mean
        element-wise squared error, ``kld`` is the batch mean of the
        per-sample KL divergence to N(0, I), and ``loss`` is the mean of
        their broadcast sum.
    """
    # Unreduced squared error; it is averaged below together with the KL term.
    squared_error = nn.functional.mse_loss(x_generated, x_true, reduction='none')

    # Closed-form KL(N(mu, sigma^2) || N(0, I)): sum over latent dims, then batch mean.
    kl_inner = torch.sum(1 + logvar - mu.pow(2) - logvar.exp(), 1)
    kld = -0.5 * kl_inner.mean()

    # The scalar KL broadcasts across every element of the error matrix.
    total = torch.mean(kld + squared_error)

    return total, torch.mean(squared_error), torch.mean(kld)

In [None]:
# Define the functions

In [None]:
def training_function(latent_dimension, train, test, max_epochs=10):
  """Train a VAE and print the per-epoch training and validation loss.

  Args:
    latent_dimension: Size of the latent space passed to ``VAE``.
    train: Iterable of training batches (e.g. a DataLoader of float tensors).
    test: Iterable of held-out batches, used only to report validation loss.
    max_epochs: Number of passes over ``train``. Defaults to 10.

  Returns:
    The trained ``VAE``, placed on ``device`` and left in eval mode.
  """
  vae_net = VAE(latent_dim=latent_dimension).to(device)
  opt = torch.optim.Adam(vae_net.parameters())

  for epoch in range(max_epochs):
    # ---- optimisation pass over the loader that was passed in ----
    vae_net.train()
    train_loss = 0.0
    n_train_batches = 0

    for inputs in train:
      inputs = inputs.to(device)

      opt.zero_grad()

      mu, logvar = vae_net.encode(inputs)
      z = vae_net.sample_z(mu, logvar)
      outputs = vae_net.decode(z, inputs)

      loss, _, _ = elbo_loss(outputs, inputs, mu, logvar)
      loss.backward()
      opt.step()

      train_loss += loss.item()
      n_train_batches += 1

    # ---- validation pass ----
    # eval() makes BatchNorm use its running statistics rather than updating
    # them from held-out data; no_grad() skips building the autograd graph.
    vae_net.eval()
    test_loss = 0.0
    n_test_batches = 0

    with torch.no_grad():
      for inputs in test:
        inputs = inputs.to(device)
        mu, logvar = vae_net.encode(inputs)
        z = vae_net.sample_z(mu, logvar)
        outputs = vae_net.decode(z, inputs)

        loss, _, _ = elbo_loss(outputs, inputs, mu, logvar)
        test_loss += loss.item()
        n_test_batches += 1

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_train_loss = train_loss / max(n_train_batches, 1)
    avg_test_loss = test_loss / max(n_test_batches, 1)
    print(f'Epoch {epoch+1} \t\t Training Loss: {avg_train_loss} \t\t Validation Loss: {avg_test_loss}')

  return vae_net


## Training function

Train the VAE and save the corresponding latent confounder estimates to `zvae.csv`, which can then be imported in R. Using the same random seed in R guarantees that the experiments run on the same dataset.

In [None]:
data_loader, train_loader, test_loader = setup_data_loaders()
vae_net = training_function(1, train_loader, test_loader)

# Inference only: freeze BatchNorm statistics and skip autograd bookkeeping.
vae_net.eval()
latent_batches = []
with torch.no_grad():
  for inputs in data_loader:
    inputs = inputs.to(device)
    mu, logvar = vae_net.encode(inputs)
    latent_batches.append(vae_net.sample_z(mu, logvar).cpu())

# Concatenate first so the CSV is written once, however many batches there are.
z_np = torch.cat(latent_batches).numpy()  # convert to a NumPy array
# Use a separate name so the notebook-level `df` is not overwritten.
latent_df = pd.DataFrame(z_np)
latent_df.to_csv("zvae.csv", index=False)  # save to file

Epoch 1 		 Training Loss: 0.12336255107074975 		 Validation Loss: 0.09397087320685386
Epoch 2 		 Training Loss: 0.06656816257163882 		 Validation Loss: 0.04324515350162983
Epoch 3 		 Training Loss: 0.021971254120580853 		 Validation Loss: 0.013279058784246445
Epoch 4 		 Training Loss: 0.008760988037101925 		 Validation Loss: 0.009573898464441299
Epoch 5 		 Training Loss: 0.007341346889734268 		 Validation Loss: 0.009184784861281515
Epoch 6 		 Training Loss: 0.007110699242912233 		 Validation Loss: 0.009114605886861683
Epoch 7 		 Training Loss: 0.007016649807337671 		 Validation Loss: 0.009064065106213093
Epoch 8 		 Training Loss: 0.006934608181472868 		 Validation Loss: 0.008979910798370839
Epoch 9 		 Training Loss: 0.006887508637737483 		 Validation Loss: 0.008904504124075174
Epoch 10 		 Training Loss: 0.006891176127828658 		 Validation Loss: 0.00892068394459784


In [None]:
# Build the full-data, training and test loaders again from the current `df`.
data_loader, train_loader, test_loader = setup_data_loaders()

In [None]:
# Show whatever frame `df` is currently bound to as this cell's output.
df

Unnamed: 0,0
0,0.900269
1,0.078015
2,0.276061
3,0.067998
4,0.706525
...,...
4995,0.675068
4996,0.683754
4997,0.091734
4998,0.859953
