In [None]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.distributions as dist
import torch
import copy
import matplotlib.pyplot as plt
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import Callback
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from scripts.utils import ScaleData, train_keys
from scripts.AutoEncoder import Encoder, Decoder, AutoEncoderDataset
from tqdm import tqdm

import itertools
import seaborn as sns
%matplotlib notebook

In [None]:
# Target device for the notebook; used further down when LinearVAE is instantiated.
# The CUDA code paths elsewhere in this notebook are commented out, so everything runs on CPU.
device = torch.device("cpu")

In [None]:
def ELBO(encoder, decoder, X):
    """Return the per-sample evidence lower bound (ELBO) of a VAE.

    The bound is estimated with a single reparameterised draw ``z ~ q(z|x)``:

        ELBO = log p(x|z) + log p(z) - log q(z|x)

    where the prior ``p(z)`` is a standard normal.

    :param encoder: object whose ``forward(X)`` returns a distribution that
        supports ``rsample()`` and ``log_prob()`` (the approximate posterior).
    :param decoder: object whose ``forward(z)`` returns a distribution that
        supports ``log_prob(X)`` (the likelihood).
    :param X: batch of inputs; the likelihood is summed over ``dim=1``, so X is
        assumed to be 2-D (batch, features) -- TODO confirm against the dataset.
    :return: tensor with one ELBO value per batch element (not reduced).
    """
    # Approximate posterior q(z|x).
    q_z_given_x = encoder.forward(X)

    # rsample() keeps the draw differentiable w.r.t. the encoder parameters.
    q_samples = q_z_given_x.rsample()

    # Build the standard-normal prior from the sample itself so that it always
    # matches the latent dimensionality, dtype and device (previously the latent
    # size was hard-coded to 2 and the tensors were always created on the CPU).
    latent_prior = dist.Normal(torch.zeros_like(q_samples), torch.ones_like(q_samples))

    # Each log-density is summed over the feature/latent axis -> one value per sample.
    log_p_z = latent_prior.log_prob(q_samples).sum(-1)
    log_q_z_given_x = q_z_given_x.log_prob(q_samples).sum(-1)
    log_p_x_given_z = decoder.forward(q_samples).log_prob(X).sum(dim=1)

    return log_p_x_given_z + log_p_z - log_q_z_given_x

In [None]:
# Build the encoder/decoder pair (imported from scripts.AutoEncoder) with the VAE flag enabled.
# NOTE(review): the meaning of the positional arguments (3, 15) is not visible in this notebook --
# presumably 15 is the number of input features (LinearVAE further down also uses 15); confirm
# against the Encoder/Decoder definitions.
encoder = Encoder(3, 15, VAE = True)
decoder = Decoder(3, 15, VAE = True) 

In [None]:
def _make_dataset_and_loader(path, n_rows = 100_000, batch_size = 2048):
    """Load a pickled dataset, subsample it, scale it and wrap it in a shuffled DataLoader.

    :param path: path handed to AutoEncoderDataset.
    :param n_rows: number of rows drawn at random (without a fixed seed) from the dataset.
    :param batch_size: batch size of the returned DataLoader.
    :return: tuple ``(dataset, loader)``.
    """
    # NOTE(review): train=True is used for every split, including validation, and each
    # split calls its own .scale() -- confirm this is what AutoEncoderDataset expects.
    dataset = AutoEncoderDataset(path, columns = train_keys, train = True)
    dataset.data = dataset.data.sample(n_rows)
    dataset.scale()
    return dataset, DataLoader(dataset, shuffle = True, batch_size = batch_size)


train_data_path = "/share/rcifdata/jbarr/UKAEAGroupProject/data/train_data_clipped.pkl"
train_data, train_loader = _make_dataset_and_loader(train_data_path)

valid_data_path = "/share/rcifdata/jbarr/UKAEAGroupProject/data/valid_data_clipped.pkl"
valid_data, valid_loader = _make_dataset_and_loader(valid_data_path)

In [None]:
# Jointly optimise encoder and decoder with Adam (default hyper-parameters).
vae_parameters = itertools.chain(encoder.parameters(), decoder.parameters())
opt_vae = torch.optim.Adam(vae_parameters)

N_epochs = 50
n_train_samples = len(train_data)

for epoch in range(N_epochs):
    train_loss = 0.0
    for X in train_loader:
        opt_vae.zero_grad()
        # Maximising the ELBO == minimising its negative batch mean.
        loss = -ELBO(encoder, decoder, X).mean()
        loss.backward()
        opt_vae.step()
        # Weight each batch mean by its batch size so the epoch figure is a
        # dataset-level average even when the last batch is smaller.
        train_loss += loss.item() * X.shape[0] / n_train_samples
    print("Epoch %d, train loss = %0.4f" % (epoch, train_loss))

In [None]:
# Pull a single batch from the validation loader (built with shuffle=True) and display its shape.
X_random_batch = next(iter(valid_loader))
X_random_batch.shape

In [None]:
# Encode the batch and draw one latent sample per row; gradients are not needed for inspection.
with torch.no_grad():    
    out = encoder(X_random_batch).sample()

In [None]:
# Scatter the first two latent coordinates of the sampled validation batch.
plt.figure()
plt.scatter(out[:,0], out[:,1])

In [None]:
# Load the test set, subsample it and split it by the binary `target` column.
test = "/share/rcifdata/jbarr/UKAEAGroupProject/data/test_data_clipped.pkl"

df_test = pd.read_pickle(test)
# Random subsample without a fixed seed, so the rows differ between runs.
df_test = df_test.sample(10_000)
# NOTE(review): `target` is not used anywhere else in the visible notebook.
target = df_test['target']
# Rows with target == 1, restricted to the model's input columns.
df_test_good = df_test[df_test.target == 1]
df_test_good = df_test_good[train_keys]

# ScaleData returns a pair; only the first element (the scaled frame) is kept.
# NOTE(review): the good and bad subsets are scaled by separate ScaleData calls --
# confirm whether they should instead share the scaling applied to the training data.
df_test_good,_ = ScaleData(df_test_good)

# Rows with target == 0, prepared the same way.
df_test_bad = df_test[df_test.target == 0]
df_test_bad = df_test_bad[train_keys]
df_test_bad,_ = ScaleData(df_test_bad)

# Summary statistics of the scaled target == 1 subset.
df_test_good.describe()

In [None]:
# Convert both scaled test subsets to float32 tensors.
data_good = torch.from_numpy(df_test_good.values).float()
data_bad = torch.from_numpy(df_test_bad.values).float()

# Draw one latent sample per row from the encoder's output distribution.
with torch.no_grad():
    outputs_good = encoder.forward(data_good).sample()
    outputs_bad = encoder.forward(data_bad).sample()

# Overlay the first two latent coordinates of both subsets; the labels and
# legend make the two point clouds distinguishable.
plt.figure()
plt.scatter(outputs_good[:,0], outputs_good[:,1], label = "target == 1")
plt.scatter(outputs_bad[:,0], outputs_bad[:,1], label = "target == 0")
plt.legend()

In [None]:
# Encode the target == 1 test rows, sample a latent point, decode it and sample a
# reconstruction from the decoder's output distribution.
AE_output = decoder.forward(encoder.forward(data_good).sample()).sample().detach().numpy()
df_ae_output = pd.DataFrame(AE_output, columns = train_keys)
df_ae_output['AE'] = 'Outputs'

# .assign returns a new frame, so the tag column is NOT added to df_test_good itself.
# Aliasing it (df_test_tmp = df_test_good) would put a string column into df_test_good
# and break torch.from_numpy(df_test_good.values) if the tensor cell is re-run.
df_test_tmp = df_test_good.assign(AE = 'Inputs')

In [None]:
# Stack reconstructions and inputs into one long frame, distinguished by the 'AE' column.
df_compare = pd.concat([df_ae_output, df_test_tmp], ignore_index=True)
# DataFrame.sample raises if asked for more rows than exist (replace=False), which
# happens when the target == 1 subset is small; cap the request at the frame size.
df_compare_sample = df_compare.sample(min(10_000, len(df_compare)))

In [None]:
# One figure per input feature: input vs. reconstruction histograms, restricted to
# the central 10%-90% quantile range so the tails do not dominate the binning.
for i in train_keys:
    fig, ax = plt.subplots()
    x_min, x_max = df_compare_sample[i].quantile([0.1, 0.9])
    sns.histplot(
        data = df_compare_sample,
        x = i,
        hue = "AE",
        binrange = (x_min, x_max),
        bins = 100,
        ax = ax,
    )
    ax.set_xlabel(i)

# VAE 2

In [None]:
class LinearVAE(nn.Module):
    """Small fully-connected variational auto-encoder.

    Layer sizes are fixed: 15 -> 10 -> 5 in the encoder, two linear heads of
    size 2 for the latent mean and log-variance, and 2 -> 5 -> 10 -> 15 in the
    decoder. The final decoder layer has no activation.
    """

    def __init__(self):
        super().__init__()

        # encoder trunk
        self.enc1 = nn.Linear(15, 10)
        self.enc2 = nn.Linear(10, 5)

        # latent heads; note that `sigma` is consumed as a log-variance in forward()
        self.mu = nn.Linear(5, 2)
        self.sigma = nn.Linear(5, 2)

        # decoder
        self.dec1 = nn.Linear(2, 5)
        self.dec2 = nn.Linear(5, 10)
        self.dec3 = nn.Linear(10, 15)

    def reparameterize(self, mu, log_var):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` (reparameterisation trick).

        :param mu: latent mean produced by the encoder
        :param log_var: latent log-variance produced by the encoder
        :return: a sample with the same shape as ``mu``
        """
        noise_scale = torch.exp(0.5 * log_var)  # log-variance -> standard deviation
        noise = torch.randn_like(noise_scale)
        return mu + noise * noise_scale

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        :param x: input batch; it is cast to float32 before the first layer
        :return: tuple ``(reconstruction, mu, log_var)``
        """
        # encoding
        hidden = x.float()
        for layer in (self.enc1, self.enc2):
            hidden = F.relu(layer(hidden))

        # parameters of the approximate posterior
        mu = self.mu(hidden)
        log_var = self.sigma(hidden)

        # a fresh latent sample is drawn on every call
        latent = self.reparameterize(mu, log_var)

        # decoding
        decoded = latent
        for layer in (self.dec1, self.dec2):
            decoded = F.relu(layer(decoded))
        reconstruction = self.dec3(decoded)

        return reconstruction, mu, log_var

In [None]:
# Hyper-parameters for the LinearVAE experiment.
# NOTE(review): batch_size is not used by the loaders visible in this notebook,
# which are built with batch_size=2048.
batch_size = 64
lr = 0.0001
epochs = 50

model = LinearVAE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# LinearVAE ends in a plain nn.Linear, so its reconstructions are unbounded.
# nn.BCELoss only accepts inputs in [0, 1] and raises otherwise, so the
# reconstruction term is a summed squared error, which is valid for
# real-valued features.
criterion = nn.MSELoss(reduction='sum')

In [None]:
def final_loss(bce_loss, mu, logvar):
    """
    Total VAE loss: reconstruction term plus KL divergence to a standard normal.

    KL-Divergence = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)

    :param bce_loss: reconstruction loss, already computed by the caller
    :param mu: the mean of the latent distribution
    :param logvar: log variance of the latent distribution
    :return: ``bce_loss`` plus the KL term, summed over every element of ``mu``/``logvar``
    """
    kl_divergence = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return bce_loss + kl_divergence

In [None]:
def fit(model, dataloader):
    """Train ``model`` for one epoch and return the average loss per sample.

    Relies on the module-level ``optimizer``, ``criterion`` and ``final_loss``.

    :param model: callable returning ``(reconstruction, mu, logvar)`` for a batch
    :param dataloader: DataLoader yielding input batches
    :return: summed batch losses divided by ``len(dataloader.dataset)``
    """
    model.train()
    running_loss = 0.0
    # len(dataloader) is the real number of batches of the loader that was passed in
    # (including a final partial batch), independent of any global dataset.
    for data in tqdm(dataloader, total=len(dataloader)):
        optimizer.zero_grad()
        reconstruction, mu, logvar = model(data)
        bce_loss = criterion(reconstruction.float(), data.float())
        loss = final_loss(bce_loss, mu, logvar)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    train_loss = running_loss/len(dataloader.dataset)
    return train_loss

In [None]:
def validate(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` and return the average loss per sample.

    Relies on the module-level ``criterion`` and ``final_loss``. No gradients are
    computed and no parameters are updated.

    :param model: callable returning ``(reconstruction, mu, logvar)`` for a batch
    :param dataloader: DataLoader yielding input batches
    :return: summed batch losses divided by ``len(dataloader.dataset)``
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        # len(dataloader) is the real number of batches of the loader that was passed
        # in (including a final partial batch), independent of any global dataset.
        for data in tqdm(dataloader, total=len(dataloader)):
            reconstruction, mu, logvar = model(data)
            bce_loss = criterion(reconstruction.float(), data.float())
            loss = final_loss(bce_loss, mu, logvar)
            running_loss += loss.item()

    val_loss = running_loss/len(dataloader.dataset)
    return val_loss

In [None]:
# Per-epoch loss histories for the LinearVAE run.
# NOTE(review): `train_loss` was a float in the earlier VAE training cell; here the name is
# rebound to a list.
train_loss = []
val_loss = []
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")
    # One optimisation pass over the training loader, then one evaluation pass over the validation loader.
    train_epoch_loss = fit(model, train_loader)
    val_epoch_loss = validate(model, valid_loader)
    train_loss.append(train_epoch_loss)
    val_loss.append(val_epoch_loss)
    print(f"Train Loss: {train_epoch_loss:.4f}")
    print(f"Val Loss: {val_epoch_loss:.4f}")

In [None]:
# Reconstruct the target == 1 test rows with the LinearVAE; only the reconstruction
# (first element of the returned tuple) is kept.
AE_output2,_,_ = model(data_good)
AE_output2 = AE_output2.detach().numpy()
df_ae_output2 = pd.DataFrame(AE_output2, columns = train_keys)
df_ae_output2['AE'] = 'Outputs'

# Stack reconstructions and inputs into one long frame, distinguished by the 'AE' column.
df_compare2 = pd.concat([df_ae_output2, df_test_tmp], ignore_index=True)
# DataFrame.sample raises if asked for more rows than exist (replace=False);
# cap the request at the frame size.
df_compare_sample2= df_compare2.sample(min(10_000, len(df_compare2)))

In [None]:
# One figure per input feature: input vs. LinearVAE reconstruction histograms, restricted
# to the central 10%-90% quantile range so the tails do not dominate the binning.
for i in train_keys:
    fig, ax = plt.subplots()
    x_min, x_max = df_compare_sample2[i].quantile([0.1, 0.9])
    sns.histplot(
        data = df_compare_sample2,
        x = i,
        hue = "AE",
        binrange = (x_min, x_max),
        bins = 100,
        ax = ax,
    )
    ax.set_xlabel(i)