In [1]:
# Single VAE Autoencoder.ipynb
# encoder_layers = [[8], [10], [12], [14], [16], [18] ,[20]]
# decoder_layers = [[8, 10], [10, 12], [12, 14], [14, 16], [16, 18], [18, 20], [20,22]]
# latent_layer = [8, 10, 12, 14, 16, 18]
# EPOCHS = 150

In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

plt.style.use('ggplot')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
src_df = pd.read_csv("Original.csv") # ../SB-001/SB-001
x_train = src_df.values
train_ds = torch.tensor(x_train).to(torch.float32)
print(train_ds.shape)
input_vector = src_df.shape[1]

torch.Size([57, 24])


In [4]:
input_vector

24

In [5]:
def plot_correlation_map(original_df, synth_df, name):

    corr_matrix1 = original_df.corr()
    corr_matrix2 = synth_df.corr()

    fig, axes = plt.subplots(1, 2, figsize=(18, 10))
    sns.heatmap(corr_matrix1, annot=True, cmap='coolwarm', ax=axes[0])
    axes[0].set_title('Original')

    sns.heatmap(corr_matrix2, annot=True, cmap='coolwarm', ax=axes[1])
    axes[1].set_title('Synthetic')

    plt.tight_layout()
    plt.savefig(f"{name}_corelation.png")
    plt.clf()
    plt.close(fig)

def plot_density_curves(df1, df2, name):
    num_columns = len(df1.columns)
    num_rows = int(np.ceil(num_columns / 5))
    fig, axes = plt.subplots(num_rows, 5, figsize=(20, 4 * num_rows))

    for i, column in enumerate(df1.columns):
        row = i // 5
        col = i % 5
        ax = axes[row, col] if num_rows > 1 else axes[col]

        sns.kdeplot(data=df1[column], color='blue', label='Original', ax=ax)
        sns.kdeplot(data=df2[column], color='green', label='Synthetic', ax=ax)

        #ax.set_title(column)
        ax.set_xlabel('x{}'.format(i+1))
        ax.set_xlim(0, 1)

        ax.legend()

    plt.tight_layout()
    plt.savefig(f'{name}_density_curves.png')
    plt.clf()
    plt.close(fig)


def mean_and_stddiv(original_df, synth_df, name):
    common_columns = set(original_df.columns) & set(synth_df.columns)
    results = {}

    for column in common_columns:
        mean_df1 = original_df[column].mean()
        std_df1 = original_df[column].std()
        mean_df2 = synth_df[column].mean()
        std_df2 = synth_df[column].std()

        results[column] = {'Mean_df1': mean_df1, 'Std_df1': std_df1,
                           'Mean_df2': mean_df2, 'Std_df2': std_df2}

    comparison_df = pd.DataFrame(results)
    comparison_df.to_csv(name + 'mean_std.csv', index=True)       
  

In [6]:
class autoencoder(nn.Module):
    def __init__(self, input_vector, encoder_layers, latent_layer, decoder_layers):
        super().__init__()
        
        """
        # Hyper params
        self.learning_rate = 0.01
        self.optimizer = optim.Adam(params=self.parameters(), lr=self.learning_rate)'
        """
        # Construct the layers of the encoder
        self.encoder = nn.ModuleList()
        self.encoder.append(nn.Linear(input_vector, encoder_layers[0]))
        for i in range(len(encoder_layers) - 1):
            self.encoder.append(nn.Linear(encoder_layers[i], encoder_layers[i+1]))
            self.encoder.append(nn.ReLU())
        #self.encoder.append(nn.Linear(encoder_layers[-1], latent_layer))
        #self.encoder.append(nn.BatchNorm(latent_layer)) # Will play with BatchNorm after getting this to work
        
        # Layers to make the latent vector
        self.encode_mean = nn.Linear(encoder_layers[-1], latent_layer)
        self.encode_variance = nn.Linear(encoder_layers[-1], latent_layer)
        
        # Construct the layers of the decoder
        self.decoder = nn.ModuleList()
        self.decoder.append(nn.Linear(latent_layer, decoder_layers[0]))
        for i in range(len(decoder_layers) - 1):
            self.decoder.append(nn.Linear(decoder_layers[i], decoder_layers[i+1]))
            self.decoder.append(nn.ReLU())
        self.decoder.append(nn.Linear(decoder_layers[-1], input_vector))

            # Method to run the encoder layers
    def encode(self, input_data):
        for layer in self.encoder:
            input_data = layer(input_data)
        mean = self.encode_mean(input_data)
        variance = self.encode_variance(input_data)
        return mean, variance
    
    # Resamples the distribution
    def reparameterize(self, mean, variance):
        #std = torch.exp(0.5 * variance)
        resampled_vector = mean + variance * torch.randn_like(variance)
        return resampled_vector
    
    # Method to run the decoder layers
    def decode(self, input_data):
        for layer in self.decoder:
            input_data = layer(input_data)
        return input_data
    
    # Run the autoencoder
    def forward(self, input_data):
        mean, variance = self.encode(input_data)
        resampled_vector = self.reparameterize(mean, variance)
        output = self.decode(resampled_vector)
        return output, mean, variance

def calc_loss(output_data, target, mean, variance):
    reconstruction_loss = nn.MSELoss(reduction='sum')(output_data, target)
    kl_div = -0.5 * torch.mean(torch.sum(1 + variance - mean ** 2 - torch.exp(variance), dim = 1), dim = 0)
    return reconstruction_loss + kl_div

In [7]:
encoder_layers = [[8], [10], [12], [14], [16], [18] ,[20]]
decoder_layers = [[8, 10], [10, 12], [12, 14], [14, 16], [16, 18], [18, 20], [20,22]]
latent_layer = [8, 10, 12, 14, 16, 18]
EPOCHS = 150

In [8]:
for latent in latent_layer:
    for enc_layers in encoder_layers:
        for dec_layers in decoder_layers:
            print(f"L{input_vector}_E{enc_layers}_B{latent}_D{dec_layers}")

L24_E[8]_B8_D[8, 10]
L24_E[8]_B8_D[10, 12]
L24_E[8]_B8_D[12, 14]
L24_E[8]_B8_D[14, 16]
L24_E[8]_B8_D[16, 18]
L24_E[8]_B8_D[18, 20]
L24_E[8]_B8_D[20, 22]
L24_E[10]_B8_D[8, 10]
L24_E[10]_B8_D[10, 12]
L24_E[10]_B8_D[12, 14]
L24_E[10]_B8_D[14, 16]
L24_E[10]_B8_D[16, 18]
L24_E[10]_B8_D[18, 20]
L24_E[10]_B8_D[20, 22]
L24_E[12]_B8_D[8, 10]
L24_E[12]_B8_D[10, 12]
L24_E[12]_B8_D[12, 14]
L24_E[12]_B8_D[14, 16]
L24_E[12]_B8_D[16, 18]
L24_E[12]_B8_D[18, 20]
L24_E[12]_B8_D[20, 22]
L24_E[14]_B8_D[8, 10]
L24_E[14]_B8_D[10, 12]
L24_E[14]_B8_D[12, 14]
L24_E[14]_B8_D[14, 16]
L24_E[14]_B8_D[16, 18]
L24_E[14]_B8_D[18, 20]
L24_E[14]_B8_D[20, 22]
L24_E[16]_B8_D[8, 10]
L24_E[16]_B8_D[10, 12]
L24_E[16]_B8_D[12, 14]
L24_E[16]_B8_D[14, 16]
L24_E[16]_B8_D[16, 18]
L24_E[16]_B8_D[18, 20]
L24_E[16]_B8_D[20, 22]
L24_E[18]_B8_D[8, 10]
L24_E[18]_B8_D[10, 12]
L24_E[18]_B8_D[12, 14]
L24_E[18]_B8_D[14, 16]
L24_E[18]_B8_D[16, 18]
L24_E[18]_B8_D[18, 20]
L24_E[18]_B8_D[20, 22]
L24_E[20]_B8_D[8, 10]
L24_E[20]_B8_D[10, 12]
L2

In [9]:
total_iterations = len(latent_layer) * len(encoder_layers) * len(decoder_layers)
print("Total Model in Pipeline:",total_iterations)

#print total models
import time
time.sleep(3)

for latent in latent_layer:
    for enc_layers in encoder_layers:
        for dec_layers in decoder_layers:
            print(f"TRAINING: L{input_vector}_E{enc_layers}_B{latent}_D{dec_layers}")
            model = autoencoder(input_vector, enc_layers, latent, dec_layers)

            optimizer = optim.Adam(model.parameters(), lr=0.001)
            train_dl = DataLoader(train_ds)

            history = []
            for epoch in range(EPOCHS):
                total_loss = 0
                for sample in train_dl:
                    # forward pass
                    pred, mean, variance = model(sample)
                    loss = calc_loss(pred, sample, mean, variance)
                    # backward pass
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    
                    total_loss += loss.item()
                history.append([epoch, total_loss])    
                #print('Epoch:', epoch, '\t Loss:', total_loss)
            print("TRAINING COMPLETED: LOSS", history[-1][1])
            history = pd.DataFrame(history, columns=['epoch', 'loss'])
            name = f"L{input_vector}_E{enc_layers}_B{latent}_D{dec_layers}" 

            
            plt.plot(history['epoch'], history['loss'])
            plt.savefig(f"{name}_loss.png")
            plt.clf()

            original_df = pd.DataFrame(train_ds.numpy(), columns=src_df.columns)
            synth_data = model(train_ds)[0].detach().numpy()
            synth_df = pd.DataFrame(synth_data, columns=src_df.columns)
            
            history.to_csv(f"{name}_history.csv")
            synth_df.to_csv(f"{name}_synthetic.csv")

            print("DRAWING FIGURES")
            plot_correlation_map(original_df, synth_df, name)
            plot_density_curves(original_df, synth_df, name)
            mean_and_stddiv(original_df, synth_df, name)
            print("FIGURES DRAWN")
            plt.close()       

Total Model in Pipeline: 294
TRAINING: L24_E[8]_B8_D[8, 10]
TRAINING COMPLETED: LOSS 12.617941856384277
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[10, 12]
TRAINING COMPLETED: LOSS 14.066182397305965
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[12, 14]
TRAINING COMPLETED: LOSS 13.273232325911522
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[14, 16]
TRAINING COMPLETED: LOSS 12.189386751502752
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[16, 18]
TRAINING COMPLETED: LOSS 12.541226111352444
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[18, 20]
TRAINING COMPLETED: LOSS 9.488120086491108
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[8]_B8_D[20, 22]
TRAINING COMPLETED: LOSS 8.668761268258095
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[10]_B8_D[8, 10]
TRAINING COMPLETED: LOSS 13.234442185610533
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[10]_B8_D[10, 12]
TRAINING COMPLETED: LOSS 11.07283552736044
DRAWING FIGURES
FIGURES DRAWN
TRAINING: L24_E[10]_