In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data_file):
        self.data = pd.read_csv(data_file)
        # Normalize input features between 0 and 1
        self.X = self.data[['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 
                            'X_7', 'X_8', 'X_9', 'X_10', 'X_11', 'X_12']].values
        self.X = self.X / np.max(self.X, axis=0)  # Normalize per feature
        self.y = self.data['y'].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# VAE model
class VAE(nn.Module):
    def __init__(self, input_size, latent_size):
        super(VAE, self).__init__()
        self.input_size = input_size
        self.latent_size = latent_size

        # Encoder: maps input to latent space
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, latent_size * 2)  # Outputs mu and logvar
        )

        # Decoder: reconstructs input from latent space
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_size),
            nn.Sigmoid()  # Ensures output values are between 0 and 1
        )

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        encoded = self.encoder(x)
        mu, logvar = encoded[:, :self.latent_size], encoded[:, self.latent_size:]
        z = self.reparameterize(mu, logvar)
        decoded = self.decoder(z)
        return decoded, mu, logvar

# Load dataset
dataset = CustomDataset('assignment_dataset.csv')
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

# Model, optimizer, and loss function
vae = VAE(input_size=12, latent_size=8)  # Increased latent size
optimizer = optim.Adam(vae.parameters(), lr=1e-4)  # Reduced learning rate
criterion = nn.MSELoss()
beta = 0.1  # Balancing factor for KL divergence

# Training loop
epochs = 100
for epoch in range(epochs):
    epoch_loss = 0
    for x, _ in dataloader:
        x = x.float()
        optimizer.zero_grad()
        
        # Forward pass
        recon, mu, logvar = vae(x)
        
        # Compute the loss
        recon_loss = criterion(recon, x)
        kl_divergence = 0.5 * torch.sum(mu ** 2 + torch.exp(logvar) - logvar - 1)
        loss = criterion(recon, x) + beta * 0.5 * torch.sum(mu ** 2 + torch.exp(logvar) - logvar - 1)

        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    epoch_loss /= len(dataloader)
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}")

# Save model weights


Epoch [1/100], Loss: 0.2945
Epoch [2/100], Loss: 0.0992
Epoch [3/100], Loss: 0.0589
Epoch [4/100], Loss: 0.0518
Epoch [5/100], Loss: 0.0495
Epoch [6/100], Loss: 0.0484
Epoch [7/100], Loss: 0.0478
Epoch [8/100], Loss: 0.0476
Epoch [9/100], Loss: 0.0475
Epoch [10/100], Loss: 0.0472
Epoch [11/100], Loss: 0.0467
Epoch [12/100], Loss: 0.0468
Epoch [13/100], Loss: 0.0467
Epoch [14/100], Loss: 0.0465
Epoch [15/100], Loss: 0.0464
Epoch [16/100], Loss: 0.0462
Epoch [17/100], Loss: 0.0463
Epoch [18/100], Loss: 0.0466
Epoch [19/100], Loss: 0.0462
Epoch [20/100], Loss: 0.0463
Epoch [21/100], Loss: 0.0462
Epoch [22/100], Loss: 0.0462
Epoch [23/100], Loss: 0.0462
Epoch [24/100], Loss: 0.0460
Epoch [25/100], Loss: 0.0460
Epoch [26/100], Loss: 0.0459
Epoch [27/100], Loss: 0.0459
Epoch [28/100], Loss: 0.0461
Epoch [29/100], Loss: 0.0460
Epoch [30/100], Loss: 0.0459
Epoch [31/100], Loss: 0.0460
Epoch [32/100], Loss: 0.0457
Epoch [33/100], Loss: 0.0459
Epoch [34/100], Loss: 0.0458
Epoch [35/100], Loss: 0

In [18]:
torch.save(vae.state_dict(), 'vae_model.pth')
print("Model saved as 'vae_model.pth'")



Model saved as 'vae_model.pth'


In [19]:
# Generate synthetic data
vae.eval()  # Set the model to evaluation mode
latent_samples = torch.randn(1000, vae.latent_size)  # Sample from latent space
synthetic_data = vae.decoder(latent_samples).detach().numpy()




In [20]:
# Save synthetic data to CSV
synthetic_df = pd.DataFrame(synthetic_data, columns=[f'X_{i+1}' for i in range(12)])
synthetic_df.to_csv('synthetic_data.csv', index=False)
print("Synthetic data saved as 'synthetic_data.csv'")

# Load model and generate new data (optional)


Synthetic data saved as 'synthetic_data.csv'


In [22]:
loaded_vae = VAE(input_size=12, latent_size=8)
loaded_vae.load_state_dict(torch.load('vae_model.pth'))
loaded_vae.eval()


VAE(
  (encoder): Sequential(
    (0): Linear(in_features=12, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=16, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=12, bias=True)
    (5): Sigmoid()
  )
)

In [23]:

latent_samples = torch.randn(1000, loaded_vae.latent_size)
new_synthetic_data = loaded_vae.decoder(latent_samples).detach().numpy()
print("New synthetic data generated!")

New synthetic data generated!


In [24]:
original_data = pd.read_csv('assignment_dataset.csv')
synthetic_data = pd.read_csv('synthetic_data.csv')

print("Original Data Statistics:")
print(original_data.describe())




Original Data Statistics:
                X_0           X_1           X_2           X_3           X_4  \
count  10000.000000  10000.000000  10000.000000  1.000000e+04  10000.000000   
mean      -0.043590      0.105652      2.149697  2.672754e+00      2.733670   
std        2.749667      1.661659      1.714908  3.965479e+00      0.195528   
min      -13.097479     -7.456419      0.000534  3.794138e-09      0.000000   
25%       -1.833648     -0.960446      0.831932  2.738268e-01      2.628441   
50%       -0.112898      0.135436      1.766501  1.168059e+00      2.750429   
75%        1.697181      1.215033      3.061595  3.445960e+00      2.857354   
max       11.517463      7.916655     13.097479  4.584538e+01      3.376908   

                X_5           X_6           X_7           X_8           X_9  \
count  10000.000000  10000.000000  10000.000000  10000.000000  10000.000000   
mean       0.397514     -0.389984     -0.041177     -0.184348     -0.027133   
std        2.429113      

In [25]:
print("\nSynthetic Data Statistics:")
print(synthetic_data.describe())


Synthetic Data Statistics:
               X_1          X_2          X_3          X_4          X_5  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean      0.009720     0.164849     0.058832     0.806773     0.033667   
std       0.005879     0.007535     0.005662     0.007079     0.007488   
min       0.000578     0.136883     0.041293     0.777251     0.012911   
25%       0.005166     0.160347     0.055297     0.802205     0.028669   
50%       0.008488     0.164869     0.058928     0.806869     0.034119   
75%       0.013079     0.169411     0.062441     0.811110     0.039113   
max       0.032441     0.197905     0.085190     0.830168     0.051226   

                X_6           X_7           X_8           X_9          X_10  \
count  1.000000e+03  1.000000e+03  1.000000e+03  1.000000e+03  1.000000e+03   
mean   3.394686e-05  1.786560e-04  8.425759e-05  2.348946e-04  1.153173e-04   
std    8.046373e-05  3.225415e-04  1.656361e-04  4.158338e-04  2.164

DATA SYNTHENSIS ASSESSMENT




In [26]:
import pandas as pd
import numpy as np
from scipy.stats import ks_2samp

# Load original dataset
original_data = pd.read_csv('assignment_dataset.csv')

# Load synthetic dataset
synthetic_data = pd.read_csv('synthetic_data.csv')

# Select only the X columns for comparison
original_X = original_data[[f'X_{i+1}' for i in range(12)]]
synthetic_X = synthetic_data[[f'X_{i+1}' for i in range(12)]]

# Function to compute KS-test for each feature
def assess_fidelity_ks(original, synthetic):
    results = {}
    for column in original.columns:
        ks_stat, p_value = ks_2samp(original[column], synthetic[column])
        results[column] = {'KS Statistic': ks_stat, 'P-Value': p_value}
    return results

# Run KS-test on all features
fidelity_results = assess_fidelity_ks(original_X, synthetic_X)

# Print the results
print("Fidelity Assessment using Kolmogorov-Smirnov Test:")
for feature, result in fidelity_results.items():
    print(f"{feature}: KS Statistic = {result['KS Statistic']:.4f}, P-Value = {result['P-Value']:.4f}")


Fidelity Assessment using Kolmogorov-Smirnov Test:
X_1: KS Statistic = 0.5272, P-Value = 0.0000
X_2: KS Statistic = 0.9383, P-Value = 0.0000
X_3: KS Statistic = 0.8616, P-Value = 0.0000
X_4: KS Statistic = 0.9999, P-Value = 0.0000
X_5: KS Statistic = 0.5757, P-Value = 0.0000
X_6: KS Statistic = 0.5943, P-Value = 0.0000
X_7: KS Statistic = 0.5139, P-Value = 0.0000
X_8: KS Statistic = 0.5337, P-Value = 0.0000
X_9: KS Statistic = 0.5093, P-Value = 0.0000
X_10: KS Statistic = 0.5123, P-Value = 0.0000
X_11: KS Statistic = 0.5337, P-Value = 0.0000
X_12: KS Statistic = 0.5082, P-Value = 0.0000
