In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Define the VAE architecture
class VAE(nn.Module):
    """Fully connected variational autoencoder.

    The encoder maps an input vector to ``2 * latent_dim`` values: the first
    ``latent_dim`` entries are the mean and the remaining ``latent_dim`` entries
    are the log-variance of the approximate posterior. The decoder maps a latent
    vector back to ``input_dim`` values.

    Args:
        input_dim: Number of features in each input vector.
        hidden_dim: Width of the single hidden layer in encoder and decoder.
        latent_dim: Dimensionality of the latent space.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        # Kept on the instance so forward() never has to rely on a global
        # variable that happens to share the name `latent_dim`.
        self.latent_dim = latent_dim
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim * 2)  # latent_dim for mean and latent_dim for log-variance
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def reparameterize(self, mu, logvar):
        """Return ``mu + eps * std`` with ``eps`` drawn from a standard normal.

        ``std`` is ``exp(0.5 * logvar)``; ``eps`` has the same shape as ``std``.
        """
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """Encode ``x``, sample a latent vector and decode it.

        Returns:
            A tuple ``(decoded, mu, logvar)``.
        """
        encoded = self.encoder(x)
        # Split along the last axis using the size this model was built with;
        # the ellipsis keeps the split valid for unbatched inputs as well.
        mu = encoded[..., :self.latent_dim]
        logvar = encoded[..., self.latent_dim:]
        z = self.reparameterize(mu, logvar)
        decoded = self.decoder(z)
        return decoded, mu, logvar

In [3]:
def loss_function(recon_x, x, mu, logvar):
    """Return the summed VAE loss: reconstruction error plus KL term.

    The reconstruction error is the squared error between ``recon_x`` and ``x``
    summed over all elements. The KL term is
    ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    reconstruction_term = nn.functional.mse_loss(recon_x, x, reduction='sum')
    kl_elementwise = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_elementwise)
    return reconstruction_term + kl_term

In [13]:
# Load and preprocess the dataset
raw_data = pd.read_csv('train_LZdllcl.csv')  # Replace with your dataset
# StandardScaler only accepts numeric input, so keep numeric/bool columns and
# then drop rows that still contain missing values. The cleaned frame keeps the
# name `data` because its columns are reused to label the synthetic samples.
data = raw_data.select_dtypes(include=['number', 'bool']).dropna()
if data.empty:
    raise ValueError('No numeric rows left after preprocessing; check the input file.')
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data.to_numpy(dtype=np.float64))
data_tensor = torch.tensor(data_scaled, dtype=torch.float32)

# Hyperparameters
seed = 42
input_dim = data_tensor.shape[1]
hidden_dim = 128
latent_dim = 10
batch_size = 10
learning_rate = 0.001
num_epochs = 100

# Seed torch's global RNG before anything random happens so that weight
# initialisation, batch shuffling and the sampling noise repeat across runs.
torch.manual_seed(seed)

# DataLoader
dataset = TensorDataset(data_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model and optimizer
model = VAE(input_dim, hidden_dim, latent_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
model.train()
num_samples = len(dataset)
for epoch in range(num_epochs):
    total_loss = 0.0
    for (x,) in dataloader:  # TensorDataset yields 1-tuples
        optimizer.zero_grad()
        recon_x, mu, logvar = model(x)
        loss = loss_function(recon_x, x, mu, logvar)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The loss is summed over samples, so dividing by the dataset size gives
    # the average loss per sample for the epoch.
    print(f'Epoch {epoch + 1}, Loss: {total_loss / num_samples}')



Epoch 1, Loss: 6.534273335141717
Epoch 2, Loss: 6.137148825430416
Epoch 3, Loss: 6.053219244367424
Epoch 4, Loss: 5.99924826590022
Epoch 5, Loss: 5.927865277882599
Epoch 6, Loss: 5.9212735480681165
Epoch 7, Loss: 5.8812586215590255
Epoch 8, Loss: 5.844510498104076
Epoch 9, Loss: 5.846303721853839
Epoch 10, Loss: 5.820506731004273
Epoch 11, Loss: 5.824508643359188
Epoch 12, Loss: 5.81340685139872
Epoch 13, Loss: 5.805784146210528
Epoch 14, Loss: 5.792514364191224
Epoch 15, Loss: 5.78231909148799
Epoch 16, Loss: 5.774022313020815
Epoch 17, Loss: 5.782075215417108
Epoch 18, Loss: 5.78666132212005
Epoch 19, Loss: 5.772145360715292
Epoch 20, Loss: 5.772608193966596
Epoch 21, Loss: 5.758814787396164
Epoch 22, Loss: 5.747418574212091
Epoch 23, Loss: 5.74332964466418
Epoch 24, Loss: 5.736948608872302
Epoch 25, Loss: 5.744226105935138
Epoch 26, Loss: 5.73829894471908
Epoch 27, Loss: 5.74284348229321
Epoch 28, Loss: 5.736325515736232
Epoch 29, Loss: 5.734014938308731
Epoch 30, Loss: 5.7284537805

In [11]:
# Generate synthetic data by decoding latent vectors drawn from a standard normal
num_synthetic_samples = 1000  # number of synthetic rows to generate
model.eval()
with torch.no_grad():
    z = torch.randn((num_synthetic_samples, latent_dim))
    synthetic_data = model.decoder(z).numpy()

# Inverse transform to original scale
synthetic_data_original_scale = scaler.inverse_transform(synthetic_data)

# Convert to DataFrame
# NOTE: the decoder emits continuous values for every column, so count-like or
# binary features (e.g. `is_promoted`) come back as real numbers and are not
# rounded or clipped here.
synthetic_data_df = pd.DataFrame(synthetic_data_original_scale, columns=data.columns)

print(synthetic_data_df.head())


    employee_id  no_of_trainings        age  previous_year_rating  \
0  20139.050781         1.045229  32.220737              3.675061   
1  33409.000000         1.772014  31.349373              0.281598   
2  51732.562500         0.957346  39.412472              3.087601   
3  22660.062500         1.519494  31.935362              2.466403   
4  44258.691406         1.138292  30.833788              3.772217   

   length_of_service  KPIs_met >80%  awards_won?  avg_training_score  \
0           4.410475       0.856520    -0.003540           57.141453   
1           4.996706       0.035717     0.006291           51.273022   
2           6.239806       0.034638    -0.016406           88.959335   
3           4.712906      -0.021354     0.010424           70.102997   
4           4.431880       0.362530    -0.011383           88.181564   

   is_promoted  
0    -0.007916  
1    -0.017817  
2     0.006293  
3     0.017701  
4     0.022015  
