In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Load data
df = pd.read_csv("final_encoded_gbv.csv")

# Quick class check (optional)
print(df['vulnerability_target'].value_counts())


vulnerability_target
0    31002
1    13348
Name: count, dtype: int64


In [3]:
# Separate features and target
X = df.drop("vulnerability_target", axis=1)
y = df["vulnerability_target"]

# Extract minority class (1)
minority_data = X[y == 1].copy()
print("Minority class shape:", minority_data.shape)

# Identify binary (0/1) columns based on the minority portion
# (keeps one-hot columns clean after generation)
binary_cols = []
for col in minority_data.columns:
    vals = pd.Series(minority_data[col].dropna().unique())
    # treat 0.0/1.0 as binary too
    if set(vals.astype(float).round().unique()).issubset({0.0, 1.0}) and vals.nunique() <= 2:
        binary_cols.append(col)

print(f"Detected {len(binary_cols)} binary columns.")

# Store min/max per column for clipping synthetic values into real ranges
col_mins = minority_data.min(axis=0)
col_maxs = minority_data.max(axis=0)

# Convert to numpy (NO SCALING)
minority_np = minority_data.values.astype(np.float32)

# Input dimension for GAN
input_dim = minority_np.shape[1]
print("Input dimension:", input_dim)


Minority class shape: (13348, 55)
Detected 27 binary columns.
Input dimension: 55


In [4]:
class Generator(nn.Module):
    def __init__(self, input_dim, noise_dim=64):
        super(Generator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(noise_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    def forward(self, z):
        return self.net(z)

class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

# Initialize models & optimizers
noise_dim = 64
generator = Generator(input_dim=input_dim, noise_dim=noise_dim)
discriminator = Discriminator(input_dim=input_dim)

criterion = nn.BCELoss()
g_optimizer = optim.Adam(generator.parameters(), lr=0.0002)
d_optimizer = optim.Adam(discriminator.parameters(), lr=0.0002)


In [5]:
# Convert minority data to torch tensor (no scaling)
minority_tensor = torch.tensor(minority_np, dtype=torch.float32)

epochs = 3000
batch_size = 64

for epoch in range(epochs):
    # === Train Discriminator ===
    idx = np.random.randint(0, minority_tensor.shape[0], size=batch_size)
    real_data = minority_tensor[idx]
    real_labels = torch.ones(batch_size, 1)

    z = torch.randn(batch_size, noise_dim)
    fake_data = generator(z).detach()
    fake_labels = torch.zeros(batch_size, 1)

    d_real = discriminator(real_data)
    d_fake = discriminator(fake_data)

    d_loss_real = criterion(d_real, real_labels)
    d_loss_fake = criterion(d_fake, fake_labels)
    d_loss = d_loss_real + d_loss_fake

    discriminator.zero_grad()
    d_loss.backward()
    d_optimizer.step()

    # === Train Generator ===
    z = torch.randn(batch_size, noise_dim)
    generated = generator(z)
    g_loss = criterion(discriminator(generated), torch.ones(batch_size, 1))

    generator.zero_grad()
    g_loss.backward()
    g_optimizer.step()

    if epoch % 500 == 0:
        print(f"Epoch {epoch}: D Loss={d_loss.item():.4f}, G Loss={g_loss.item():.4f}")


Epoch 0: D Loss=0.7427, G Loss=0.6468
Epoch 500: D Loss=100.0186, G Loss=4.0124
Epoch 1000: D Loss=100.0022, G Loss=6.1369
Epoch 1500: D Loss=100.0007, G Loss=7.2420
Epoch 2000: D Loss=100.0003, G Loss=7.9677
Epoch 2500: D Loss=100.0002, G Loss=8.5768


In [6]:
# How many synthetic samples?
num_to_generate = 17000

# Generate
z = torch.randn(num_to_generate, noise_dim)
generated_data = generator(z).detach().cpu().numpy()

# Put into a DataFrame
synthetic_df = pd.DataFrame(generated_data, columns=X.columns)

# --- Post-process to keep data usable for models ---

# (A) Clip each column to the real minority's min/max to avoid extreme values
for col in synthetic_df.columns:
    synthetic_df[col] = np.clip(
        synthetic_df[col],
        col_mins[col],
        col_maxs[col]
    )

# (B) Force binary columns back to 0/1 using threshold 0.5
if len(binary_cols) > 0:
    synthetic_df[binary_cols] = (synthetic_df[binary_cols] >= 0.5).astype(int)

# Add target label
synthetic_df['vulnerability_target'] = 1

print("Synthetic data shape:", synthetic_df.shape)


Synthetic data shape: (17000, 56)


In [7]:
# Original splits
majority_df = df[df['vulnerability_target'] == 0]
minority_df_real = df[df['vulnerability_target'] == 1]

# Combine
balanced_df = pd.concat([majority_df, minority_df_real, synthetic_df], ignore_index=True)

# Shuffle
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("RBalanced dataset shape:", balanced_df.shape)
print("RBalanced class distribution:\n", balanced_df['vulnerability_target'].value_counts())

# Save
balanced_df.to_csv("RBalanced_gbv_data.csv", index=False)
print("RBalanced dataset saved as 'balanced_gbv_data.csv'")


RBalanced dataset shape: (61350, 56)
RBalanced class distribution:
 vulnerability_target
0    31002
1    30348
Name: count, dtype: int64
RBalanced dataset saved as 'balanced_gbv_data.csv'
