In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


In [15]:

# Step 1: Load and preprocess your dataset
data = pd.read_csv('../bcwd.csv')
X = data.drop('Class', axis=1)
y = data['Class']

In [24]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Step 2: Apply SMOTE to balance the dataset
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(xtrain, ytrain)

# Filter for minority class (assuming label '1' is the minority class)
minority_class = X_resampled[y_resampled == 0]

In [27]:
minority_class.shape

(286, 30)

In [28]:

# Step 3: Create a GAN architecture


# Define the Generator
def build_generator(input_dim, output_dim):
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_dim=input_dim),
        layers.Dense(256, activation='relu'),
        layers.Dense(output_dim, activation='sigmoid')  # Adjust activation based on output needs
    ])
    return model

# Define the Discriminator
def build_discriminator(input_dim):
    model = keras.Sequential([
        layers.Dense(256, activation='relu', input_dim=input_dim),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # Output: real or fake
    ])
    return model


In [29]:

# Step 4: Compile the GAN
input_dim = minority_class.shape[1]  # Number of features
generator = build_generator(input_dim, input_dim)
discriminator = build_discriminator(input_dim)

discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Create the GAN model
z = layers.Input(shape=(input_dim,))
generated_data = generator(z)
discriminator.trainable = False
validity = discriminator(generated_data)
gan = keras.Model(z, validity)

gan.compile(loss='binary_crossentropy', optimizer='adam')


In [32]:

# Step 5: Training Loop
def train_gan(epochs, batch_size):
    for epoch in range(epochs):
        # Sample synthetic data from the SMOTE-generated dataset as input noise
        idx = np.random.randint(0, minority_class.shape[0], batch_size)
        noise = minority_class.iloc[idx].values  # Use SMOTE-generated data as noise
        
        # Generate synthetic data
        generated_data = generator.predict(noise, verbose=0)

        # Labels for real and fake data
        real_labels = np.ones((batch_size, 1))
        fake_labels = np.zeros((batch_size, 1))

        # Sample real data from the minority class
        real_idx = np.random.randint(0, minority_class.shape[0], batch_size)
        real_data = minority_class.iloc[real_idx].values  # Convert to numpy array

        # Train on real and fake data
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)

        # Train the generator
        g_loss = gan.train_on_batch(noise, real_labels)  # Use the same synthetic data for generator

        # Print progress
        if epoch % 100 == 0:
            print(f'Epoch: {epoch}, D Loss Real: {d_loss_real[0]}, D Loss Fake: {d_loss_fake[0]}, G Loss: {g_loss}')


In [33]:

# Train the Smotified GAN
train_gan(epochs=5000, batch_size=64)

# Step 6: Generate synthetic data for minority class
num_samples = 200  # Adjust as needed
# Generate new samples using the generator with SMOTE data as input
noise = minority_class.sample(num_samples, replace=True).values  # Sampling synthetic noise
synthetic_data = generator.predict(noise)

# Convert synthetic data to a DataFrame if needed
synthetic_df = pd.DataFrame(synthetic_data, columns=X.columns)


Epoch: 0, D Loss Real: 2.261296631617615e-36, D Loss Fake: 7.022300678727333e-07, G Loss: 14.171381950378418
Epoch: 100, D Loss Real: 2.2897015945173184e-34, D Loss Fake: 4.5072351895214524e-07, G Loss: 14.6145601272583
Epoch: 200, D Loss Real: 2.657567156271754e-35, D Loss Fake: 3.1334025152318645e-07, G Loss: 14.977738380432129
Epoch: 300, D Loss Real: 5.978281047612003e-33, D Loss Fake: 2.3003566695933841e-07, G Loss: 15.286734580993652
Epoch: 400, D Loss Real: 7.278525001489282e-29, D Loss Fake: 1.748909710386215e-07, G Loss: 15.560216903686523
Epoch: 500, D Loss Real: 8.850520029149015e-30, D Loss Fake: 1.3680600829957257e-07, G Loss: 15.806042671203613
Epoch: 600, D Loss Real: 7.22132147324498e-26, D Loss Fake: 1.0957514717802042e-07, G Loss: 16.02785873413086
Epoch: 700, D Loss Real: 1.7404311514172615e-26, D Loss Fake: 8.933613315775801e-08, G Loss: 16.231996536254883
Epoch: 800, D Loss Real: 6.598748749059064e-26, D Loss Fake: 7.380644717613905e-08, G Loss: 16.4231014251709
Ep

In [34]:
synthetic_df.head()

Unnamed: 0,radius_0ean,texture_0ean,peri0eter_0ean,area_0ean,s0oothness_0ean,co0pactness_0ean,concavity_0ean,concave points_0ean,sy00etry_0ean,fractal_di0ension_0ean,...,radius_worst,texture_worst,peri0eter_worst,area_worst,s0oothness_worst,co0pactness_worst,concavity_worst,concave points_worst,sy00etry_worst,fractal_di0ension_worst
0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,8.530664e-34,0.0,0.0
1,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,5.383519e-37,0.0,1.8497010000000001e-31,0.0,0.0
2,0.0,0.0,1.0,1.0,1.0,1.228333e-30,1.307045e-22,1.0,1.0,1.3305889999999999e-30,...,1.118533e-24,1.0,0.0,2.65877e-22,1.0,8.327307000000001e-17,8.687755e-38,1.819187e-13,1.241461e-31,1.735044e-30
3,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,2.209119e-32,0.0,8.056582000000001e-25,0.0,0.0
4,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,2.7072769999999996e-26,1.0,4.354917e-23,0.0,6.452467e-25,0.0,0.0
