# Using a Generative Adversarial Network to Create Zero-Day Attacks

In [1]:
import pandas as pd
import numpy as np
from ctgan import synthesizers
from ctgan import CTGAN
import matplotlib.pyplot as plt
import seaborn as sns

### Creating Zero-Day Attacks from the NF-BoT-IoT Dataset Attack Categories

In [24]:
# Augment the NF-BoT-IoT DDoS subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-BoT-IoT/NF-BoT-IoT-DDoS.csv"
output_csv = "Augmented_NF-BoT-IoT-DDoS.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print(f"Augmented dataset saved successfully as {output_csv}")

# Summary
# -----------
print("\nProcess completed successfully!")


Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 56844

Training CTGAN on attack data...


Gen. (0.82) | Discrim. (-0.07): 100%|█████████| 100/100 [03:07<00:00,  1.87s/it]

CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 57844
Number of normal samples: 13859
Total samples in augmented dataset: 71703
Augmented dataset shuffled.
Augmented dataset saved successfully as

Process completed successfully!





In [3]:
# Augment the NF-BoT-IoT DoS subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-BoT-IoT/NF-BoT-IoT-DoS.csv"
output_csv = "Augmented_NF-BoT-IoT-DoS.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully ")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 56833

Training CTGAN on attack data...


Gen. (0.61) | Discrim. (-0.28): 100%|█████████| 100/100 [02:45<00:00,  1.66s/it]

CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 57833
Number of normal samples: 13859
Total samples in augmented dataset: 71692
Augmented dataset shuffled.
Augmented dataset saved successfully 

Process completed successfully!





In [4]:
# Augment the NF-BoT-IoT Recon subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-BoT-IoT/NF-BoT-IoT-Recon.csv"
output_csv = "Augmented_NF-BoT-IoT-Recon.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully ")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 470655

Training CTGAN on attack data...


Gen. (0.12) | Discrim. (-0.01): 100%|█████████| 100/100 [22:28<00:00, 13.49s/it]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 471655
Number of normal samples: 13859
Total samples in augmented dataset: 485514
Augmented dataset shuffled.
Augmented dataset saved successfully 

Process completed successfully!


In [20]:
# Augment the NF-BoT-IoT Theft subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-BoT-IoT/NF-BoT-IoT-Theft.csv"
output_csv = "Augmented_NF-BoT-IoT-Theft.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 1909

Training CTGAN on attack data...


Gen. (-1.42) | Discrim. (-0.24): 100%|████████| 100/100 [00:04<00:00, 21.65it/s]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 2909
Number of normal samples: 13859
Total samples in augmented dataset: 16768
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


### Creating Zero-Day Attacks from the NF-ToN-IoT Dataset Attack Categories

In [6]:
# Augment the NF-ToN-IoT DDoS subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-DDoS.csv"
output_csv = "Augmented_NF-ToN-IoT-DDoS.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 149621

Training CTGAN on attack data...


Gen. (-0.42) | Discrim. (-0.12): 100%|████████| 100/100 [06:53<00:00,  4.13s/it]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 150621
Number of normal samples: 179647
Total samples in augmented dataset: 330268
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


In [7]:
# Augment the NF-ToN-IoT DoS subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-DoS.csv"
output_csv = "Augmented_NF-ToN-IoT-DoS.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 7533

Training CTGAN on attack data...


Gen. (-1.39) | Discrim. (-0.13): 100%|████████| 100/100 [00:17<00:00,  5.84it/s]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 8533
Number of normal samples: 179647
Total samples in augmented dataset: 188180
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


In [8]:
# Augment the NF-ToN-IoT Injection subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-Injection.csv"
output_csv = "Augmented_NF-ToN-IoT-Injection.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 680347

Training CTGAN on attack data...


Gen. (-0.74) | Discrim. (0.14): 100%|█████████| 100/100 [30:01<00:00, 18.02s/it]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 681347
Number of normal samples: 179647
Total samples in augmented dataset: 860994
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


In [9]:
# Augment the NF-ToN-IoT Malware subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-Malware.csv"
output_csv = "Augmented_NF-ToN-IoT-Malware.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 17389

Training CTGAN on attack data...


Gen. (1.02) | Discrim. (-0.53): 100%|█████████| 100/100 [00:44<00:00,  2.27it/s]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 18389
Number of normal samples: 179647
Total samples in augmented dataset: 198036
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


In [10]:
# Augment the NF-ToN-IoT MITM subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-MITM.csv"
output_csv = "Augmented_NF-ToN-IoT-MITM.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 1215

Training CTGAN on attack data...


Gen. (-1.09) | Discrim. (-0.19): 100%|████████| 100/100 [00:02<00:00, 34.95it/s]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 2215
Number of normal samples: 179647
Total samples in augmented dataset: 181862
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!


In [11]:
# Augment the NF-ToN-IoT Scan subset with CTGAN-generated attack rows.
#
# Steps: load the CSV, split rows on the binary `Label`, fit CTGAN on the
# attack features only, sample synthetic attack rows, then merge, shuffle
# and save real + synthetic data as a single CSV.
#
# NOTE(review): no random seed is set for CTGAN, so the synthetic rows
# differ between runs.
# NOTE(review): the input path is absolute and machine-specific.

# Load the Dataset
input_csv = "/Users/promisea/SAMKNN/NF-TON-IoT_v1/NF-ToN-IoT-Scan.csv"
output_csv = "Augmented_NF-ToN-IoT-Scan.csv"
df = pd.read_csv(input_csv)
print("Dataset loaded successfully.")

# Data Preparation
# -------------------
# Remove the address columns and both target columns from the features;
# the binary target is re-attached below as `Label`.
X = df.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'Attack', 'Label'])
y = df['Label']

# Encode labels if they are not numeric.
# NOTE(review): `cat.codes` numbers categories in sorted order, so for a
# string label confirm that code 1 really is the attack class.
if y.dtype == 'object' or y.dtype.name == 'category':
    y = y.astype('category').cat.codes

# Features plus the (encoded) label in one frame
data = X.assign(Label=y)

# Identify categorical columns
# NOTE(review): detection is dtype-based only; integer-coded categorical
# fields (if any) are modelled as continuous - confirm this is intended.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_columns}")

# Filter attack data (assuming Label=1 indicates attack)
attack_data = data[data['Label'] == 1]
print(f"Number of attack samples: {len(attack_data)}")
if attack_data.empty:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError("No rows with Label == 1; nothing to train CTGAN on.")

# Train CTGAN
# --------------
ctgan = CTGAN(
    epochs=100,          # Number of training epochs
    batch_size=500,      # Batch size
    verbose=True         # Verbose output during training
)

print("\nTraining CTGAN on attack data...")
# `Label` is constant (1) within the attack subset, so it carries no
# information for the generator; it is left out of training and
# re-attached to the samples below.
ctgan.fit(attack_data.drop(columns='Label'), discrete_columns=categorical_columns)
print("CTGAN training completed.")

# Generate Synthetic Attack Data
# ----------------------------------
num_synthetic_samples = 1000  # Adjust as needed
print(f"\nGenerating {num_synthetic_samples} synthetic attack samples...")
synthetic_attack_data = ctgan.sample(num_synthetic_samples)
synthetic_attack_data['Label'] = 1  # every generated row is an attack
print("Synthetic attack data generated successfully.")

# Integrate Synthetic Data
# ----------------------------
# Real + synthetic attack rows
augmented_attack_data = pd.concat([attack_data, synthetic_attack_data], ignore_index=True)
print(f"Total attack samples after augmentation: {len(augmented_attack_data)}")

# Normal rows (Label=0) are kept unchanged
normal_data = data[data['Label'] == 0]
print(f"Number of normal samples: {len(normal_data)}")

augmented_data = pd.concat([normal_data, augmented_attack_data], ignore_index=True)
print(f"Total samples in augmented dataset: {len(augmented_data)}")

# Shuffle with a fixed seed so the row order is repeatable for a given input
augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Augmented dataset shuffled.")

# Save Augmented Data
# ----------------------------------
# The whole augmented dataset (normal + real attack + synthetic attack) is
# written, not just the synthetic rows.
augmented_data.to_csv(output_csv, index=False)
print("Augmented dataset saved successfully")

# Summary
# -----------
print("\nProcess completed successfully!")

Dataset loaded successfully.
Categorical columns: []
Number of attack samples: 12823

Training CTGAN on attack data...


Gen. (-0.16) | Discrim. (-0.19): 100%|████████| 100/100 [00:35<00:00,  2.82it/s]


CTGAN training completed.

Generating 1000 synthetic attack samples...
Synthetic attack data generated successfully.
Total attack samples after augmentation: 13823
Number of normal samples: 179647
Total samples in augmented dataset: 193470
Augmented dataset shuffled.
Augmented dataset saved successfully

Process completed successfully!
