# Traing on KDD datasets

In [1]:
import pandas as pd
import numpy as np
import sys
import sklearn
import io
import random

In [2]:
train_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Train.csv'
test_url = 'https://raw.githubusercontent.com/merteroglu/NSL-KDD-Network-Instrusion-Detection/master/NSL_KDD_Test.csv'

In [3]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]


df = pd.read_csv(train_url,header=None, names = col_names)

df_test = pd.read_csv(test_url, header=None, names = col_names)

print('Dimensions of the Training set:',df.shape)
print('Dimensions of the Test set:',df_test.shape)

Dimensions of the Training set: (125973, 42)
Dimensions of the Test set: (22544, 42)


In [4]:
print('Label distribution Training set:')
print(df['label'].value_counts())
print()
print('Label distribution Test set:')
print(df_test['label'].value_counts())

Label distribution Training set:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: label, dtype: int64

Label distribution Test set:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178


In [5]:
print('Training set:')
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        unique_cat = len(df[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))

print()
print('Distribution of categories in service:')
print(df['service'].value_counts().sort_values(ascending=False).head())

Training set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 70 categories
Feature 'flag' has 11 categories
Feature 'label' has 23 categories

Distribution of categories in service:
http        40338
private     21853
domain_u     9043
smtp         7313
ftp_data     6860
Name: service, dtype: int64


In [6]:
# Test set
print('Test set:')
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        unique_cat = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {unique_cat} categories".format(col_name=col_name, unique_cat=unique_cat))


Test set:
Feature 'protocol_type' has 3 categories
Feature 'service' has 64 categories
Feature 'flag' has 11 categories
Feature 'label' has 38 categories


In [7]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
categorical_columns=['protocol_type', 'service', 'flag']

df_categorical_values = df[categorical_columns]
testdf_categorical_values = df_test[categorical_columns]

df_categorical_values.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [8]:
# protocol type
unique_protocol=sorted(df.protocol_type.unique())
string1 = 'Protocol_type_'
unique_protocol2=[string1 + x for x in unique_protocol]
print(unique_protocol2)

# service
unique_service=sorted(df.service.unique())
string2 = 'service_'
unique_service2=[string2 + x for x in unique_service]
print(unique_service2)


# flag
unique_flag=sorted(df.flag.unique())
string3 = 'flag_'
unique_flag2=[string3 + x for x in unique_flag]
print(unique_flag2)


# put together
dumcols=unique_protocol2 + unique_service2 + unique_flag2


#do it for test set
unique_service_test=sorted(df_test.service.unique())
unique_service2_test=[string2 + x for x in unique_service_test]
testdumcols=unique_protocol2 + unique_service2_test + unique_flag2


['Protocol_type_icmp', 'Protocol_type_tcp', 'Protocol_type_udp']
['service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest', 'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'ser

In [9]:
df_categorical_values_enc=df_categorical_values.apply(LabelEncoder().fit_transform)

print(df_categorical_values.head())
print('--------------------')
print(df_categorical_values_enc.head())

# test set
testdf_categorical_values_enc=testdf_categorical_values.apply(LabelEncoder().fit_transform)

  protocol_type   service flag
0           tcp  ftp_data   SF
1           udp     other   SF
2           tcp   private   S0
3           tcp      http   SF
4           tcp      http   SF
--------------------
   protocol_type  service  flag
0              1       20     9
1              2       44     9
2              1       49     5
3              1       24     9
4              1       24     9


In [10]:
df[categorical_columns] = df[categorical_columns].apply(LabelEncoder().fit_transform)
print(df.head())

   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       20     9        491          0     0   
1         0              2       44     9        146          0     0   
2         0              1       49     5          0          0     0   
3         0              1       24     9        232       8153     0   
4         0              1       24     9        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10                    0.

In [11]:
df['label'] = df['label'].apply(lambda x: 0 if x == 'normal' else 1)
print(df.head())

   duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0         0              1       20     9        491          0     0   
1         0              2       44     9        146          0     0   
2         0              1       49     5          0          0     0   
3         0              1       24     9        232       8153     0   
4         0              1       24     9        199        420     0   

   wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0               0       0    0  ...                  25   
1               0       0    0  ...                   1   
2               0       0    0  ...                  26   
3               0       0    0  ...                 255   
4               0       0    0  ...                 255   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                    0.17                    0.03   
1                    0.00                    0.60   
2                    0.10                    0.

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Assuming df is your DataFrame and it has a column 'label' indicating normal (1) and abnormal (0) data
# Separate normal and anomaly data
normal_data_df = df[df['label'] == 1].drop(columns=['label']).reset_index(drop=True)
anomaly_data_df = df[df['label'] == 0].drop(columns=['label']).reset_index(drop=True)

# Convert DataFrame to NumPy array
normal_data = normal_data_df.values.astype(np.float32)
anomaly_data = anomaly_data_df.values.astype(np.float32)

# Normalize data to the range [-1, 1] using MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
normal_data = scaler.fit_transform(normal_data)
anomaly_data = scaler.transform(anomaly_data)

# Convert to PyTorch tensor
normal_data = torch.tensor(normal_data)
anomaly_data = torch.tensor(anomaly_data)

# Print a few examples
print("Examples of normalized data:")
print(normal_data[:5])  # Print the first 5 rows

# Hyperparameters
batch_size = 64
latent_dim = 100
learning_rate = 0.0002
num_epochs = 100

# DataLoader
data_loader = torch.utils.data.DataLoader(normal_data, batch_size=batch_size, shuffle=True)


Examples of normalized data:
tensor([[-1.0000,  0.0000,  0.4203,  0.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -0.5186, -0.9765,
          1.0000,  1.0000, -1.0000, -1.0000, -0.9000, -0.8600, -1.0000,  1.0000,
         -0.8031, -0.8000, -0.9000, -1.0000, -1.0000,  1.0000,  1.0000, -1.0000,
         -1.0000],
        [-1.0000,  0.0000,  0.4203, -0.8000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -1.0000, -0.5264, -0.9256,
         -1.0000, -1.0000,  1.0000,  1.0000, -0.6800, -0.8800, -1.0000,  1.0000,
         -0.8583, -0.8600, -0.8600, -1.0000, -1.0000, -1.0000, -1.0000,  1.0000,
          1.0000],
        [-1.0000,  0.0000,  0.4203,  0.0000, -1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -1.0000, -1.0000, -1.000

In [14]:
class Discriminator(nn.Module):
    def __init__(self, img_dim):
        super().__init__()
        self.disc = nn.Sequential(
            nn.Linear(img_dim, 128),
            nn.LeakyReLU(0.1),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.disc(x)

class Generator(nn.Module):
    def __init__(self, z_dim, img_dim):
        super().__init__()
        self.gen = nn.Sequential(
            nn.Linear(z_dim, 256),

            nn.LeakyReLU(0.1),
            nn.Linear(256, img_dim),
            nn.Tanh()
        )

    def forward(self, x):
        return self.gen(x)

In [32]:
import os
sys.path.append('/homes/hp921/y3finalproject')
from utils import *
from torch.utils.tensorboard import SummaryWriter

save_path = 'savedmodel/'
os.makedirs(save_path, exist_ok=True)

# Setting up TensorBoard
writer = SummaryWriter('/homes/hp921/y3finalproject/runs/vanilla_KDD')


In [30]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("using ", device)

# Define your models, optimizer, and loss function
generator = Generator(latent_dim, normal_data.shape[1]).to(device)
discriminator = Discriminator(normal_data.shape[1]).to(device)

criterion = nn.BCELoss()
optimizer_G = optim.Adam(generator.parameters(), lr=learning_rate)
optimizer_D = optim.Adam(discriminator.parameters(), lr=learning_rate)

# Training loop
for epoch in range(1000):  # Reduced number of epochs for faster trials
    for batch_idx, real_data in enumerate(data_loader):
        real = real_data.to(device)
        noise = torch.randn(real.size(0), latent_dim).to(device)
        fake = generator(noise)

        # Train Discriminator
        disc_real = discriminator(real)
        disc_fake = discriminator(fake.detach())

        lossD_real = criterion(disc_real, torch.ones_like(disc_real))
        lossD_fake = criterion(disc_fake, torch.zeros_like(disc_fake))
        lossD = (lossD_real + lossD_fake) / 2

        optimizer_D.zero_grad()
        lossD.backward()
        optimizer_D.step()

        # Train Generator
        output = discriminator(fake)
        lossG = criterion(output, torch.ones_like(output))

        optimizer_G.zero_grad()
        lossG.backward()
        optimizer_G.step()

    # Calculate MMD and EMD scores
    mmd_score = compute_mmd(real, fake)
    emd_score = compute_emd(real, fake)

    # Get GPU usage
    memory_used, memory_total, utilization = get_gpu_usage()

    print(f"Epoch: {epoch}, Loss D: {lossD.item()}, Loss G: {lossG.item()}, MMD: {mmd_score}, EMD: {emd_score}, GPU Memory: {memory_used}/{memory_total} MiB, GPU Utilization: {utilization}%")
    # Log losses and scores to TensorBoard
    writer.add_scalar('Loss/Discriminator', lossD.item(), epoch)
    writer.add_scalar('Loss/Generator', lossG.item(), epoch)
    writer.add_scalar('Score/MMD', mmd_score, epoch)
    writer.add_scalar('Score/EMD', emd_score, epoch)
    writer.add_scalar('GPU/Memory_Used', memory_used, epoch)
    writer.add_scalar('GPU/Memory_Total', memory_total, epoch)
    writer.add_scalar('GPU/Utilization', utilization, epoch)

    print(f'Epoch [{epoch+1}/1000]  Loss D: {lossD.item()}, Loss G: {lossG.item()}')



# Save final model checkpoints
torch.save(generator.state_dict(), f'{save_path}generator_final.pth')
torch.save(discriminator.state_dict(), f'{save_path}discriminator_final.pth')



using  cuda
Epoch: 0, Loss D: 0.5600278973579407, Loss G: 0.7757691144943237, MMD: 0.889597475528717, EMD: 0.3784097120226399, GPU Memory: 329/8192 MiB, GPU Utilization: 4%
Epoch [1/1000]  Loss D: 0.5600278973579407, Loss G: 0.7757691144943237
Epoch: 1, Loss D: 0.4996947944164276, Loss G: 0.9124688506126404, MMD: 0.624276340007782, EMD: 0.23607543576319462, GPU Memory: 329/8192 MiB, GPU Utilization: 11%
Epoch [2/1000]  Loss D: 0.4996947944164276, Loss G: 0.9124688506126404
Epoch: 2, Loss D: 0.6413356065750122, Loss G: 0.7315020561218262, MMD: 0.6365139484405518, EMD: 0.2006716600683616, GPU Memory: 329/8192 MiB, GPU Utilization: 6%
Epoch [3/1000]  Loss D: 0.6413356065750122, Loss G: 0.7315020561218262
Epoch: 3, Loss D: 0.7111449837684631, Loss G: 0.66212397813797, MMD: 0.45910197496414185, EMD: 0.17795410881015833, GPU Memory: 329/8192 MiB, GPU Utilization: 15%
Epoch [4/1000]  Loss D: 0.7111449837684631, Loss G: 0.66212397813797
Epoch: 4, Loss D: 0.6194382905960083, Loss G: 0.774342238

KeyboardInterrupt: 

In [24]:
from sklearn.metrics import accuracy_score

generator = Generator(latent_dim, normal_data.shape[1]).to(device)
discriminator = Discriminator(normal_data.shape[1]).to(device)

# Evaluate discriminator accuracy on normal and anomaly data
def evaluate_discriminator(discriminator, data, labels):
    with torch.no_grad():
        predictions = discriminator(data.to(device)).cpu()
        predicted_labels = (predictions > 0.5).float()
        accuracy = accuracy_score(labels, predicted_labels)
        return accuracy

# Prepare labels
normal_labels = torch.ones(normal_data.size(0))
anomaly_labels = torch.zeros(anomaly_data.size(0))

# Evaluate on normal data
normal_accuracy = evaluate_discriminator(discriminator, normal_data, normal_labels)
print(f'Accuracy on normal data: {normal_accuracy:.4f}')

# Evaluate on anomaly data
anomaly_accuracy = evaluate_discriminator(discriminator, anomaly_data, anomaly_labels)
print(f'Accuracy on anomaly data: {anomaly_accuracy:.4f}')

Accuracy on normal data: 0.7461
Accuracy on anomaly data: 0.9829
