# Config

**TODO:** The environment is set up with Anaconda; document the conda installation and setup steps.

## Select Device Type

In [None]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("NOTE : GPU in use" if device.type == "cuda" else "NOTE : CPU in use")

## Set Seed

In [None]:
# Seed PyTorch's global random number generator so runs are repeatable.
SEED = 420
torch.manual_seed(SEED)

# Train GAN

## Get Real Data

In [None]:
import pandas as pd

# Load the raw training records; header=None makes pandas label the columns 0..N-1.
gan_train_data = pd.read_csv('KDDTrain+TopRowColNames.txt', header=None)

# drop_duplicates() returns a new frame, so the result must be assigned back
# for the de-duplication to take effect. ignore_index keeps a contiguous
# 0..n-1 row index afterwards.
gan_train_data = gan_train_data.drop_duplicates(ignore_index=True)


print(gan_train_data.columns)
print(gan_train_data)

## Prepare Real Data

### Numericalize

In [None]:
# List every column whose dtype is not float, together with that dtype.
for col, col_dtype in gan_train_data.dtypes.items():
    if col_dtype != float:
        print(f"{col} of type {col_dtype}")

In [None]:
# Enumerate distinct values in col and add to dict
def get_unique_keys_and_enumerate_dict(df, col_name, verbose=True):
    """Give every distinct value of one column a consecutive integer code.

    Codes are assigned in order of first appearance, starting at 0.

    Args:
        df: dataframe (or any mapping) whose ``df[col_name]`` is iterable.
        col_name: label of the target column.
        verbose: when True (the default) print the resulting dict followed by
            three blank separator blocks.

    Returns:
        dict in the form {'tcp': 0, 'udp': 1, 'icmp': 2}
    """
    # dict.fromkeys keeps first-seen order and de-duplicates through hash
    # lookups, so the column is walked once instead of scanning a list of
    # already-seen values for every row.
    distinct_values = dict.fromkeys(df[col_name])
    dict_items = {value: index for index, value in enumerate(distinct_values)}
    if verbose:
        print(dict_items)
        print('\n')
        print('\n')
        print('\n')
    return dict_items

#get_unique_keys_and_enumerate_dict(gan_train_data, 1)

In [None]:
# Col Dictionaries
attack_types = get_unique_keys_and_enumerate_dict(gan_train_data, 41)
# Binary view of the class labels: 'normal' -> 0, every other label -> 1.
# 'normal' is set to 0 explicitly so the result does not depend on 'normal'
# happening to be the first label enumerated.
attack_types_binary = {label: (0 if label == 'normal' else 1) for label in attack_types}
print(attack_types_binary)
print('\n')
print('\n')
print('\n')

# Integer codes for the three categorical feature columns (1, 2 and 3).
protocol_types = get_unique_keys_and_enumerate_dict(gan_train_data, 1)
service = get_unique_keys_and_enumerate_dict(gan_train_data, 2)
flag =  get_unique_keys_and_enumerate_dict(gan_train_data, 3)


In [None]:
# Numericalize protocol_types
# Translate each label to its integer code in one pass over the column
# instead of running a whole-column replace() for every row.
# Entries that are already plain int/float (e.g. when the cell is re-run)
# are kept unchanged; an unknown label raises KeyError.
gan_train_data[1] = gan_train_data[1].map(
    lambda value: value if type(value) in (int, float) else protocol_types[value]
)

In [None]:
# Numericalize service
# Translate each label to its integer code in one pass over the column
# instead of running a whole-column replace() for every row.
# Entries that are already plain int/float (e.g. when the cell is re-run)
# are kept unchanged; an unknown label raises KeyError.
gan_train_data[2] = gan_train_data[2].map(
    lambda value: value if type(value) in (int, float) else service[value]
)
    
    

In [None]:
# Numericalize flag
# Translate each label to its integer code in one pass over the column
# instead of running a whole-column replace() for every row.
# Entries that are already plain int/float (e.g. when the cell is re-run)
# are kept unchanged; an unknown label raises KeyError.
gan_train_data[3] = gan_train_data[3].map(
    lambda value: value if type(value) in (int, float) else flag[value]
)

In [None]:
# Numericalize class
class_labels = gan_train_data[41]

# Only entries that are still raw labels (not plain int/float) are counted
# and translated, so re-running the cell on numeric data counts nothing.
is_raw_label = class_labels.map(lambda value: type(value) not in (int, float)).astype(bool)
normal_traffic_count = int((is_raw_label & (class_labels == 'normal')).sum())
attack_traffic_count = int(is_raw_label.sum()) - normal_traffic_count

# Single pass over the column instead of one whole-column replace() per row.
gan_train_data[41] = class_labels.map(
    lambda value: value if type(value) in (int, float) else attack_types[value]
)

In [None]:
# Show the normal / attack row counts gathered while numericalizing the class column.
print(f"{normal_traffic_count} {attack_traffic_count}")

In [None]:
# Cast the whole frame to float64 in one call, then show it as the cell output.
gan_train_data = gan_train_data.astype('float64')
gan_train_data

In [None]:
# Show the column dtypes followed by the frame itself.
print(gan_train_data.dtypes, gan_train_data, sep='\n')

### Normalization

In [None]:
# Min-Max scaling, applied to each column independently:
#   x_i = (x_i - Min) / (Max - Min)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
exclude_cols = []
columns_to_scale = [col for col in gan_train_data if col not in exclude_cols]
for col in columns_to_scale:
    # The scaler is re-fitted on every column; double brackets pass the
    # column as a one-column frame.
    gan_train_data[col] = scaler.fit_transform(gan_train_data[[col]])

In [None]:
# Print the value range of every column after scaling.
for col in gan_train_data.columns:
    column_values = gan_train_data[col]
    print(f"Min: {column_values.min()}, Max: {column_values.max()}")

# Show the scaled frame and cache it on disk (the row index is written too).
print(gan_train_data)
gan_train_data.to_csv('gan_train_data.txt')

In [None]:
# Rows whose scaled class value reaches the threshold count as attack traffic;
# every other row counts as normal traffic.
normal_threshold = .02
attack_traffic_count = int((gan_train_data[41] >= normal_threshold).sum())
normal_traffic_count = len(gan_train_data[41]) - attack_traffic_count

In [None]:
# Show the normal / attack row counts obtained from the threshold split.
print(f"{normal_traffic_count} {attack_traffic_count}")

## Fit Models

### Create SDV Metadata

In [None]:
import os
# Get metadata from dataframes
if 'gan_train_data.txt' in os.listdir():
    # The cache is written by DataFrame.to_csv with the row index as its
    # first column. Reading that column back as the index keeps it from
    # turning into an extra 'Unnamed: 0' feature column.
    gan_train_data = pd.read_csv('gan_train_data.txt', index_col=0)
    print('loaded gan training data: ')

from sdv.metadata import SingleTableMetadata
print(type(gan_train_data))

# Column labels are converted to strings once, up front, so the frame's labels
# match the string names registered in the metadata.
gan_train_data = gan_train_data.rename(columns=str)

# Register every column as a float-valued numerical column.
gan_train_metadata = SingleTableMetadata()
for col in gan_train_data.columns:
    gan_train_metadata.add_column(
        column_name=col,
        sdtype='numerical',
        computer_representation='Float')
#print(gan_train_metadata.to_dict())
print(gan_train_data.columns)
#gan_train_metadata.detect_from_dataframe(gan_train_data)

### CTGAN Model

In [None]:
#TODO: RETRAIN GANS WITH DROPPED DUPES
from sdv.single_table import CTGANSynthesizer
import os

# Reuse a synthesizer saved in the working directory when there is one;
# otherwise fit a new one on gan_train_data and save it.
ctgan_is_cached = 'ctGanSynthesizer.pkl' in os.listdir()
if not ctgan_is_cached:
    ctGanSynthesizer = CTGANSynthesizer(gan_train_metadata, verbose=True)
    ctGanSynthesizer.fit(gan_train_data)
    ctGanSynthesizer.save(filepath='ctGanSynthesizer.pkl')
else:
    ctGanSynthesizer = CTGANSynthesizer.load(filepath='ctGanSynthesizer.pkl')
    print('CT GAN Synthesizer loaded into ctGanSynthesizer variable')

# Either way, report the synthesizer's parameters and metadata.
print(ctGanSynthesizer.get_parameters())
print(ctGanSynthesizer.get_metadata())

### CopulaGAN Model

In [None]:
from sdv.single_table import CopulaGANSynthesizer
import os

# Reuse a synthesizer saved in the working directory when there is one;
# otherwise fit a new one on gan_train_data and save it.
copula_is_cached = 'copulaGanSynthesizer.pkl' in os.listdir()
if not copula_is_cached:
    copulaGanSynthesizer = CopulaGANSynthesizer(gan_train_metadata, verbose=True)
    copulaGanSynthesizer.fit(gan_train_data)
    copulaGanSynthesizer.save(filepath='copulaGanSynthesizer.pkl')
else:
    copulaGanSynthesizer = CopulaGANSynthesizer.load(filepath='copulaGanSynthesizer.pkl')
    print('Coupla GAN Synthesizer loaded into copulaGanSynthesizer variable')

# Either way, report the synthesizer's parameters and metadata.
print(copulaGanSynthesizer.get_parameters())
print(copulaGanSynthesizer.get_metadata())

# Train Discriminators

## Get and Prepare Data

### Create Dataframes

In [None]:
import pandas as pd

# Real Data
if 'real_train_data.txt' not in os.listdir():
    # Take a copy: the class column of real_train_data is overwritten in place
    # further down, and a plain alias would rewrite gan_train_data as well.
    real_train_data = gan_train_data.copy()
    print('real_train_data')
    print(real_train_data)
else:
    # Cached CSVs are written with the row index as first column; read it back
    # as the index so it does not become an extra 'Unnamed: 0' column.
    real_train_data = pd.read_csv('real_train_data.txt', index_col=0)
    print('loaded real_train_data')
    print(real_train_data)

# Synthetic Data
### Possible dataset:
### ct vs coupla
### size of data vs stats of data (ratio attack_type:normal packets)
### separating each attack type into its own discriminator?

# ctGanSynthesizer
if 'ctGanSynthesizer_synthetic_train_data.txt' not in os.listdir():
    ctGanSynthesizer_synthetic_train_data = ctGanSynthesizer.sample(500000)
    # drop_duplicates() returns a new frame; assign it so duplicates really go away.
    ctGanSynthesizer_synthetic_train_data = ctGanSynthesizer_synthetic_train_data.drop_duplicates(ignore_index=True)
    print('\n\n ctGanSynthesizer_synthetic_train_data')
    print(ctGanSynthesizer_synthetic_train_data)

else:
    ctGanSynthesizer_synthetic_train_data = pd.read_csv('ctGanSynthesizer_synthetic_train_data.txt', index_col=0)
    print('\n\n loaded ctGanSynthesizer_synthetic_train_data')
    print(ctGanSynthesizer_synthetic_train_data)

# couplaGanSynthesizer (not working, still in beta, see below block)
if 'couplaGanSynthesizer_synthetic_train_data.txt' not in os.listdir():
    couplaGanSynthesizer_synthetic_train_data = copulaGanSynthesizer.sample(500000)
    # drop_duplicates() returns a new frame; assign it so duplicates really go away.
    couplaGanSynthesizer_synthetic_train_data = couplaGanSynthesizer_synthetic_train_data.drop_duplicates(ignore_index=True)
    print('\n\n couplaGanSynthesizer_synthetic_train_data')
    print(couplaGanSynthesizer_synthetic_train_data)

else:
    couplaGanSynthesizer_synthetic_train_data = pd.read_csv('couplaGanSynthesizer_synthetic_train_data.txt', index_col=0)
    print('\n\n loaded couplaGanSynthesizer_synthetic_train_data')
    print(couplaGanSynthesizer_synthetic_train_data)


In [None]:
#real_train_data.drop(columns=['Unnamed: 0'])
#ctGanSynthesizer_synthetic_train_data.drop(columns=['Unnamed: 0'])
#couplaGanSynthesizer_synthetic_train_data.drop(columns=['Unnamed: 0'])

#print('\n\n compare')
#for i in range(5): 
#    print(f'row:{i}')
#    for j in range(42):
#        print(f'real: {real_train_data.iloc[i][j]}, synthetic: {synthetic_train_data.iloc[i][j]}')



#get_unique_keys_and_enumerate_dict(real_train_data, '41')

### IMPORTANT: the CopulaGAN synthesizer is not creating similar enough data. It seems the normal class has disappeared from it.
# Draw a small sample from each synthesizer and list the distinct values of the class column '41'.
ctgan_preview = ctGanSynthesizer.sample(500)
get_unique_keys_and_enumerate_dict(ctgan_preview, '41')
copula_preview = copulaGanSynthesizer.sample(500)
dict1 = get_unique_keys_and_enumerate_dict(copula_preview, '41')


### Binary classification - attack method no longer specified

In [None]:
# Real Data
normal_threshold = .02

# The class column is labelled 41 while the frame still has integer column
# labels, and '41' once the labels are strings (after the str rename or a CSV
# round trip). Accept either instead of assuming the integer label.
class_col = 41 if 41 in real_train_data.columns else '41'

# One vectorised comparison replaces a whole-column replace() per row.
# Values at or above the threshold are attack traffic, the rest normal.
is_attack = real_train_data[class_col] >= normal_threshold
attack_traffic_count = int(is_attack.sum())
normal_traffic_count = len(is_attack) - attack_traffic_count

if 'real_train_data.txt' not in os.listdir():
    # Overwrite the class column with 1.0 (attack) / 0.0 (normal) and cache the frame.
    real_train_data[class_col] = is_attack.astype('float64')
    print('real_train_data classes:')
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')
    print(real_train_data[class_col])
    print(normal_traffic_count, attack_traffic_count)
    real_train_data.to_csv('real_train_data.txt')
else:
    # The frame was loaded from the cache; only report the class counts.
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')


In [None]:
# ct GAN Synthetic Data
normal_threshold = .02

# One vectorised comparison replaces a whole-column replace() per row, which
# is quadratic in the number of sampled rows.
# Values at or above the threshold are attack traffic, the rest normal.
ctgan_is_attack = ctGanSynthesizer_synthetic_train_data['41'] >= normal_threshold
attack_traffic_count = int(ctgan_is_attack.sum())
normal_traffic_count = len(ctgan_is_attack) - attack_traffic_count

if 'ctGanSynthesizer_synthetic_train_data.txt' not in os.listdir():
    # Overwrite the class column with 1.0 (attack) / 0.0 (normal) and cache the frame.
    ctGanSynthesizer_synthetic_train_data['41'] = ctgan_is_attack.astype('float64')
    print('\n\nctGanSynthesizer_synthetic_train_data classes:')
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')
    print(ctGanSynthesizer_synthetic_train_data['41'])
    ctGanSynthesizer_synthetic_train_data.to_csv('ctGanSynthesizer_synthetic_train_data.txt')
else:
    # The frame was loaded from the cache; only report the class counts.
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')


In [None]:
# coupla GAN Synthetic Data
binarize_now = 'couplaGanSynthesizer_synthetic_train_data.txt' not in os.listdir()

# NOTE(review): fresh samples are split at .06 while the cached path counts
# with .02 -- both values are kept as they were; confirm the difference is intended.
normal_threshold = .06 if binarize_now else .02

# One vectorised comparison replaces a whole-column replace() per row, which
# is quadratic in the number of sampled rows.
# Values at or above the threshold are attack traffic, the rest normal.
copula_is_attack = couplaGanSynthesizer_synthetic_train_data['41'] >= normal_threshold
attack_traffic_count = int(copula_is_attack.sum())
normal_traffic_count = len(copula_is_attack) - attack_traffic_count

if binarize_now:
    # Overwrite the class column with 1.0 (attack) / 0.0 (normal) and cache the frame.
    couplaGanSynthesizer_synthetic_train_data['41'] = copula_is_attack.astype('float64')
    print('\n\ncouplaGanSynthesizer_synthetic_train_data classes:')
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')
    print(couplaGanSynthesizer_synthetic_train_data['41'])
    #print(normal_traffic_count, attack_traffic_count)
    couplaGanSynthesizer_synthetic_train_data.to_csv('couplaGanSynthesizer_synthetic_train_data.txt')
else:
    # The frame was loaded from the cache; only report the class counts.
    print(f'normal_traffic: {normal_traffic_count}, attack traffic: {attack_traffic_count}')


### Feature Selection

In [None]:
# List of all curr datasets
train_dataframes = [real_train_data, ctGanSynthesizer_synthetic_train_data, couplaGanSynthesizer_synthetic_train_data]

# Row counts are read from the frames so they stay correct when the
# de-duplication or the sample size changes.
real_train_data_rows = len(real_train_data)
ctGanSynthesizer_synthetic_train_data_rows = len(ctGanSynthesizer_synthetic_train_data)
couplaGanSynthesizer_synthetic_train_data_rows = len(couplaGanSynthesizer_synthetic_train_data)

real_train_data_columns = 42 #does not include class column = total_col - 1
ctGanSynthesizer_synthetic_train_data_columns = 42
couplaGanSynthesizer_synthetic_train_data_columns = 42

# Init target (placeholders; the list entries are replaced in the loop below)
real_train_data_training_targets = torch.zeros(1, real_train_data_rows, dtype=torch.double)
ctGanSynthesizer_synthetic_train_data_training_targets = torch.zeros(1, ctGanSynthesizer_synthetic_train_data_rows, dtype=torch.double)
couplaGanSynthesizer_synthetic_train_data_training_targets = torch.zeros(1, couplaGanSynthesizer_synthetic_train_data_rows, dtype=torch.double)
target_tensors = [real_train_data_training_targets, ctGanSynthesizer_synthetic_train_data_training_targets, couplaGanSynthesizer_synthetic_train_data_training_targets]

# List of features intended to be dropped
# keep: 1, 2, 3, 11, 22, 24, 25, 28, 31, 32, 37, 38
feature_cols_to_drop = [42, 40, 39, 36, 35, 34, 33, 30, 29, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 10, 9, 8, 7, 6, 5, 4, 0]
real_train_data_cols_to_drop = list(feature_cols_to_drop)
ctGanSynthesizer_synthetic_train_data_cols_to_drop = list(feature_cols_to_drop)
couplaGanSynthesizer_synthetic_train_data_cols_to_drop = list(feature_cols_to_drop)
cols_to_drop = [real_train_data_cols_to_drop, ctGanSynthesizer_synthetic_train_data_cols_to_drop, couplaGanSynthesizer_synthetic_train_data_cols_to_drop]

# Init data tensors (placeholders; the list entries are replaced in the loop below)
real_train_data_training_samples = torch.zeros(real_train_data_columns-len(cols_to_drop[0]), real_train_data_rows, dtype=torch.double)
ctGanSynthesizer_synthetic_train_data_training_samples = torch.zeros(ctGanSynthesizer_synthetic_train_data_columns-len(cols_to_drop[1]), ctGanSynthesizer_synthetic_train_data_rows, dtype=torch.double)
couplaGanSynthesizer_synthetic_train_data_training_samples = torch.zeros(couplaGanSynthesizer_synthetic_train_data_columns-len(cols_to_drop[2]), couplaGanSynthesizer_synthetic_train_data_rows, dtype=torch.double)
sample_tensors = [real_train_data_training_samples, ctGanSynthesizer_synthetic_train_data_training_samples, couplaGanSynthesizer_synthetic_train_data_training_samples]

for i, df in enumerate(train_dataframes):
    # Frames may carry integer or string column labels. Check for the integer
    # label explicitly rather than catching every exception, so unrelated
    # errors are not silently swallowed.
    if 41 in df.columns:
        class_col_val = 41
    else:
        class_col_val = '41'
        cols_to_drop[i] = [str(j) for j in cols_to_drop[i]]
    # insert class values into the target tensor list
    numpy_array = df[class_col_val].values
    target_tensors[i] = torch.from_numpy(numpy_array).double()
    # Drop classification column
    df_after_drop = df.drop(columns=[class_col_val])
    # Drop rest of cols in cols_to_drop
    df_after_drop = df_after_drop.drop(columns=cols_to_drop[i])
    # insert remaining col values into the sample tensor list
    numpy_array = df_after_drop.values
    sample_tensors[i] = torch.from_numpy(numpy_array).double()

In [None]:
# Print size and contents of each dataset's target tensor and sample tensor.
for i, (targets, samples) in enumerate(zip(target_tensors, sample_tensors)):
    print(i, '-----------------------------------------------')
    print(targets.size())
    print(targets)

    print(samples.size())
    print(samples)

## DNN 

### Create DNN Model

In [None]:
import torch.nn as nn
import torch.optim as optim

# Model Architecture
class SimpleDNN(nn.Module):
    """Three-layer fully connected binary classifier that outputs one logit per row.

    Layout: in_features -> 64 -> 32 -> 1, with sigmoid activations and 0.5
    dropout after each of the two hidden layers. No activation is applied to
    the final layer, so the output is a raw logit.
    """

    def __init__(self, in_features=None):
        """Build the layers.

        Args:
            in_features: number of input features. When omitted it is derived
                from the notebook globals as
                ``real_train_data_columns - len(cols_to_drop[0])``.
        """
        super().__init__()
        if in_features is None:
            in_features = real_train_data_columns-len(cols_to_drop[0])
        self.flatten = nn.Flatten()
        self.dense1 = nn.Linear(in_features, 64)
        self.dropout1 = nn.Dropout(0.5)
        self.dense2 = nn.Linear(64, 32)
        self.dropout2 = nn.Dropout(0.5)
        self.dense3 = nn.Linear(32, 1)

    def forward(self, x):
        """Return the logits for a batch ``x``; the result has one column."""
        x = self.flatten(x)
        x = torch.sigmoid(self.dense1(x))
        x = self.dropout1(x)
        x = torch.sigmoid(self.dense2(x))
        x = self.dropout2(x)
        x = self.dense3(x)
        #debugging(x)
        return x
def debugging(x):
    """Print ``x``; debugging aid for inspecting intermediate tensors."""
    print(x)

# One independent discriminator per training set. The construction order is
# kept because each SimpleDNN() draws its initial weights from the global RNG.
dnn_discriminator_real_data = SimpleDNN()
dnn_discriminator_ctGan_synthetic_data = SimpleDNN()
dnn_discriminator_couplaGan_synthetic_data = SimpleDNN()


# Loss function and optimizer, grouped per discriminator
dnn_discriminator_real_data_criterion = nn.BCEWithLogitsLoss()
dnn_discriminator_real_data_optimizer = optim.Adam(dnn_discriminator_real_data.parameters())

dnn_discriminator_ctGan_synthetic_data_criterion = nn.BCEWithLogitsLoss()
dnn_discriminator_ctGan_synthetic_data_optimizer = optim.Adam(dnn_discriminator_ctGan_synthetic_data.parameters())

dnn_discriminator_couplaGan_synthetic_data_criterion = nn.BCEWithLogitsLoss()
dnn_discriminator_couplaGan_synthetic_data_optimizer = optim.Adam(dnn_discriminator_couplaGan_synthetic_data.parameters())

### Train Model on Data

In [None]:
# Train DNN on Real Data
if 'dnn_discriminator_real_data' in os.listdir():
    # Saved weights exist: rebuild the network and load them instead of training.
    dnn_discriminator_real_data = SimpleDNN()
    dnn_discriminator_real_data.load_state_dict(torch.load('dnn_discriminator_real_data'))
    dnn_discriminator_real_data.eval()
else:
    dnn_discriminator_real_data.train()
    epochs = 100000
    loss_limit = .17
    input_data = sample_tensors[0].double()
    # The float cast of the inputs and the column reshape of the targets do not
    # change between epochs, so both are done once before the loop.
    model_input = input_data.float()
    target = target_tensors[0].double().view(-1, 1)
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        # Forward pass over the full training set
        output = dnn_discriminator_real_data(model_input)

        # Calculate loss; stop as soon as it drops below the limit
        loss = dnn_discriminator_real_data_criterion(output, target)
        print(f'curr_loss: {loss.item()}')
        if loss.item() < loss_limit:
            break

        # Backward pass
        dnn_discriminator_real_data_optimizer.zero_grad()
        loss.backward()

        # Update weights
        dnn_discriminator_real_data_optimizer.step()

    torch.save(dnn_discriminator_real_data.state_dict(), 'dnn_discriminator_real_data')
    # Finish in eval mode, as the load branch does, so dropout is disabled
    # whichever branch produced the model.
    dnn_discriminator_real_data.eval()

In [None]:
# Train DNN on ct Gan Synthetic Data
if 'dnn_discriminator_ctGan_synthetic_data' in os.listdir():
    # Saved weights exist: rebuild the network and load them instead of training.
    dnn_discriminator_ctGan_synthetic_data = SimpleDNN()
    dnn_discriminator_ctGan_synthetic_data.load_state_dict(torch.load('dnn_discriminator_ctGan_synthetic_data'))
    dnn_discriminator_ctGan_synthetic_data.eval()
else:
    dnn_discriminator_ctGan_synthetic_data.train()
    epochs = 100000
    loss_limit = .17
    input_data = sample_tensors[1].double()
    # The float cast of the inputs and the column reshape of the targets do not
    # change between epochs, so both are done once before the loop.
    model_input = input_data.float()
    target = target_tensors[1].double().view(-1, 1)
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        # Forward pass over the full training set
        output = dnn_discriminator_ctGan_synthetic_data(model_input)

        # Calculate loss; stop as soon as it drops below the limit
        loss = dnn_discriminator_ctGan_synthetic_data_criterion(output, target)
        print(f'curr_loss: {loss.item()}')
        if loss.item() < loss_limit:
            break

        # Backward pass
        dnn_discriminator_ctGan_synthetic_data_optimizer.zero_grad()
        loss.backward()

        # Update weights
        dnn_discriminator_ctGan_synthetic_data_optimizer.step()

    torch.save(dnn_discriminator_ctGan_synthetic_data.state_dict(), 'dnn_discriminator_ctGan_synthetic_data')
    # Finish in eval mode, as the load branch does, so dropout is disabled
    # whichever branch produced the model.
    dnn_discriminator_ctGan_synthetic_data.eval()

In [None]:
#Train DNN on coupla Gan Synthetic Data
if 'dnn_discriminator_couplaGan_synthetic_data' in os.listdir():
    # Saved weights exist: rebuild the network and load them instead of training.
    dnn_discriminator_couplaGan_synthetic_data = SimpleDNN()
    dnn_discriminator_couplaGan_synthetic_data.load_state_dict(torch.load('dnn_discriminator_couplaGan_synthetic_data'))
    dnn_discriminator_couplaGan_synthetic_data.eval()
else:
    dnn_discriminator_couplaGan_synthetic_data.train()
    epochs = 100000
    loss_limit = .17
    input_data = sample_tensors[2].double()
    # The float cast of the inputs and the column reshape of the targets do not
    # change between epochs, so both are done once before the loop.
    model_input = input_data.float()
    target = target_tensors[2].double().view(-1, 1)
    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        # Forward pass over the full training set
        output = dnn_discriminator_couplaGan_synthetic_data(model_input)

        # Calculate loss; stop as soon as it drops below the limit
        loss = dnn_discriminator_couplaGan_synthetic_data_criterion(output, target)
        print(f'curr_loss: {loss.item()}')
        if loss.item() < loss_limit:
            break

        # Backward pass
        dnn_discriminator_couplaGan_synthetic_data_optimizer.zero_grad()
        loss.backward()

        # Update weights
        dnn_discriminator_couplaGan_synthetic_data_optimizer.step()

    torch.save(dnn_discriminator_couplaGan_synthetic_data.state_dict(), 'dnn_discriminator_couplaGan_synthetic_data')
    # Finish in eval mode, as the load branch does, so dropout is disabled
    # whichever branch produced the model.
    dnn_discriminator_couplaGan_synthetic_data.eval()

## CNN (DOES NOT WORK DO NOT RUN)

### Create CNN Model

In [None]:
import torch
import torch.nn as nn

input_channels = 16
class Simple1DCNN(nn.Module):
    """Two 1-D convolutions, a global max pool and a linear layer giving one logit.

    conv1 maps ``input_channels`` channels to 32, conv2 maps 32 to 64 (both
    kernel 3, stride 1, padding 1); the result is max-pooled over its whole
    length and passed through a 64 -> 1 linear layer.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(input_channels, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64, 1)

    def forward(self, x):
        """Run the network on ``x`` and return the logits."""
        # NOTE(review): unsqueeze(0) turns a 2-D (input_channels, length) tensor
        # into a single sample with input_channels channels, so one call yields
        # one (1, 1) logit -- confirm this is the intended batching.
        x = x.unsqueeze(0)
        # nn.functional is reachable through the `nn` import of this cell, so
        # the class does not rely on an `F` alias imported in a later cell.
        x = nn.functional.relu(self.conv1(x))
        x = nn.functional.relu(self.conv2(x))
        # Global max pool over the length dimension, then drop that dimension.
        x = nn.functional.max_pool1d(x, x.size(2)).squeeze(2)
        x = self.fc1(x)
        return x
    

# One CNN discriminator per training set (construction order kept, since each
# instance draws its initial weights from the global RNG).
cnn_discriminator_real_data = Simple1DCNN()
cnn_discriminator_ctGan_synthetic_data = Simple1DCNN()


# Loss function and optimizer, grouped per discriminator
cnn_discriminator_real_data_criterion = nn.BCEWithLogitsLoss()
cnn_discriminator_real_data_optimizer = optim.Adam(cnn_discriminator_real_data.parameters())

cnn_discriminator_ctGan_synthetic_data_criterion = nn.BCEWithLogitsLoss()
cnn_discriminator_ctGan_synthetic_data_optimizer = optim.Adam(cnn_discriminator_ctGan_synthetic_data.parameters())

#create custom dataset
from torch.utils.data import Dataset

class CustomCNNDataset(Dataset):
    """Pairs each input row with its target and serves both as float64 tensors."""

    def __init__(self, inputs, targets):
        self.inputs, self.targets = inputs, targets

    def __len__(self):
        # The dataset length is the number of input rows.
        return len(self.inputs)

    def __getitem__(self, index):
        # Return the (input, target) pair at `index`, both cast to double.
        return self.inputs[index].double(), self.targets[index].double()

### Train Model on Data

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
# NOTE: this changes the process-wide default dtype, so every floating point
# tensor or module created afterwards is float64.
torch.set_default_dtype(torch.float64)

# Train CNN on Real Data
if 'cnn_discriminator_real_data' in os.listdir():
    # Saved weights exist: rebuild the network and load them instead of training.
    cnn_discriminator_real_data = Simple1DCNN()
    cnn_discriminator_real_data.load_state_dict(torch.load('cnn_discriminator_real_data'))
    cnn_discriminator_real_data.eval()
else:
    epochs = 100000
    loss_limit = .17
    batch_size = input_channels

    cnn_discriminator_real_data = Simple1DCNN()
    cnn_discriminator_real_data.train()
    criterion = cnn_discriminator_real_data_criterion
    # The model was re-created just above, so the optimizer has to be rebuilt
    # around the new parameters. An optimizer created for an earlier instance
    # would keep updating weights that are no longer used.
    cnn_discriminator_real_data_optimizer = optim.Adam(cnn_discriminator_real_data.parameters())
    optimizer = cnn_discriminator_real_data_optimizer

    input_data = sample_tensors[0].double()
    target = target_tensors[0].double()

    custom_dataset = CustomCNNDataset(input_data, target)
    train_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True, drop_last=True)


    for epoch in range(epochs):
        print(f'epoch: {epoch}')
        total_loss = 0.0
        num_loss_items = 0
        for inputs, targets in train_loader:
            inputs = inputs.double()
            outputs = cnn_discriminator_real_data(inputs)
            #targets = targets.view(-1, 1)
            loss = criterion(outputs, targets)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            # Update weights
            optimizer.step()

            # Accumulate a plain float: summing the loss tensors themselves
            # would keep every batch's autograd graph alive for the epoch.
            total_loss += loss.item()
            num_loss_items += 1

        # Mean loss over this epoch's batches; stop once it drops below the limit.
        epoch_avg_loss = total_loss/num_loss_items
        print(f'last batch avg loss = {epoch_avg_loss}')
        if epoch_avg_loss < loss_limit:
            break
    torch.save(cnn_discriminator_real_data.state_dict(), 'cnn_discriminator_real_data')

# Results

## Get Testing Data

In [None]:
import pandas as pd

#load data from file
test_data = pd.read_csv('KDDTest+.txt', header=None) #, names=col_name_lis
easy_test_data = pd.read_csv('KDDTest-21.txt', header=None)

# drop_duplicates() returns a new frame, so the result must be assigned back
# for the de-duplication to take effect. ignore_index keeps a contiguous
# 0..n-1 row index afterwards.
test_data = test_data.drop_duplicates(ignore_index=True)
easy_test_data = easy_test_data.drop_duplicates(ignore_index=True)

#print
print(test_data)
print(easy_test_data)


## Prepare Test Data

### Numericalize

In [None]:
# Col Dictionaries
#attack_types = get_unique_keys_and_enumerate_dict(gan_train_data, 41)
#attack_types_binary = attack_types.copy()
#for key in attack_types_binary:
#    if key != 'normal':
#        attack_types_binary[key] = 1
#print(attack_types_binary)
#print('\n')
#print('\n')
#print('\n')

#protocol_types = get_unique_keys_and_enumerate_dict(gan_train_data, 1)
#service = get_unique_keys_and_enumerate_dict(gan_train_data, 2)
#flag =  get_unique_keys_and_enumerate_dict(gan_train_data, 3)


In [None]:
# Numericalize protocol_types (column 1) in both test sets
for frame in (test_data, easy_test_data):
    column = frame[1]
    # Report each distinct replacement once instead of one line per row.
    # A label missing from protocol_types raises KeyError here.
    for label in dict.fromkeys(column):
        if type(label) not in (int, float):
            print(f"Change {label} to {protocol_types[label]}")
    # Single pass over the column instead of one whole-column replace() per row.
    # Entries that are already plain int/float are kept unchanged.
    frame[1] = column.map(
        lambda value: value if type(value) in (int, float) else protocol_types[value]
    )

In [None]:
# Numericalize service (column 2) in both test sets
for frame in (test_data, easy_test_data):
    column = frame[2]
    # Report each distinct replacement once instead of one line per row.
    # A label missing from service raises KeyError here.
    for label in dict.fromkeys(column):
        if type(label) not in (int, float):
            print(f"Change {label} to {service[label]}")
    # Single pass over the column instead of one whole-column replace() per row.
    # Entries that are already plain int/float are kept unchanged.
    frame[2] = column.map(
        lambda value: value if type(value) in (int, float) else service[value]
    )

In [None]:
# Numericalize flag (column 3) in both test sets
for frame in (test_data, easy_test_data):
    column = frame[3]
    # Report each distinct replacement once instead of one line per row.
    # A label missing from flag raises KeyError here.
    for label in dict.fromkeys(column):
        if type(label) not in (int, float):
            print(f"Change {label} to {flag[label]}")
    # Single pass over the column instead of one whole-column replace() per row.
    # Entries that are already plain int/float are kept unchanged.
    frame[3] = column.map(
        lambda value: value if type(value) in (int, float) else flag[value]
    )

In [None]:
# Enumerate the distinct class labels of test_data, then derive the binary
# view of them: 'normal' -> 0, every other label -> 1.
test_attack_types = get_unique_keys_and_enumerate_dict(test_data, 41)
test_attack_types_binary = {
    label: (0 if label == 'normal' else 1) for label in test_attack_types
}
print(test_attack_types_binary)

In [None]:
# Numericalize class (column 41) in both test sets using the binary label codes
for frame in (test_data, easy_test_data):
    column = frame[41]
    # Report each distinct replacement once instead of one line per row.
    # A label missing from test_attack_types_binary raises KeyError here.
    for label in dict.fromkeys(column):
        if type(label) not in (int, float):
            print(f"Change {label} to {test_attack_types_binary[label]}")
    # Single pass over the column instead of one whole-column replace() per row.
    # Entries that are already plain int/float are kept unchanged.
    frame[41] = column.map(
        lambda value: value if type(value) in (int, float) else test_attack_types_binary[value]
    )

In [None]:
# Enforce float64 datatype for all columns data
# Each column is converted and assigned back. A bare `x = x` assignment, or an
# astype() call whose result is not stored, leaves the dtypes unchanged.
for col in test_data.columns:
    test_data[col] = test_data[col].astype('float64')

# Enforce float64 datatype for all columns data
for col in easy_test_data.columns:
    easy_test_data[col] = easy_test_data[col].astype('float64')

# Show the converted frame as the cell output.
easy_test_data

### Normalize

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
col_to_exclude = 'nevermind dont exclude'

# Min-max scale every column of each test set; the scaler is re-fitted per
# column on that set's own values. A column equal to col_to_exclude is skipped.
for frame in (test_data, easy_test_data):
    for col in frame:
        if col != col_to_exclude:
            frame[col] = scaler.fit_transform(frame[[col]])

### Feature Selection

In [None]:
# List of all curr datasets
test_dataframes = [test_data, easy_test_data]

# Row counts are read from the frames so they stay correct if the
# de-duplication changes the number of rows.
test_data_rows = len(test_data)
test_data_columns = 42 #does not include class column = total_col - 1

easy_test_data_rows = len(easy_test_data)
easy_test_data_columns = 42 #does not include class column = total_col - 1

# Init target tensors (placeholders; the list entries are replaced below)
test_data_targets = torch.zeros(1, test_data_rows, dtype=torch.double)
easy_test_data_targets = torch.zeros(1, easy_test_data_rows, dtype=torch.double)

testing_target_tensors = [test_data_targets, easy_test_data_targets]

# List of features intended to be dropped
test_data_inputs_cols_to_drop = [42, 40, 39, 36, 35, 34, 33, 30, 29, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 10, 9, 8, 7, 6, 5, 4, 0]
easy_test_data_inputs_cols_to_drop = list(test_data_inputs_cols_to_drop)

cols_to_drop = [test_data_inputs_cols_to_drop, easy_test_data_inputs_cols_to_drop]

# Init data tensors (placeholders; the list entries are replaced below)
test_data_inputs = torch.zeros(test_data_columns-len(test_data_inputs_cols_to_drop), test_data_rows, dtype=torch.double)
easy_test_data_inputs = torch.zeros(easy_test_data_columns-len(easy_test_data_inputs_cols_to_drop), easy_test_data_rows, dtype=torch.double)

testing_input_tensors = [test_data_inputs, easy_test_data_inputs]

# --------------------------------------------------------------------------------------------------

class_col_name = 41
for i, df in enumerate(test_dataframes):
    # class values become the target tensor
    numpy_array = df[class_col_name].values
    testing_target_tensors[i] = torch.from_numpy(numpy_array).double()
    # Drop classification column
    df_after_drop = df.drop(columns=[class_col_name])
    # Drop rest of cols in cols_to_drop
    df_after_drop = df_after_drop.drop(columns=cols_to_drop[i])
    # remaining col values become the input tensor
    numpy_array = df_after_drop.values
    testing_input_tensors[i] = torch.from_numpy(numpy_array).double()

# --------------------------------------------------------------------------------------------------

# Binarise the targets: values above the cutoff are attacks (1.0), the rest
# normal (0.0). The loop variable is deliberately NOT called target_tensors,
# so the training target list of that name is left intact.
label_cutoff = .015 #value.item() > .10 and value.item() < .12
for i, label_tensor in enumerate(testing_target_tensors):
    binary_labels = (label_tensor > label_cutoff).double()
    testing_target_tensors[i] = binary_labels
    attack = int(binary_labels.sum().item())
    normal = binary_labels.numel() - attack
    print(f'attack: {attack}, normal: {normal}')
    #print(target_tensors)


## Model Trained with Real Data Results

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the DNN trained on real data against both held-out test sets, print
# the metrics and persist them to one text file per test set.

# Put the model in evaluation mode before predicting.
dnn_discriminator_real_data.eval()

# ---------------------------------- Testing data ----------------------------------
print('Testing Data')
true_class = testing_target_tensors[0]
with torch.no_grad():
    dnn_scores = dnn_discriminator_real_data(testing_input_tensors[0].float())
# Scores strictly above the cut-off are labelled 1.0, everything else 0.0.
dnn_pred = (dnn_scores > 0.015).float()

(dnn_test_data_accuracy,
 dnn_test_data_precision,
 dnn_test_data_recall,
 dnn_test_data_f1,
 dnn_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

# Build the report once; the same lines go to stdout and to the results file.
test_data_report = [
    f'accuracy: {dnn_test_data_accuracy}',
    f'precision: {dnn_test_data_precision}',
    f'recall: {dnn_test_data_recall}',
    f'f1: {dnn_test_data_f1}',
    f'conf_matrix: {dnn_test_data_conf_matrix}',
]
print('\n'.join(test_data_report))

# Save results
with open('dnn_real_data_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in test_data_report)


# -------------------------------- Easy testing data --------------------------------
print('Easy Testing Data')
true_class = testing_target_tensors[1]
with torch.no_grad():
    dnn_scores = dnn_discriminator_real_data(testing_input_tensors[1].float())
# NOTE(review): this cut-off is 0.5 while the 'Testing Data' run above uses
# 0.015 - confirm which decision threshold is intended.
dnn_pred = (dnn_scores > 0.5).float()

(dnn_easy_test_data_accuracy,
 dnn_easy_test_data_precision,
 dnn_easy_test_data_recall,
 dnn_easy_test_data_f1,
 dnn_easy_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

easy_test_data_report = [
    f'accuracy: {dnn_easy_test_data_accuracy}',
    f'precision: {dnn_easy_test_data_precision}',
    f'recall: {dnn_easy_test_data_recall}',
    f'f1: {dnn_easy_test_data_f1}',
    f'conf_matrix: {dnn_easy_test_data_conf_matrix}',
]
print('\n'.join(easy_test_data_report))

# Save results (only the metrics are written; the model itself is not saved here)
with open('dnn_real_data_easy_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in easy_test_data_report)

# For the RNN, the same evaluation applies with dnn_pred replaced by rnn_pred.

## Mixed Data Results

### CT GAN

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the DNN trained with CTGAN synthetic data against both held-out test
# sets, print the metrics and persist them to one text file per test set.

# Put the model in evaluation mode before predicting.
dnn_discriminator_ctGan_synthetic_data.eval()

# ---------------------------------- Testing data ----------------------------------
print('Testing Data')
true_class = testing_target_tensors[0]
with torch.no_grad():
    dnn_scores = dnn_discriminator_ctGan_synthetic_data(testing_input_tensors[0].float())
# Scores strictly above the cut-off are labelled 1.0, everything else 0.0.
dnn_pred = (dnn_scores > 0.015).float()

(dnn_test_data_accuracy,
 dnn_test_data_precision,
 dnn_test_data_recall,
 dnn_test_data_f1,
 dnn_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

# Build the report once; the same lines go to stdout and to the results file.
test_data_report = [
    f'accuracy: {dnn_test_data_accuracy}',
    f'precision: {dnn_test_data_precision}',
    f'recall: {dnn_test_data_recall}',
    f'f1: {dnn_test_data_f1}',
    f'conf_matrix: {dnn_test_data_conf_matrix}',
]
print('\n'.join(test_data_report))

# Save results
with open('dnn_ctGan_synthetic_data_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in test_data_report)


# -------------------------------- Easy testing data --------------------------------
print('Easy Testing Data')
true_class = testing_target_tensors[1]
with torch.no_grad():
    dnn_scores = dnn_discriminator_ctGan_synthetic_data(testing_input_tensors[1].float())
dnn_pred = (dnn_scores > 0.015).float()

(dnn_easy_test_data_accuracy,
 dnn_easy_test_data_precision,
 dnn_easy_test_data_recall,
 dnn_easy_test_data_f1,
 dnn_easy_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

easy_test_data_report = [
    f'accuracy: {dnn_easy_test_data_accuracy}',
    f'precision: {dnn_easy_test_data_precision}',
    f'recall: {dnn_easy_test_data_recall}',
    f'f1: {dnn_easy_test_data_f1}',
    f'conf_matrix: {dnn_easy_test_data_conf_matrix}',
]
print('\n'.join(easy_test_data_report))

# Save results (only the metrics are written; the model itself is not saved here)
with open('dnn_ctGan_synthetic_data_easy_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in easy_test_data_report)

# For the RNN, the same evaluation applies with dnn_pred replaced by rnn_pred.

### Copula GAN

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the DNN trained with Copula GAN synthetic data against both held-out
# test sets, print the metrics and persist them to one text file per test set.

# Put the model in evaluation mode before predicting.
dnn_discriminator_couplaGan_synthetic_data.eval()

# ---------------------------------- Testing data ----------------------------------
print('Testing Data')
true_class = testing_target_tensors[0]
with torch.no_grad():
    dnn_scores = dnn_discriminator_couplaGan_synthetic_data(testing_input_tensors[0].float())
# Scores strictly above the cut-off are labelled 1.0, everything else 0.0.
dnn_pred = (dnn_scores > 0.015).float()

(dnn_test_data_accuracy,
 dnn_test_data_precision,
 dnn_test_data_recall,
 dnn_test_data_f1,
 dnn_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

# Build the report once; the same lines go to stdout and to the results file.
test_data_report = [
    f'accuracy: {dnn_test_data_accuracy}',
    f'precision: {dnn_test_data_precision}',
    f'recall: {dnn_test_data_recall}',
    f'f1: {dnn_test_data_f1}',
    f'conf_matrix: {dnn_test_data_conf_matrix}',
]
print('\n'.join(test_data_report))

# Save results
with open('dnn_couplaGan_synthetic_data_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in test_data_report)


# -------------------------------- Easy testing data --------------------------------
print('Easy Testing Data')
true_class = testing_target_tensors[1]
with torch.no_grad():
    dnn_scores = dnn_discriminator_couplaGan_synthetic_data(testing_input_tensors[1].float())
dnn_pred = (dnn_scores > 0.015).float()

(dnn_easy_test_data_accuracy,
 dnn_easy_test_data_precision,
 dnn_easy_test_data_recall,
 dnn_easy_test_data_f1,
 dnn_easy_test_data_conf_matrix) = (
    metric(true_class, dnn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)
)

easy_test_data_report = [
    f'accuracy: {dnn_easy_test_data_accuracy}',
    f'precision: {dnn_easy_test_data_precision}',
    f'recall: {dnn_easy_test_data_recall}',
    f'f1: {dnn_easy_test_data_f1}',
    f'conf_matrix: {dnn_easy_test_data_conf_matrix}',
]
print('\n'.join(easy_test_data_report))

# Save results (only the metrics are written; the model itself is not saved here)
with open('dnn_couplaGan_synthetic_data_easy_test_data_results.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in easy_test_data_report)

# For the RNN, the same evaluation applies with dnn_pred replaced by rnn_pred.

# References

Papers: 

[[DRL-GAN A Hybrid Approach for Binary and Multiclass Network Intrusion Detection - Jan 2023.pdf]]

[[AN EFFICIENT DEEP LEARNING APPROACH FOR NETWORK INTRUSION DETECTION SYSTEM ON SOFTWARE DEFINED NETWORK - Jul 2022.pdf]]

Dataset: 

[NSL-KDD HTML Readme](https://storage.googleapis.com/kagglesdsdata/datasets/174616/394223/index.html?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20231208%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20231208T043702Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=8779b5b417b57e7b927a4fc84c28a0c7c06cc76a68af5eb4c57bb27afe04f6d3344e20f7c99faafcef2605d925a8798e2d7a46a8452822f87b8b08c828d46ea121d58c7a766a936522deaaae69c1dcfa863dde452cce8a18466fcd8e9d77618fcc76a361ec656ed0717ea833554b8441c4a6dbd812d70fc5201796149fe7a41a42033d5e86bed7e1c57fdaca3f7762897b97a6e4d5c054e399de79425287b538e05ccdc75ce8344fd996be6f56bd4914dacb25c45faccdd3bcab6eb9fe2300cf62352882432a5cac8fa7726edc6d334e9b88bed36596489bbc32cb8ace309d31ee137a2ad20c28bc7c38d758517422deaa12f92e9836fe571c9b189b1ba0b95b)

[NSL-KDD Kaggle Datacard](https://www.kaggle.com/datasets/hassan06/nslkdd?select=KDDTrain%2B.txt)

[SKLearn Preprocessing library Normalization MinMaxScaler Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

General Models:

[Building Models with PyTorch — Official PyTorch YouTube Tutorial](https://pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html)

[Constructing and Training a Deep Learning Model — PyTorch Docs](https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)

[torch.nn Documentation](https://pytorch.org/docs/stable/nn.html#linear-layers)

[pandas.DataFrame Documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

[numpy.ndarray Documentation](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html)

GAN:

[SDV CT GAN Documentation](https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers/ctgansynthesizer)

[SDV Copula GAN Documentation](https://docs.sdv.dev/sdv/single-table-data/modeling/synthesizers/copulagansynthesizer)

[Conditional sampling SDV Documentation](https://docs.sdv.dev/sdv/single-table-data/sampling/conditional-sampling)

[SingleTable Metadata Documentation](https://docs.sdv.dev/sdv/single-table-data/data-preparation/single-table-metadata-api)

Other: 

[Jupyter notebooks shortcuts](https://noteable.io/blog/jupyter-notebook-shortcuts-boost-productivity/)
