# Install / Import

In [1]:
!pip install ctgan

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: C:\Users\isund\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
# import from packages

from ctgan import CTGAN
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


import numpy as np
import pandas as pd

# Dataset

In [3]:
# Read the raw CSV (resolved relative to the notebook's working directory) into NIDS_DF.
NIDS_DF = pd.read_csv(filepath_or_buffer="NF-UQ-NIDS-v2.csv")
# Last expression of the cell: render the first five rows as a preview.
NIDS_DF.head(n=5)


Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,L7_PROTO,IN_BYTES,IN_PKTS,OUT_BYTES,OUT_PKTS,...,TCP_WIN_MAX_OUT,ICMP_TYPE,ICMP_IPV4_TYPE,DNS_QUERY_ID,DNS_QUERY_TYPE,DNS_TTL_ANSWER,FTP_COMMAND_RET_CODE,Label,Attack,Dataset
0,192.168.100.148,65389,192.168.100.7,80,6,7.0,420,3,0,0,...,0,35840,140,0,0,0,0.0,1,DoS,NF-BoT-IoT-v2
1,192.168.100.148,11154,192.168.100.5,80,6,7.0,280,2,40,1,...,0,0,0,0,0,0,0.0,1,DoS,NF-BoT-IoT-v2
2,192.168.1.31,42062,192.168.1.79,1041,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2
3,192.168.1.34,46849,192.168.1.79,9110,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2
4,192.168.1.30,50360,192.168.1.152,1084,6,0.0,44,1,40,1,...,0,0,0,0,0,0,0.0,0,Benign,NF-ToN-IoT-v2


In [4]:
# Print the schema summary of the freshly loaded frame (row count, column names, dtypes).
NIDS_DF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75987976 entries, 0 to 75987975
Data columns (total 46 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   IPV4_SRC_ADDR                object 
 1   L4_SRC_PORT                  int64  
 2   IPV4_DST_ADDR                object 
 3   L4_DST_PORT                  int64  
 4   PROTOCOL                     int64  
 5   L7_PROTO                     float64
 6   IN_BYTES                     int64  
 7   IN_PKTS                      int64  
 8   OUT_BYTES                    int64  
 9   OUT_PKTS                     int64  
 10  TCP_FLAGS                    int64  
 11  CLIENT_TCP_FLAGS             int64  
 12  SERVER_TCP_FLAGS             int64  
 13  FLOW_DURATION_MILLISECONDS   int64  
 14  DURATION_IN                  int64  
 15  DURATION_OUT                 int64  
 16  MIN_TTL                      int64  
 17  MAX_TTL                      int64  
 18  LONGEST_FLOW_PKT             int64  
 19

In [5]:
# Remove the source-address column from the frame (mutates NIDS_DF in place).
NIDS_DF.drop(labels='IPV4_SRC_ADDR', axis='columns', inplace=True)

# Discard every row that contains at least one missing value, also in place.
NIDS_DF.dropna(axis=0, how='any', inplace=True)

In [6]:
# Remove the destination-address column and the 'Dataset' column
# in a single in-place call.
NIDS_DF.drop(columns=['IPV4_DST_ADDR', 'Dataset'], inplace=True)

In [7]:
# Disabled diagnostics (nothing executes in this cell). Uncomment to print the schema
# summary, or to see for each column whether it holds at least one non-null value.
#NIDS_DF.info()
#NIDS_DF.notna().any()

In [8]:
# Disabled diagnostic: uncomment to count the rows per distinct value of 'Label'.
#NIDS_DF['Label'].value_counts()

In [9]:
# Disabled diagnostic: uncomment to count the rows per distinct value of 'Attack'.
#NIDS_DF['Attack'].value_counts()

In [10]:
# Integer-encode the 'Attack' labels; codes are assigned in order of first appearance.
attack_labels = NIDS_DF['Attack'].unique()
attack_to_int = dict(zip(attack_labels, range(len(attack_labels))))

# Add the numeric code column and an explicitly string-typed copy of the label.
NIDS_DF['Attack_int'] = NIDS_DF['Attack'].map(attack_to_int).astype('int64')
NIDS_DF['Attack_str'] = NIDS_DF['Attack'].astype('string')

# Inverse lookup (code -> label) for decoding later on.
int_to_attack = dict(zip(attack_to_int.values(), attack_to_int.keys()))

# Report the mapping and a small sample of the encoded rows.
print("Mapping (label -> int):", attack_to_int)
print(NIDS_DF[['Attack', 'Attack_int']].head())

Mapping (label -> int): {'DoS': 0, 'Benign': 1, 'scanning': 2, 'DDoS': 3, 'xss': 4, 'Bot': 5, 'Reconnaissance': 6, 'password': 7, 'Fuzzers': 8, 'injection': 9, 'Theft': 10, 'Brute Force': 11, 'Infilteration': 12, 'Exploits': 13, 'Generic': 14, 'Analysis': 15, 'Backdoor': 16, 'mitm': 17, 'Shellcode': 18, 'ransomware': 19, 'Worms': 20}
   Attack  Attack_int
0     DoS           0
1     DoS           0
2  Benign           1
3  Benign           1
4  Benign           1


In [11]:
# Make 'Benign' the class with code 0.
#
# The codes come from first-appearance order, so which label holds 0 depends on the
# row order of the input. Instead of hard-coding "DoS becomes 1" (which is only correct
# when 'DoS' and 'Benign' happened to be the first two labels seen, and otherwise makes
# two labels share a code), exchange codes between 'Benign' and whichever label
# currently holds 0. Re-running the cell is a no-op once 'Benign' is already 0.
benign_code = attack_to_int['Benign']  # KeyError here means the data has no 'Benign' label
zero_label = next(label for label, code in attack_to_int.items() if code == 0)
attack_to_int[zero_label] = benign_code
attack_to_int['Benign'] = 0

# Guard against a corrupted mapping: every label must keep a distinct code.
assert len(set(attack_to_int.values())) == len(attack_to_int), "duplicate attack codes"

# Update the reverse mapping
int_to_attack = {code: label for label, code in attack_to_int.items()}

# Recalculate Attack_int column with new mappings
NIDS_DF['Attack_int'] = NIDS_DF['Attack'].map(attack_to_int).astype('int64')

print("Updated Mapping (label -> int):", attack_to_int)
print(NIDS_DF[['Attack', 'Attack_int']].head())

Updated Mapping (label -> int): {'DoS': 1, 'Benign': 0, 'scanning': 2, 'DDoS': 3, 'xss': 4, 'Bot': 5, 'Reconnaissance': 6, 'password': 7, 'Fuzzers': 8, 'injection': 9, 'Theft': 10, 'Brute Force': 11, 'Infilteration': 12, 'Exploits': 13, 'Generic': 14, 'Analysis': 15, 'Backdoor': 16, 'mitm': 17, 'Shellcode': 18, 'ransomware': 19, 'Worms': 20}
   Attack  Attack_int
0     DoS           1
1     DoS           1
2  Benign           0
3  Benign           0
4  Benign           0


In [12]:
# Re-inspect the schema after the encoding cells added 'Attack_int' and 'Attack_str'.
NIDS_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75987976 entries, 0 to 75987975
Data columns (total 45 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   L4_SRC_PORT                  int64  
 1   L4_DST_PORT                  int64  
 2   PROTOCOL                     int64  
 3   L7_PROTO                     float64
 4   IN_BYTES                     int64  
 5   IN_PKTS                      int64  
 6   OUT_BYTES                    int64  
 7   OUT_PKTS                     int64  
 8   TCP_FLAGS                    int64  
 9   CLIENT_TCP_FLAGS             int64  
 10  SERVER_TCP_FLAGS             int64  
 11  FLOW_DURATION_MILLISECONDS   int64  
 12  DURATION_IN                  int64  
 13  DURATION_OUT                 int64  
 14  MIN_TTL                      int64  
 15  MAX_TTL                      int64  
 16  LONGEST_FLOW_PKT             int64  
 17  SHORTEST_FLOW_PKT            int64  
 18  MIN_IP_PKT_LEN               int64  
 19

In [14]:
# Drop the original 'Attack' label column in place; 'Attack_int' carries the encoded value.
# NOTE(review): 'Attack_str', the string copy created earlier, is not dropped here and
# therefore remains in the frame that gets exported - confirm this is intended.
NIDS_DF.drop(labels='Attack', axis='columns', inplace=True)

In [15]:
# Final schema check before export, now that the original 'Attack' column has been dropped.
NIDS_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75987976 entries, 0 to 75987975
Data columns (total 44 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   L4_SRC_PORT                  int64  
 1   L4_DST_PORT                  int64  
 2   PROTOCOL                     int64  
 3   L7_PROTO                     float64
 4   IN_BYTES                     int64  
 5   IN_PKTS                      int64  
 6   OUT_BYTES                    int64  
 7   OUT_PKTS                     int64  
 8   TCP_FLAGS                    int64  
 9   CLIENT_TCP_FLAGS             int64  
 10  SERVER_TCP_FLAGS             int64  
 11  FLOW_DURATION_MILLISECONDS   int64  
 12  DURATION_IN                  int64  
 13  DURATION_OUT                 int64  
 14  MIN_TTL                      int64  
 15  MAX_TTL                      int64  
 16  LONGEST_FLOW_PKT             int64  
 17  SHORTEST_FLOW_PKT            int64  
 18  MIN_IP_PKT_LEN               int64  
 19

In [16]:
# Write the processed frame to CSV in the working directory, omitting the row index,
# then print a confirmation message.
NIDS_DF.to_csv(path_or_buf='NIDS_DF_processed.csv', index=False)
print("Dataset exported to 'NIDS_DF_processed.csv'")

Dataset exported to 'NIDS_DF_processed.csv'
