In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load the datasets
train_data = pd.read_csv('UNSW_NB15_training-set.csv')
test_data = pd.read_csv('UNSW_NB15_test-set.csv')

#Display the first few rows
print(train_data.head())
print(test_data.head())



   id       dur proto service state  spkts  dpkts  sbytes  dbytes       rate  \
0   1  0.121478   tcp       -   FIN      6      4     258     172  74.087490   
1   2  0.649902   tcp       -   FIN     14     38     734   42014  78.473372   
2   3  1.623129   tcp       -   FIN      8     16     364   13186  14.170161   
3   4  1.681642   tcp     ftp   FIN     12     12     628     770  13.677108   
4   5  0.449454   tcp       -   FIN     10      6     534     268  33.373826   

   ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  ct_ftp_cmd  \
0  ...                 1               1             0           0   
1  ...                 1               2             0           0   
2  ...                 1               3             0           0   
3  ...                 1               3             1           1   
4  ...                 1              40             0           0   

   ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_ports  attack_cat  \
0                 0   

In [5]:
#check for missing value
rint(train_data.isnull().sum())
print(test_data.isnull().sum())


id                   0
dur                  0
proto                0
service              0
state                0
spkts                0
dpkts                0
sbytes               0
dbytes               0
rate                 0
sttl                 0
dttl                 0
sload                0
dload                0
sloss                0
dloss                0
sinpkt               0
dinpkt               0
sjit                 0
djit                 0
swin                 0
stcpb                0
dtcpb                0
dwin                 0
tcprtt               0
synack               0
ackdat               0
smean                0
dmean                0
trans_depth          0
response_body_len    0
ct_srv_src           0
ct_state_ttl         0
ct_dst_ltm           0
ct_src_dport_ltm     0
ct_dst_sport_ltm     0
ct_dst_src_ltm       0
is_ftp_login         0
ct_ftp_cmd           0
ct_flw_http_mthd     0
ct_src_ltm           0
ct_srv_dst           0
is_sm_ips_ports      0
attack_cat 

In [9]:
#Handle Missing Values: Drop any rows with missing values, as required
train_data = train_data.dropna()
test_data = test_data.dropna()

# Calculate how many rows were dropped
print(f'Training set: {initial_train_rows - final_train_rows} rows were dropped due to missing values.')
print(f'Testing set: {initial_test_rows - final_test_rows} rows were dropped due to missing values.')



Training set: 0 rows were dropped due to missing values.
Testing set: 0 rows were dropped due to missing values.


In [28]:
# Check if any missing values exist in the training and testing sets
print("Missing values in the training set:")
print(train_data.isnull().sum())

print("Missing values in the testing set:")
print(test_data.isnull().sum())


Missing values in the training set:
id                           0
dur                          0
spkts                        0
dpkts                        0
sbytes                       0
                            ..
attack_cat_Generic           0
attack_cat_Normal            0
attack_cat_Reconnaissance    0
attack_cat_Shellcode         0
attack_cat_Worms             0
Length: 200, dtype: int64
Missing values in the testing set:
id                           0
dur                          0
spkts                        0
dpkts                        0
sbytes                       0
                            ..
attack_cat_Generic           0
attack_cat_Normal            0
attack_cat_Reconnaissance    0
attack_cat_Shellcode         0
attack_cat_Worms             0
Length: 200, dtype: int64


In [13]:
# Handle Categorical Value (based on the project categorical values in training data may not match those in the test data
# Find unique categorical Values

categorical_columns = train_data.select_dtypes(include=['object']).columns
print(categorical_columns)

for col in categorical_columns:
    print(f'{col} unique values in training set: {train_data[col].unique()}')
    print(f'{col} unique values in test set: {test_data[col].unique()}')


Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')
proto unique values in training set: ['tcp' 'udp' 'arp' 'ospf' 'icmp' 'igmp' 'rtp' 'ddp' 'ipv6-frag' 'cftp'
 'wsn' 'pvp' 'wb-expak' 'mtp' 'pri-enc' 'sat-mon' 'cphb' 'sun-nd' 'iso-ip'
 'xtp' 'il' 'unas' 'mfe-nsp' '3pc' 'ipv6-route' 'idrp' 'bna' 'swipe'
 'kryptolan' 'cpnx' 'rsvp' 'wb-mon' 'vmtp' 'ib' 'dgp' 'eigrp' 'ax.25'
 'gmtp' 'pnni' 'sep' 'pgm' 'idpr-cmtp' 'zero' 'rvd' 'mobile' 'narp' 'fc'
 'pipe' 'ipcomp' 'ipv6-no' 'sat-expak' 'ipv6-opts' 'snp' 'ipcv'
 'br-sat-mon' 'ttp' 'tcf' 'nsfnet-igp' 'sprite-rpc' 'aes-sp3-d' 'sccopmce'
 'sctp' 'qnx' 'scps' 'etherip' 'aris' 'pim' 'compaq-peer' 'vrrp' 'iatp'
 'stp' 'l2tp' 'srp' 'sm' 'isis' 'smp' 'fire' 'ptp' 'crtp' 'sps'
 'merit-inp' 'idpr' 'skip' 'any' 'larp' 'ipip' 'micp' 'encap' 'ifmp'
 'tp++' 'a/n' 'ipv6' 'i-nlsp' 'ipx-n-ip' 'sdrp' 'tlsp' 'gre' 'mhrp' 'ddx'
 'ippc' 'visa' 'secure-vmtp' 'uti' 'vines' 'crudp' 'iplt' 'ggp' 'ip'
 'ipnip' 'st2' 'argus' 'bbn-rcc' 'egp' 'emcon' 'igp

In [21]:
# Remove Rows with mismatched categorical values( Filter out rows with categorical values that only appear in the test set
for col in categorical_columns:
    train_values = set(train_data[col].unique())
    test_values = set(test_data[col].unique())
    common_values = train_values.intersection(test_values)

    train_data = train_data[train_data[col].isin(common_values)]
    test_data = test_data[test_data[col].isin(common_values)]
    #print the changes
    


In [29]:
#Feature Engineering
#One-hot encoding to convert categorical columns into numerical form

train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# Ensure the same columns in train and test sets
train_data, test_data = train_data.align(test_data, join='inner', axis=1)





KeyError: "None of [Index(['proto', 'service', 'state', 'attack_cat'], dtype='object')] are in the [columns]"

In [33]:
#Normalize the numeric features (excluding the target labels).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns.drop('label')

train_data[numeric_columns] = scaler.fit_transform(train_data[numeric_columns])
test_data[numeric_columns] = scaler.transform(test_data[numeric_columns])
