In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset into a DataFrame
data = pd.read_csv("E:\Final year project\iot23_combined.csv")

In [2]:
label_counts = data['label'].value_counts()
print(label_counts)

label
PartOfAHorizontalPortScan     16312685
Benign                         3775710
Okiru                          3518847
DDoS                           1512837
C&C                              16868
Attack                            4823
C&C-HeartBeat                     2802
-   benign   -                     127
C&C-FileDownload                    47
C&C-Torii                           30
FileDownload                        14
C&C-HeartBeat-FileDownload           8
Okiru-Attack                         3
C&C-Mirai                            1
Name: count, dtype: int64


In [3]:
label_mapping = {
    'PartOfAHorizontalPortScan': 0,
    'Benign': 1,
    'Okiru': 2,
    'DDoS': 3,
    'C&C': 4,
    'Attack': 5,
    'C&C-HeartBeat': 6,
    'C&C-FileDownload': 7,
    'C&C-Torii': 8,
    'FileDownload': 9,
    'C&C-HeartBeat-FileDownload': 10,
    'Okiru-Attack': 11,
    'C&C-Mirai': 12,
    '-   benign   -': 13,
}
data['label'] = data['label'].map(label_mapping)

In [4]:
data['label'].unique()

array([ 0,  1,  4,  2,  3,  6,  8,  5,  7, 11,  9, 10, 12, 13],
      dtype=int64)

In [5]:
binary_feature_columns = [
    'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ',
    'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH',
    'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3',
    'conn_state_SF', 'conn_state_SH', 'conn_state_SHR'
]

data[binary_feature_columns] = data[binary_feature_columns].astype(int)


In [6]:
data

Unnamed: 0.1,Unnamed: 0,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,0,2.998796,0,0,0.0,3.0,180.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
2,2,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
3,3,2.998804,0,0,0.0,3.0,180.0,0.0,0.0,1,...,0,0,0,1,0,0,0,0,0,0
4,4,0.000000,0,0,0.0,1.0,60.0,0.0,0.0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25144797,999994,0.000000,0,0,0.0,1.0,40.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
25144798,999995,0.000000,0,0,0.0,1.0,40.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
25144799,999996,0.000000,0,0,0.0,1.0,40.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0
25144800,999997,0.000000,0,0,0.0,1.0,40.0,0.0,0.0,0,...,0,0,0,1,0,0,0,0,0,0


In [8]:
from sklearn.preprocessing import StandardScaler

# Create a list of non-binary feature columns
non_binary_feature_columns = [
    'duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes',
    'resp_pkts', 'resp_ip_bytes'
]

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the non-binary feature columns
data[non_binary_feature_columns] = scaler.fit_transform(data[non_binary_feature_columns])


In [9]:
X = data[['duration', 'orig_bytes', 'resp_bytes', 'missed_bytes', 'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'proto_icmp', 'proto_tcp', 'proto_udp', 'conn_state_OTH', 'conn_state_REJ', 'conn_state_RSTO', 'conn_state_RSTOS0', 'conn_state_RSTR', 'conn_state_RSTRH', 'conn_state_S0', 'conn_state_S1', 'conn_state_S2', 'conn_state_S3', 'conn_state_SF', 'conn_state_SH', 'conn_state_SHR']]
y = data['label']

In [10]:
y.unique()

array([ 1,  2,  5,  3,  4,  7,  9,  6,  8, 12, 10, 11, 13, 14],
      dtype=int64)

In [11]:
X

Unnamed: 0,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,proto_icmp,proto_tcp,...,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR
0,0.152556,-0.00038,-0.000362,-0.001151,-0.000505,-0.000487,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
1,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000663,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
2,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000663,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
3,0.152556,-0.00038,-0.000362,-0.001151,-0.000505,-0.000487,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
4,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000663,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25144797,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000692,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
25144798,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000692,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
25144799,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000692,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0
25144800,0.006778,-0.00038,-0.000362,-0.001151,-0.000621,-0.000692,-0.003381,-0.00084,0,1,...,0,0,0,1,0,0,0,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

# Split the data into training (70%), validation (15%), and test (15%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=42)

# Print the shapes of the resulting sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (17601361, 24) (17601361,)
Validation set shape: (3771720, 24) (3771720,)
Test set shape: (3771721, 24) (3771721,)


In [13]:
data.shape

(25144802, 26)

In [14]:
y.unique()

array([ 1,  2,  5,  3,  4,  7,  9,  6,  8, 12, 10, 11, 13, 14],
      dtype=int64)

CNN Model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.optimizers import Adam

# Hyperparameters
num_classes = 14
input_shape = (24, 1)

model = Sequential()

# Conv blocks
model.add(Conv1D(filters=36, kernel_size=5,
                 activation='relu',
                 input_shape=input_shape))
model.add(BatchNormalization())

model.add(Conv1D(filters=72, kernel_size=5,
                 activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=144, kernel_size=3,
                 activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(filters=288, kernel_size=3,
                 activation='relu'))
model.add(BatchNormalization())

# Classifier 
model.add(GlobalAveragePooling1D())
model.add(Dense(num_classes, activation='softmax'))

# Compile
optimizer = Adam()
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 20, 36)            216       
                                                                 
 batch_normalization (Batch  (None, 20, 36)            144       
 Normalization)                                                  
                                                                 
 conv1d_1 (Conv1D)           (None, 16, 72)            13032     
                                                                 
 batch_normalization_1 (Bat  (None, 16, 72)            288       
 chNormalization)                                                
                                                                 
 max_pooling1d (MaxPooling1  (None, 8, 72)             0         
 D)                                                              
                                                        

In [20]:
import numpy as np

# Convert Pandas DataFrames to NumPy arrays
X_train = X_train.values
X_val = X_val.values
X_test = X_test.values
y_train = y_train.values
y_val = y_val.values

# Reshape input for channels dimension
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)


In [34]:
from tensorflow.keras.utils import to_categorical

num_classes = 14
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)


IndexError: index 14 is out of bounds for axis 1 with size 14

In [27]:
np.unique(y_train)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [28]:
model.fit(X_train, y_train,
          validation_data=(X_val, y_val),
          batch_size=32,
          epochs=30)

Epoch 1/30


ValueError: in user code:

    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1338, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1322, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1303, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1081, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\training.py", line 1139, in compute_loss
        return self.compiled_loss(
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 142, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 268, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\losses.py", line 2122, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "C:\Users\racha\AppData\Local\Programs\Python\Python311\Lib\site-packages\keras\src\backend.py", line 5560, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1) and (None, 14) are incompatible
