In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from keras.applications.vgg19 import preprocess_input
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras import datasets, layers, models

In [None]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='combined.csv')

In [None]:
# Encode every distinct value of the ' Label' column as an integer code,
# numbered in order of first appearance, and print the label -> code table.
label_array = df[' Label'].unique()

# Missing labels are kept out of the mapping so that they stay NaN after
# .map() instead of silently becoming a class of their own.
label_codes = {
    label: code for code, label in enumerate(label_array) if pd.notna(label)
}
for label, code in label_codes.items():
    print(label, code)

# One vectorized lookup instead of one full-column scan per label.
df[' Label'] = df[' Label'].map(label_codes)

BENIGN 0
DDoS 1
PortScan 2
Bot 3
Infiltration 4
Web Attack � Brute Force 5
Web Attack � XSS 6
Web Attack � Sql Injection 7
FTP-Patator 8
SSH-Patator 9
DoS slowloris 10
DoS Slowhttptest 11
DoS Hulk 12
DoS GoldenEye 13
Heartbleed 14


In [None]:
def clean_dataset(df):
    """Return a float64 copy of ``df`` without rows holding NaN or +/-inf.

    The input frame is left untouched; the surviving rows keep their
    original index labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns can all be cast to ``float64``.

    Returns
    -------
    pd.DataFrame
        Rows of ``df`` that contain only finite values, cast to ``float64``.

    Raises
    ------
    AssertionError
        If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Non-inplace dropna: the caller's frame must not be modified.
    without_missing = df.dropna()
    # `axis` is passed by keyword; pandas 2.x no longer accepts it positionally.
    has_non_finite = without_missing.isin([np.nan, np.inf, -np.inf]).any(axis=1)
    return without_missing[~has_non_finite].astype(np.float64)

In [None]:
# Keep only rows without NaN / +/-inf and cast every column to float64.
df = clean_dataset(df)

In [None]:
# Work on the raw NumPy array from here on.
dataset = df.to_numpy()

In [None]:
# Feature matrix: the first 78 columns.
X = dataset[:, :78]

In [None]:
# Target vector: column index 78 (presumably the encoded ' Label' column;
# confirm against the CSV column order).
Y = dataset[:, 78]

In [None]:
# The cleaning step cast everything to float64; class ids must be integers.
Y = Y.astype(int)

In [None]:
# Oversample the listed class ids up to 5000 samples each with SMOTE.
# The ids are the integer codes printed by the label-encoding cell.
up_dict = {3: 5000, 4: 5000, 5: 5000, 6: 5000, 7: 5000, 14: 5000}
# Fixed seed so the synthetic samples are the same on every run.
over = SMOTE(sampling_strategy=up_dict, random_state=42)



In [None]:
# Apply the SMOTE oversampling configured above.
# NOTE(review): this runs before the train/val/test split, so synthetic
# samples can end up in the validation and test sets.
X, Y = over.fit_resample(X, Y)



In [None]:
# Standardize every feature column.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# validation/test rows contribute to the fitted mean and variance.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# 70 % train / 30 % held out. Seeded for reproducibility and stratified so
# every class keeps the same proportion on both sides of the split.
X_train, X_val_test, Y_train, Y_val_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

In [None]:
# Split the held-out 30 % evenly into validation and test sets, again seeded
# and stratified by class.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_val_test, Y_val_test, test_size=0.5, random_state=42, stratify=Y_val_test
)

In [None]:
# Sanity check: (rows, features) of the training split.
print(X_train.shape)

(1997585, 78)


In [None]:
# Add a trailing channel axis, (n, 78) -> (n, 78, 1), as expected by Conv1D.
# A single vectorized reshape replaces the per-row Python loop; it still
# raises if a split does not have exactly 78 features and is safe to re-run.
X_train = X_train.reshape(len(X_train), 78, 1)
X_test = X_test.reshape(len(X_test), 78, 1)
X_val = X_val.reshape(len(X_val), 78, 1)

In [None]:
# Turn each label vector into a column, (n,) -> (n, 1), with one vectorized
# reshape instead of a per-element Python loop. Safe to re-run.
Y_train = Y_train.reshape(len(Y_train), 1)
Y_test = Y_test.reshape(len(Y_test), 1)
Y_val = Y_val.reshape(len(Y_val), 1)

In [None]:
# Confirm the trailing channel axis was added.
X_train.shape

(1997585, 78, 1)

In [None]:
# Confirm the labels are now a column vector.
Y_train.shape

(1997585, 1)

In [None]:
# 1-D CNN over the 78 features (one channel), ending in 15 raw class scores.
model = models.Sequential([
    layers.Conv1D(32, 5, activation='sigmoid', input_shape=(78, 1)),
    layers.Conv1D(32, 5, activation='sigmoid'),
    layers.MaxPool1D(2, 1),
    layers.Conv1D(32, 5, activation='sigmoid'),
    layers.MaxPool1D(2, 1),
    # `batch_size` is not a BatchNormalization argument (Keras 3 rejects it),
    # so it is dropped here.
    layers.BatchNormalization(),
    layers.Dropout(0.5),
    layers.Flatten(),
    # Hidden layer uses ReLU: a softmax here would force the 64 activations
    # to sum to 1 and throttle what the output layer can see.
    layers.Dense(64, activation='relu'),
    # No activation: the loss is configured with from_logits=True.
    layers.Dense(15),
])

In [None]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 74, 32)            192       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 70, 32)            5152      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 69, 32)            0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 65, 32)            5152      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 64, 32)            0         
_________________________________________________________________
batch_normalization (BatchNo (None, 64, 32)            128       
_________________________________________________________________
dropout (Dropout)            (None, 64, 32)            0

In [None]:
# The last Dense layer has no activation, so the loss is told to expect
# logits; labels are integer class ids, hence the sparse variant.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs, tracking loss/accuracy on the validation split.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_val, Y_val),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Loss and accuracy on the test split (verbose=2: one summary line).
test_loss, test_acc = model.evaluate(x=X_test, y=Y_test, verbose=2)

13377/13377 - 19s - loss: 0.0403 - accuracy: 0.9841


In [None]:
# Per-class scores for every test row; these are logits, not probabilities,
# because the output layer has no activation.
predictions = model.predict(x=X_test)

In [None]:
# Predicted class id = position of the largest score in each row.
prediction_index = predictions.argmax(axis=1)

In [None]:
# Per-class evaluation on the test split. Y_test is stored as an (n, 1)
# column, so it is flattened explicitly before being compared with the
# 1-D predictions.
y_true = Y_test.ravel()
print(confusion_matrix(y_true, prediction_index))
print(classification_report(y_true, prediction_index))

[[337422      6   2482    144      7      3     84     56      1     12
      11     98    668     14      1]
 [   220  18903      0      0      0      0      0      0      0      0
       0      0      1      1      0]
 [  1181      2  22675      0      0      1      1      0      0      0
       4      0      8      0      0]
 [   280      0      0    486      0      0      0      0      0      0
       0      0      0      0      0]
 [    10      0      0      0    768      0      0      0      0      0
       0      0      0      0      0]
 [    21      0      0      0      0     77    635      2      0     36
       0      0      0      0      0]
 [    28      0      0      0      0      1    668      2      0      7
       0      0      1      1      0]
 [     4      0      0      0      0      0      0    461      0    294
       0      0      0      1      0]
 [     1      0      0      0      0      0      0      0   1161      0
       3      0      0      0      0]
 [    23  

In [None]:
# Raw scores for the first test row (one value per class).
predictions[0]

array([ -0.37960762, -16.121174  ,   0.9474806 , -19.949036  ,
       -17.043604  , -16.379002  , -16.275553  , -17.146511  ,
       -20.958937  , -16.823114  , -16.545607  , -16.785376  ,
       -15.685574  , -19.28801   , -18.654753  ], dtype=float32)

In [None]:
# True class id of the first test row, for comparison with the scores above.
Y_test[0]

array([2])