In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

In [2]:
def combine_dataset(files, col_names, processed = False):
	dtypes = {}
	if processed == False:
		for col_name in col_names:
			nominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
				                 'service', 'ct_ftp', 'label_10'])  #Nominal column
			if col_name in nominal_names:
				dtypes[col_name] =  str
			else:
				dtypes[col_name] = np.float32
	else:
		for col_name in col_names:
			dtypes[col_name] = np.float32

	records = []
	for file in files:
		data = pd.read_csv(file, header = None, names = col_names, dtype = dtypes)
		records.append(data)

	records_all = pd.concat(records) #When there is no index, concat adds them together regardless of the column names,


	return records_all

In [3]:
#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal):
    
	# Drop the features has no meaning such as src ip.
    for cols in cols_to_drop:
        dataset.drop(cols, axis = 1, inplace = True)

	# Save the label and then drop it from dataset
    label_10 = dataset['label_10']
    dataset.drop('label_2', axis = 1, inplace = True)

	# replace the label with specific code
    replace_dict = { 'NaN': 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,
                    'Worms':9, ' Reconnaissance ': 7,}
    dataset['label_10'] = label_10.replace(replace_dict)

	# replace the lost values
    replace_dict = {"NaN": 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)
        
    for x in dataset['is_ftp']:
        if x != 0:
            x = 1

    for col_name in cols_nominal:
        dataset.drop(col_name, axis = 1, inplace = True) 

    return dataset  #Complete data set (including data and labels)

In [4]:
file_folder = 'unsw-NB15/'  #The location where the original file was read
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
	             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
	             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
	             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
	             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
	             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
	             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
	             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
	             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']    #listed name

cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
cols_nominal = ['proto', 'service', 'state']   #Nominal features

files = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)]
dataset = combine_dataset(files, col_names)
dataset = dataset.fillna("NaN")

In [5]:
dataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal)

In [6]:
X = dataset.drop('label_10', axis=1) 
y = dataset['label_10'] 

In [7]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
feature_cnt = 19

In [8]:
# 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [9]:
# 오토인코더 모델 정의
autoencoder = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(feature_cnt, activation='relu'),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(X_train.shape[1], activation='linear')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
autoencoder.compile(optimizer='adam', loss='mse')

In [11]:
# 오토인코더 모델 학습
start_time = time.time()
autoencoder.fit(X_train, X_train, epochs=50, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])
auto_training_time = time.time() - start_time
print("AutoEncoder Training Time:", auto_training_time, "seconds")

Epoch 1/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 660us/step - loss: 0.1162 - val_loss: 0.0290
Epoch 2/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 649us/step - loss: 0.0463 - val_loss: 0.0442
Epoch 3/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 650us/step - loss: 0.0380 - val_loss: 0.0098
Epoch 4/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 650us/step - loss: 0.0408 - val_loss: 0.0153
Epoch 5/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 659us/step - loss: 0.0482 - val_loss: 0.0153
Epoch 6/50
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 658us/step - loss: 0.0699 - val_loss: 0.0118
AutoEncoder Training Time: 114.36725497245789 seconds


In [12]:
# 오토인코더를 사용하여 데이터 압축
encoder = Model(inputs=autoencoder.layers[0].input, outputs=autoencoder.layers[2].output)
start_time = time.time()
X_train_compressed = encoder.predict(X_train)
X_test_compressed = encoder.predict(X_test)
auto_inference_time = time.time() - start_time
print("AutoEncoder Testing Time:", auto_inference_time, "seconds")

[1m63502/63502[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 457us/step
[1m15876/15876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 455us/step
AutoEncoder Testing Time: 65.55946516990662 seconds


In [13]:
# 압축된 데이터를 3차원으로 변환 (샘플 수, 타임 스텝 수, 특성 수)
X_train_compressed_3d = X_train_compressed.reshape(X_train_compressed.shape[0], 1, X_train_compressed.shape[1])
X_test_compressed_3d = X_test_compressed.reshape(X_test_compressed.shape[0], 1, X_test_compressed.shape[1])

In [14]:
# LSTM 모델 정의
model = Sequential([
    LSTM(64, input_shape=(X_train_compressed_3d.shape[1], X_train_compressed_3d.shape[2]), return_sequences=True),
    LSTM(32, return_sequences=False),
    Dense(10, activation='softmax')
])

  super().__init__(**kwargs)


In [15]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [16]:
# 라벨을 one-hot 인코딩
y_train_onehot = pd.get_dummies(y_train)
y_test_onehot = pd.get_dummies(y_test)

In [None]:
# LSTM 모델 학습
start_time = time.time()
model.fit(X_train_compressed_3d, y_train_onehot, epochs=10, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])
training_time = time.time() - start_time
print("Training Time:", training_time, "seconds")

Epoch 1/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 1ms/step - accuracy: 0.9620 - loss: 0.1238 - val_accuracy: 0.9689 - val_loss: 0.0826
Epoch 2/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1ms/step - accuracy: 0.9710 - loss: 0.0765 - val_accuracy: 0.9717 - val_loss: 0.0743
Epoch 3/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 1ms/step - accuracy: 0.9722 - loss: 0.0721 - val_accuracy: 0.9724 - val_loss: 0.0706
Epoch 4/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - accuracy: 0.9729 - loss: 0.0691 - val_accuracy: 0.9704 - val_loss: 0.0691
Epoch 5/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - accuracy: 0.9730 - loss: 0.0681 - val_accuracy: 0.9733 - val_loss: 0.0679
Epoch 6/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 1ms/step - accuracy: 0.9733 - loss: 0.0672 - val_accuracy: 0.9734 - val_loss: 0.066

In [None]:
# 모델 평가
start_time = time.time()
loss, accuracy = model.evaluate(X_test_compressed_3d, y_test_onehot)
inference_time = time.time() - start_time
print("Testing Time:", inference_time, "seconds")
print("Test Accuracy:", accuracy)