In [10]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import SimpleRNN, Dense, Input
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.utils import to_categorical

In [2]:
def combine_dataset(files, col_names, processed = False):
	dtypes = {}
	if processed == False:
		for col_name in col_names:
			nominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
				                 'service', 'ct_ftp', 'label_10'])  #Nominal column
			if col_name in nominal_names:
				dtypes[col_name] =  str
			else:
				dtypes[col_name] = np.float32
	else:
		for col_name in col_names:
			dtypes[col_name] = np.float32

	records = []
	for file in files:
		data = pd.read_csv(file, header = None, names = col_names, dtype = dtypes)
		records.append(data)

	records_all = pd.concat(records) #When there is no index, concat adds them together regardless of the column names,


	return records_all

In [3]:
#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal):
    
	# Drop the features has no meaning such as src ip.
    for cols in cols_to_drop:
        dataset.drop(cols, axis = 1, inplace = True)

	# Save the label and then drop it from dataset
    label_10 = dataset['label_10']
    dataset.drop('label_2', axis = 1, inplace = True)

	# replace the label with specific code
    replace_dict = { 'NaN': 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,
                    'Worms':9, ' Reconnaissance ': 7,}
    dataset['label_10'] = label_10.replace(replace_dict)

	# replace the lost values
    replace_dict = {"NaN": 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)
        
    for x in dataset['is_ftp']:
        if x != 0:
            x = 1

    for col_name in cols_nominal:
        dataset.drop(col_name, axis = 1, inplace = True) 

    return dataset  #Complete data set (including data and labels)

In [4]:
file_folder = 'unsw-NB15/'  #The location where the original file was read
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
	             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
	             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
	             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
	             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
	             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
	             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
	             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
	             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']    #listed name

cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
cols_nominal = ['proto', 'service', 'state']   #Nominal features

files = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)]
dataset = combine_dataset(files, col_names)
dataset = dataset.fillna("NaN")

In [5]:
dataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal)

In [6]:
X = dataset.drop('label_10', axis=1)
y = dataset['label_10']

In [8]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 인코더 모델 output의 feature 수를 결정하는 변수
feature_cnt = 19

In [9]:
# 오토인코더 모델 정의
autoencoder = Sequential([
    Dense(64, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(feature_cnt, activation='relu'),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(X_scaled.shape[1], activation='linear')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
autoencoder.compile(optimizer='adam', loss='mse')

In [12]:
# 학습 시간 측정 시작
start_time = time.time()

In [13]:
# 오토인코더 모델 학습
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 647us/step - loss: 0.1138 - val_loss: 0.0228
Epoch 2/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 649us/step - loss: 0.0396 - val_loss: 0.0161
Epoch 3/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 642us/step - loss: 0.0356 - val_loss: 0.0117
Epoch 4/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 645us/step - loss: 0.0303 - val_loss: 0.0227
Epoch 5/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 641us/step - loss: 0.0540 - val_loss: 0.0076
Epoch 6/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 640us/step - loss: 0.0412 - val_loss: 0.0086
Epoch 7/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 644us/step - loss: 0.0414 - val_loss: 0.0096
Epoch 8/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 643us/step - loss: 0.0360 - v

<keras.src.callbacks.history.History at 0x245b0e10f90>

In [14]:
auto_training_time = time.time() - start_time
print("AutoEncoder Training Time:", auto_training_time, "seconds")

AutoEncoder Training Time: 186.93509459495544 seconds


In [15]:
encoder_input = autoencoder.layers[0].input  # 오토인코더 모델의 입력 레이어
encoder_output = autoencoder.layers[2].output  # 오토인코더 모델의 첫 번째 hidden layer의 출력
encoder_model = Model(inputs=encoder_input, outputs=encoder_output)

In [16]:
# 예측 시간 측정 시작
start_time = time.time()
# 압축 데이터 추출
X_compressed = encoder_model.predict(X_scaled)
#예측 시간 계산
auto_inference_time = time.time() - start_time
print("AutoEncoder Testing Time:", auto_inference_time, "seconds")

[1m79377/79377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 454us/step
AutoEncoder Testing Time: 65.97966694831848 seconds


In [18]:
# 데이터를 3D 형태로 변환 (샘플 수, 시간 단계 수, 특성 수)
X_reshaped = X_compressed.reshape(X_compressed.shape[0], 1, X_compressed.shape[1])

In [19]:
# 분류 라벨을 원-핫 인코딩
y_onehot = to_categorical(y)

In [20]:
# 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_onehot, test_size=0.2, random_state=42)

In [21]:
# 압축된 특성으로 RNN 모델 정의
rnn_model = Sequential([
    SimpleRNN(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu', return_sequences=True),
    SimpleRNN(32, activation='relu'),
    Dense(10, activation='softmax')
])

  super().__init__(**kwargs)


In [22]:
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
start_time = time.time()
rnn_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])
training_time = time.time() - start_time
print("Training Time:", training_time, "seconds")

Epoch 1/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 857us/step - accuracy: 0.9619 - loss: 0.1209 - val_accuracy: 0.9708 - val_loss: 0.0769
Epoch 2/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 847us/step - accuracy: 0.9717 - loss: 0.0743 - val_accuracy: 0.9715 - val_loss: 0.0723
Epoch 3/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 832us/step - accuracy: 0.9723 - loss: 0.0715 - val_accuracy: 0.9731 - val_loss: 0.0699
Epoch 4/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 834us/step - accuracy: 0.9731 - loss: 0.0697 - val_accuracy: 0.9731 - val_loss: 0.0690
Epoch 5/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 865us/step - accuracy: 0.9731 - loss: 0.0693 - val_accuracy: 0.9735 - val_loss: 0.0688
Epoch 6/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 856us/step - accuracy: 0.9736 - loss: 0.0679 - val_accuracy: 0.9732 - val

In [25]:
# 모델 평가
start_time = time.time()
loss, accuracy = rnn_model.evaluate(X_test, y_test)
inference_time = time.time() - start_time  # 예측 시간 측정 종료
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
print("Testing Time:", inference_time, "seconds")

[1m15876/15876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 517us/step - accuracy: 0.9739 - loss: 0.0675
Test Loss: 0.06788550317287445
Test Accuracy: 0.9737170338630676
Testing Time: 8.287434816360474 seconds
