In [1]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import SimpleRNN, Dense, Input
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from tensorflow.keras.utils import to_categorical

In [2]:
def combine_dataset(files, col_names, processed = False):
	dtypes = {}
	if processed == False:
		for col_name in col_names:
			nominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
				                 'service', 'ct_ftp', 'label_10'])  #Nominal column
			if col_name in nominal_names:
				dtypes[col_name] =  str
			else:
				dtypes[col_name] = np.float32
	else:
		for col_name in col_names:
			dtypes[col_name] = np.float32

	records = []
	for file in files:
		data = pd.read_csv(file, header = None, names = col_names, dtype = dtypes)
		records.append(data)

	records_all = pd.concat(records) #When there is no index, concat adds them together regardless of the column names,


	return records_all

In [3]:
#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal):
    
	# Drop the features has no meaning such as src ip.
    for cols in cols_to_drop:
        dataset.drop(cols, axis = 1, inplace = True)

	# Save the label and then drop it from dataset
    label_10 = dataset['label_10']
    dataset.drop('label_2', axis = 1, inplace = True)

	# replace the label with specific code
    replace_dict = { 'NaN': 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,
                    'Worms':9, ' Reconnaissance ': 7,}
    dataset['label_10'] = label_10.replace(replace_dict)

	# replace the lost values
    replace_dict = {"NaN": 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)
        
    for x in dataset['is_ftp']:
        if x != 0:
            x = 1

    for col_name in cols_nominal:
        dataset.drop(col_name, axis = 1, inplace = True) 

    return dataset  #Complete data set (including data and labels)

In [4]:
file_folder = 'unsw-NB15/'  #The location where the original file was read
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
	             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
	             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
	             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
	             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
	             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
	             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
	             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
	             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']    #listed name

cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
cols_nominal = ['proto', 'service', 'state']   #Nominal features

files = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)]
dataset = combine_dataset(files, col_names)
dataset = dataset.fillna("NaN")

In [5]:
dataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal)

In [6]:
X = dataset.drop('label_10', axis=1)
y = dataset['label_10']

In [7]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# 분류 라벨을 원-핫 인코딩
y_onehot = to_categorical(y)

In [9]:
# LSTM 입력 형태로 데이터 재구성
X_reshaped = X_scaled.reshape(X_scaled.shape[0], X_scaled.shape[1], 1)

In [10]:
# 학습 데이터와 테스트 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_onehot, test_size=0.2, random_state=42)

In [11]:
# RNN 모델 정의
model = Sequential([
    SimpleRNN(64, input_shape=(X_train.shape[1], X_train.shape[2]), activation='relu', return_sequences=True),
    SimpleRNN(32, activation='relu'),
    Dense(10, activation='softmax')
])

  super().__init__(**kwargs)


In [12]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [13]:
# RNN 모델 학습
start_time = time.time()
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])
training_time = time.time() - start_time
print("Training Time:", training_time, "seconds")

Epoch 1/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 5ms/step - accuracy: 0.9631 - loss: 0.1120 - val_accuracy: 0.9712 - val_loss: 0.0732
Epoch 2/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 5ms/step - accuracy: 0.9730 - loss: 0.0704 - val_accuracy: 0.9729 - val_loss: 0.0717
Epoch 3/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 5ms/step - accuracy: 0.9733 - loss: 0.0701 - val_accuracy: 0.9725 - val_loss: 0.0746
Epoch 4/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 5ms/step - accuracy: 0.9726 - loss: 0.0726 - val_accuracy: 0.9724 - val_loss: 0.0704
Epoch 5/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 5ms/step - accuracy: 0.9725 - loss: 0.0737 - val_accuracy: 0.9655 - val_loss: 0.0922
Epoch 6/10
[1m28576/28576[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 5ms/step - accuracy: 0.9717 - loss: 0.0766 - val_accuracy: 0.9732 - val_loss:

In [14]:
# 모델 평가
start_time = time.time()
loss, accuracy = model.evaluate(X_test, y_test)
inference_time = time.time() - start_time  # 예측 시간 측정 종료
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)
print("Testing Time:", inference_time, "seconds")

[1m15876/15876[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 2ms/step - accuracy: 0.9657 - loss: 0.0990
Test Loss: 0.09998896718025208
Test Accuracy: 0.9656286239624023
Testing Time: 25.66959023475647 seconds
