In [19]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error

In [20]:
def combine_dataset(files, col_names, processed = False):
	dtypes = {}
	if processed == False:
		for col_name in col_names:
			nominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
				                 'service', 'ct_ftp', 'label_10'])  #Nominal column
			if col_name in nominal_names:
				dtypes[col_name] =  str
			else:
				dtypes[col_name] = np.float32
	else:
		for col_name in col_names:
			dtypes[col_name] = np.float32

	records = []
	for file in files:
		data = pd.read_csv(file, header = None, names = col_names, dtype = dtypes)
		records.append(data)

	records_all = pd.concat(records) #When there is no index, concat adds them together regardless of the column names,

	return records_all

In [21]:
#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal):
    
	# Drop the features has no meaning such as src ip.
    for cols in cols_to_drop:
        dataset.drop(cols, axis = 1, inplace = True)

	# Save the label and then drop it from dataset
    label_10 = dataset['label_10']
    dataset.drop('label_2', axis = 1, inplace = True)

	# replace the label with specific code
    replace_dict = { 'NaN': 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,
                    'Worms':9, ' Reconnaissance ': 7,}
    dataset['label_10'] = label_10.replace(replace_dict)

	# replace the lost values
    replace_dict = {"NaN": 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)
        
    for x in dataset['is_ftp']:
        if x != 0:
            x = 1

    for col_name in cols_nominal:
        dataset.drop(col_name, axis = 1, inplace = True) 

    return dataset  #Complete data set (including data and labels)

In [22]:
file_folder = 'unsw-NB15/'  #The location where the original file was read
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
	             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
	             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
	             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
	             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
	             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
	             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
	             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
	             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']    #listed name

cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
cols_nominal = ['proto', 'service', 'state']   #Nominal features

files = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)]
dataset = combine_dataset(files, col_names)
dataset = dataset.fillna("NaN")

In [24]:
dataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal)

In [25]:
X = dataset.drop('label_10', axis=1) 
y = dataset['label_10']  

In [37]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# 인코더 모델 output의 feature 수를 결정하는 변수
feature_cnt = 19

In [38]:
# 오토인코더 모델 정의
autoencoder = Sequential([
    Dense(64, activation='relu', input_shape=(X_scaled.shape[1],)),
    Dense(32, activation='relu'),
    Dense(feature_cnt, activation='relu'),
    Dense(32, activation='relu'),
    Dense(64, activation='relu'),
    Dense(X_scaled.shape[1], activation='linear')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [39]:
autoencoder.compile(optimizer='adam', loss='mse')

In [40]:
# 오토인코더 모델 학습
autoencoder.fit(X_scaled, X_scaled, epochs=50, batch_size=64, validation_split=0.1, callbacks=[EarlyStopping(patience=3)])

Epoch 1/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 657us/step - loss: 0.1068 - val_loss: 0.0180
Epoch 2/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 644us/step - loss: 0.0480 - val_loss: 0.0174
Epoch 3/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 655us/step - loss: 0.0448 - val_loss: 0.0078
Epoch 4/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 665us/step - loss: 0.0347 - val_loss: 0.0085
Epoch 5/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 704us/step - loss: 0.0494 - val_loss: 0.0273
Epoch 6/50
[1m35720/35720[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 671us/step - loss: 0.0526 - val_loss: 0.0338


<keras.src.callbacks.history.History at 0x25b0dbdb450>

In [41]:
# 인코더 모델 추출
encoder_input = autoencoder.layers[0].input 
encoder_output = autoencoder.layers[2].output 
encoder_model = Model(inputs=encoder_input, outputs=encoder_output)

In [42]:
# 데이터 압축 진행
X_compressed = encoder_model.predict(X_scaled)

[1m79377/79377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 463us/step


In [43]:
df = pd.DataFrame(data=X_compressed)

In [44]:
# 디코더 모델 추출
decoder_output = autoencoder.layers[-1].output
decoder_input = autoencoder.layers[3].input  
decoder_model = Model(inputs=decoder_input, outputs=decoder_output)

In [45]:
# 데이터 복원 진행
decoded_data = decoder_model.predict(X_compressed)

[1m79377/79377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 459us/step


In [46]:
mse = mean_squared_error(decoded_data, X_scaled)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 0.03654828725532699


In [47]:
# 복원율 계산
reconstruction_rate = 1 - (mse / (decoded_data ** 2).mean())
print("Reconstruction Rate:", reconstruction_rate)

Reconstruction Rate: 0.9638336479590263
