In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [2]:
def combine_dataset(files, col_names, processed = False):
	dtypes = {}
	if processed == False:
		for col_name in col_names:
			nominal_names = set(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state',
				                 'service', 'ct_ftp', 'label_10'])  #Nominal column
			if col_name in nominal_names:
				dtypes[col_name] =  str
			else:
				dtypes[col_name] = np.float32
	else:
		for col_name in col_names:
			dtypes[col_name] = np.float32

	records = []
	for file in files:
		data = pd.read_csv(file, header = None, names = col_names, dtype = dtypes)
		records.append(data)

	records_all = pd.concat(records) #When there is no index, concat adds them together regardless of the column names,


	return records_all

In [3]:
#Remove the unimportant feature, one-hot encoding, and convert the attack class to numeric
def select_feature_and_encoding(dataset, cols_to_drop, cols_nominal):
    
	# Drop the features has no meaning such as src ip.
    for cols in cols_to_drop:
        dataset.drop(cols, axis = 1, inplace = True)

	# Save the label and then drop it from dataset
    label_10 = dataset['label_10']
    dataset.drop('label_2', axis = 1, inplace = True)

	# replace the label with specific code
    replace_dict = { 'NaN': 0, 'Analysis': 1, 'Backdoors': 2, 'Backdoor': 2, 'DoS': 3,
                    'Exploits':4,' Fuzzers': 5, ' Fuzzers ':5, 'Generic': 6,
                    'Reconnaissance': 7, ' Shellcode ':8, 'Shellcode': 8,
                    'Worms':9, ' Reconnaissance ': 7,}
    dataset['label_10'] = label_10.replace(replace_dict)

	# replace the lost values
    replace_dict = {"NaN": 0, ' ': 0}
    for cols in ['ct_ftp', 'ct_flw', 'is_ftp']:
        dataset[cols] = dataset[cols].replace(replace_dict)
        
    for x in dataset['is_ftp']:
        if x != 0:
            x = 1

    for col_name in cols_nominal:
        dataset.drop(col_name, axis = 1, inplace = True) 

    return dataset  #Complete data set (including data and labels)

In [4]:
file_folder = 'unsw-NB15/'  #The location where the original file was read
col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur',
	             'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss',
	             'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
	             'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth',
	             'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt',
	             'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips',
	             'ct_state_ttl', 'ct_flw', 'is_ftp', 'ct_ftp', 'ct_srv_src',
	             'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport',
	             'ct_dst_sport', 'ct_dst_src', 'label_10', 'label_2']    #listed name

cols_to_drop = ['srcip', 'dstip', 'stime', 'ltime', 'sport', 'dsport']
cols_nominal = ['proto', 'service', 'state']   #Nominal features

files = [file_folder + 'UNSW-NB15_' + str(i+1) + '.csv' for i in range(4)]
dataset = combine_dataset(files, col_names)
dataset = dataset.fillna("NaN")

In [5]:
dataset = select_feature_and_encoding(dataset, cols_to_drop, cols_nominal)

In [6]:
X = dataset.drop('label_10', axis=1) 
y = dataset['label_10'] 

In [7]:
# 데이터 스케일링
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# PCA의 feature 수를 결정하는 변수
feature_cnt = 19

In [8]:
#pca 사용 차원 축소
pca = PCA(n_components=feature_cnt)
X_pca = pca.fit_transform(X_scaled)

In [9]:
X_original = pca.inverse_transform(X_pca)

In [11]:
mse = mean_squared_error(X_original, X_scaled)
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 0.0538720301459142


In [12]:
# 복원률 계산
reconstruction_rate = 1 - (mse / (X_original ** 2).mean())
print("Reconstruction Rate:", reconstruction_rate)

Reconstruction Rate: 0.9430605247256114
