In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import recall_score
from sklearn.model_selection import RandomizedSearchCV
from timeit import default_timer
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, STATUS_OK, space_eval
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
# Timestamp the run so a reader can tell when the saved outputs were produced.
last_run_at = datetime.datetime.now()
print("Notebook last run (end-to-end): " + str(last_run_at))

Notebook last run (end-to-end): 2024-03-13 16:09:33.447583


Dataset source (NSL-KDD, Kaggle): https://www.kaggle.com/datasets/hassan06/nslkdd/data

In [2]:
# Reproducibility settings.
# SEED is the value handed to scikit-learn (random_state of the train/valid split);
# SEED_NP seeds the NumPy generator `rstate`.
SEED_ORIG, SEED_NP = 2024, 2023
SEED = SEED_ORIG
# NOTE(review): NUM_SIMS is not used in the visible part of the notebook — confirm it is still needed.
NUM_SIMS = 10
rstate = np.random.default_rng(seed=SEED_NP)

# Loading the dataset

In [3]:
# Both files ship without a header row; column names are attached in the next cell.
train_data, test_data = (
    pd.read_csv(path, header=None)
    for path in ("KDDTrain+.txt", "KDDTest+.txt")
)

In [4]:
# Column names for the header-less KDDTrain+/KDDTest+ files: 43 names in file order,
# ending with the 'attack' label and a trailing 'level' column.
# NOTE(review): only 'attack' is dropped later, so 'level' stays in the feature
# matrix — confirm that this is intended.
Columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot',
            'num_failed_logins','logged_in','num_compromised','root_shell','su_attempted','num_root','num_file_creations',
            'num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count',
            'serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate','srv_diff_host_rate',
            'dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate','dst_host_serror_rate','dst_host_srv_serror_rate','dst_host_rerror_rate',
            'dst_host_srv_rerror_rate','attack','level'])
# Both frames were read with header=None, so the same names are assigned to each.
train_data.columns = Columns
test_data.columns  = Columns

In [5]:
# Share of all records that come from the test file.
n_train_rows, n_test_rows = len(train_data), len(test_data)
n_test_rows / (n_train_rows + n_test_rows)

0.1517940707124437

In [6]:
def _binary_attack_label(label):
    """Map a raw attack label to a binary target.

    Returns 0 for 'normal' and 1 for every other label. Values that are
    already 0 or 1 are returned unchanged, so re-running this cell after the
    column has been encoded does not relabel every row as an attack.
    """
    if label in (0, 1):
        return label
    return 0 if label == 'normal' else 1


train_attack = train_data.attack.map(_binary_attack_label)
test_attack = test_data.attack.map(_binary_attack_label)

# Overwrite the string label column with its binary encoding.
train_data['attack'] = train_attack
test_data['attack'] = test_attack

In [7]:
# Missing values per column of the training frame.
train_data.isna().sum()


duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_h

In [8]:
# Number of fully duplicated rows, reported as (train, test).
tuple(frame.duplicated().sum() for frame in (train_data, test_data))



(9, 3)

In [9]:
# Columns that are one-hot encoded with pd.get_dummies further down.
cat_columns = ['protocol_type','service','flag']

In [10]:
# Frequency of each protocol in the training data.
train_data.loc[:, "protocol_type"].value_counts()

tcp     102689
udp      14993
icmp      8291
Name: protocol_type, dtype: int64

In [11]:
# Frequency of each service in the training data.
train_data.loc[:, "service"].value_counts()

http         40338
private      21853
domain_u      9043
smtp          7313
ftp_data      6860
             ...  
tftp_u           3
http_8001        2
aol              2
harvest          2
http_2784        1
Name: service, Length: 70, dtype: int64

In [12]:
# Frequency of each connection flag in the training data.
train_data.loc[:, "flag"].value_counts()

SF        74945
S0        34851
REJ       11233
RSTR       2421
RSTO       1562
S1          365
SH          271
S2          127
RSTOS0      103
S3           49
OTH          46
Name: flag, dtype: int64

In [13]:
# Number of distinct flag values.
train_data["flag"].nunique()

11

In [14]:
# Training-set size and class balance of the binary label.
train_data.shape[0], train_data.loc[:, "attack"].value_counts()

(125973,
 0    67343
 1    58630
 Name: attack, dtype: int64)

In [15]:
# One-hot encode the categorical columns; with an empty prefix the new columns
# are named after the category values themselves (e.g. 'tcp', 'http', 'SF').
# NOTE(review): train and test are encoded independently, so a category that is
# missing from one file produces no column there. The mismatch is repaired
# further down by dropping the train-only columns.
# This cell cannot be re-run on its own: the source columns are consumed.
dummy_options = dict(columns=cat_columns, prefix="", prefix_sep="")
train_data = pd.get_dummies(train_data, **dummy_options)
test_data = pd.get_dummies(test_data, **dummy_options)

In [16]:
# The label column survives the encoding untouched.
train_data.loc[:, "attack"]

0         0
1         0
2         1
3         0
4         0
         ..
125968    1
125969    0
125970    0
125971    1
125972    0
Name: attack, Length: 125973, dtype: int64

In [17]:
# Separate the test label from the test features.
y_test = test_data["attack"]
X_test = test_data.drop(columns="attack")

In [18]:
# Full training features/labels before the train/validation split
# (the trailing underscore marks the unsplit versions).
y_train_ = train_data["attack"]
X_train_ = train_data.drop(columns="attack")

In [19]:
# Hold out a quarter of the training file as a validation set; seeded for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_,
    y_train_,
    test_size=0.25,
    random_state=SEED,
)

In [None]:
# Every feature column except the label and the names listed in cat_columns.
# NOTE(review): the original categorical columns were already replaced by their
# one-hot columns, so the cat_columns test filters nothing and the one-hot
# columns end up in this list too.
num_columns = [
    name
    for name in train_data.drop(columns="attack").columns
    if name not in cat_columns
]
num_columns

In [21]:
# Sanity check: the label must not appear among the feature columns (expects False).
"attack" in num_columns

False

In [22]:
# Row counts followed by column counts for train / valid / test.
frames = (X_train, X_valid, X_test)
tuple(len(frame) for frame in frames) + tuple(frame.shape[1] for frame in frames)

(94479, 31494, 22544, 123, 123, 117)

In [24]:
# Size and class balance of the training split.
X_train.shape[0], y_train.value_counts()

(94479,
 0    50528
 1    43951
 Name: attack, dtype: int64)

In [25]:
# Size and class balance of the validation split.
X_valid.shape[0], y_valid.value_counts()

(31494,
 0    16815
 1    14679
 Name: attack, dtype: int64)

In [26]:
# Size and class balance of the test set.
X_test.shape[0], y_test.value_counts()

(22544,
 1    12833
 0     9711
 Name: attack, dtype: int64)

In [20]:
# Train columns that have no counterpart in the test frame, in train column order.
missing_in_test = ~X_train.columns.isin(X_test.columns)
notintestcols = X_train.columns[missing_in_test].tolist()

In [21]:
# Remove the train-only columns so train and validation match the test schema.
X_train, X_valid = (
    frame.drop(columns=notintestcols) for frame in (X_train, X_valid)
)

In [22]:
# Recompute after the drop; iterating a DataFrame yields its column labels.
notintestcols = [name for name in X_train if name not in X_test]

In [23]:
# Expect an empty list now that the train-only columns have been dropped.
notintestcols

[]

In [24]:
# Dropping columns must leave the number of training rows unchanged.
len(X_train)

94479

In [25]:
# Column counts for train / valid / test; all three should agree.
tuple(frame.shape[1] for frame in (X_train, X_valid, X_test))

(117, 117, 117)

In [25]:
# Keep only the columns that also exist in the test frame, preserving order.
test_columns = set(X_test.columns)
num_columns = [name for name in num_columns if name in test_columns]
num_columns

['duration',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'level',
 'icmp',
 'tcp',
 'udp',
 'IRC',
 'X11',
 'Z39_50',
 'auth',
 'bgp',
 'courier',
 'csnet_ns',
 'ctf',
 'daytime',
 'discard',
 'domain',
 'domain_u',
 'echo',
 'eco_i',
 'ecr_i',
 'efs',
 'exec',
 'finger',
 'ftp',
 'ftp_data',
 'gopher',
 'hostnames',
 'http'

In [26]:
# Number of columns that will be passed through the scaler.
len(num_columns)

117

In [27]:
# NOTE(review): bare tuple that is only displayed and never assigned; presumably the
# six train-only columns removed above — confirm, or turn this into a markdown note.
'http_8001', 'http_2784', 'red_i', 'harvest', 'aol', 'urh_i'

('http_8001', 'http_2784', 'red_i', 'harvest', 'aol', 'urh_i')

In [24]:
# Keep only the normal (attack == 0) rows for training.
# The mask is built from the labels of the rows currently in X_train, so the cell
# can be re-run safely: a full-length y_train mask would no longer align with an
# already-filtered X_train. y_train itself is left untouched.
normal_mask = y_train.loc[X_train.index] == 0
X_train = X_train[normal_mask].copy()

In [25]:
# Number of training rows left after keeping only attack == 0.
len(X_train)

50528

In [26]:
# Feature scaler. StandardScaler is the active choice; the commented-out lines
# are the alternatives imported at the top of the notebook.
scaler = StandardScaler()
#scaler = RobustScaler()
#scaler = MinMaxScaler()

In [27]:
#X_train = X_train.astype(float)


In [28]:
# Fit on the training rows only, so no validation/test statistics leak into the scaling.
# Fit on exactly the columns that are transformed later (X_*[num_columns]); fitting
# on the whole frame only works while num_columns matches X_train.columns in
# both content and order.
scaler.fit(X_train[num_columns])

In [29]:
# Inspect the validation features before they are scaled.
X_valid

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,REJ,RSTO,RSTOS0,RSTR,S0,S1,S2,S3,SF,SH
115556,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
53585,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
22976,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
103860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
108379,0,30,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52453,0,187,13920,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
5905,0,1032,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7519,0,235,1840,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
109059,0,325,362,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [30]:
# Replace the training features with their scaled values.
# The frame is overwritten in place, so running this cell twice scales twice.
train_scaled = scaler.transform(X_train[num_columns])
X_train[num_columns] = train_scaled

In [31]:
# Apply the already-fitted scaler to the validation features.
# The frame is overwritten in place, so running this cell twice scales twice.
valid_scaled = scaler.transform(X_valid[num_columns])
X_valid[num_columns] = valid_scaled

In [32]:
# Apply the already-fitted scaler to the test features.
# The frame is overwritten in place, so running this cell twice scales twice.
test_scaled = scaler.transform(X_test[num_columns])
X_test[num_columns] = test_scaled

In [33]:
# Append the label as a trailing 'target' column on the validation and test frames
# (assignment aligns on the index). X_train gets no such column.
for frame, labels in ((X_valid, y_valid), (X_test, y_test)):
    frame["target"] = labels

In [34]:
# Class balance of the attached target column, reported as (valid, test).
tuple(frame["target"].value_counts() for frame in (X_valid, X_test))

(0    16815
 1    14679
 Name: target, dtype: int64,
 1    12833
 0     9711
 Name: target, dtype: int64)

In [35]:
# Write the three sets as space-separated files without header or index.
# The train file holds features only; valid and test carry the extra trailing 'target' column.
csv_options = dict(header=None, index=False, sep=" ")
X_train.to_csv("nsl-kdd_train.csv", **csv_options)
X_valid.to_csv("nsl-kdd_valid.csv", **csv_options)
X_test.to_csv("nsl-kdd_test.csv", **csv_options)
X_train.shape[0], X_valid.shape[0], X_test.shape[0], X_train.shape[1]

(50528, 31494, 22544, 117)

In [36]:
# Column counts of the exported valid / test frames (features plus 'target').
X_valid.shape[1], X_test.shape[1]

(118, 118)

In [37]:
import os


In [38]:
# Working directory, i.e. where the relative CSV paths above were written.
# NOTE(review): the saved output exposes a local absolute path — consider clearing it before sharing.
os.getcwd()

'/home/szekeres/Letöltések/nsl_kdd'