In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Fix NumPy's global random state so NumPy-level randomness repeats across runs.
# NOTE: the scikit-learn calls later in this notebook pass random_state explicitly.
seed = 123 # single seed value for the whole notebook
np.random.seed(seed) # seeds NumPy's legacy global random generator

In [66]:
# Column names applied to the header-less CSV, in column order. The final
# entry, 'intrusion_type', is the label column.
# List taken from:
# https://github.com/Saurabh2805/kdd_cup_99/blob/master/KDD_CUP_99_dataset_1.ipynb
features = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
    intrusion_type
""".split()  # names contain no whitespace, so split() yields one entry per name

In [67]:
%%time
data = pd.read_csv('kddcup.data.corrected', names=features, header = None) #tells data data get header titled with the things in features, tells that the data doesnt have a header row inheritly, otherwise it would replace the first row with the features lines
encoded = pd.get_dummies(data, drop_first=True)

X = encoded.iloc[:, :-1]  # All rows, all columns except the last
y = encoded.iloc[:, -1]   # All rows, last column (labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) #training 80% testing 20%

print("Training size:", X_train.shape, y_train.shape)
print("Testing size:", X_test.shape, y_test.shape)


Training size: (3918744, 140) (3918744,)
Testing size: (979687, 140) (979687,)
CPU times: total: 25.2 s
Wall time: 26.4 s


In [74]:
%%time
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=123) #estimators is no of trees and depth is their state

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy_percentage = accuracy * 100
test_loss = 1 - accuracy

print(f"Accuracy: {accuracy_percentage:.4f}%") #accuracy to 4 decimals
print(f"Test Loss: {test_loss:.4f}")

Accuracy: 99.9997%
Test Loss: 0.0000
CPU times: total: 2min 27s
Wall time: 2min 29s
