# Load require library

In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load datasets with columns header

In [8]:
df = pd.read_csv('NSL-KDD/KDDTrain+.txt', sep=',')
df.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'labels', 'difficulty']

In [9]:
df.shape


(125972, 43)

In [32]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,labels,difficulty,targets
0,0,1,6,0,146,0,0,0,0,0,...,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15,0
1,0,0,1,1,0,0,0,0,0,0,...,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19,1
2,0,0,0,0,232,8153,0,0,0,0,...,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21,0
3,0,0,0,0,199,420,0,0,0,0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21,0
4,0,0,1,2,0,0,0,0,0,0,...,0.07,0.00,0.00,0.00,0.00,1.00,1.00,neptune,21,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,0,1,1,0,0,0,0,0,0,...,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20,1
125968,8,1,1,0,105,145,0,0,0,0,...,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21,0
125969,0,0,3,0,2231,384,0,0,0,0,...,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18,0
125970,0,0,40,1,0,0,0,0,0,0,...,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20,1


# Convert protocol_type to numeric value 

In [11]:
df.protocol_type.unique()

array(['udp', 'tcp', 'icmp'], dtype=object)

In [12]:
protocols = df.protocol_type.value_counts()
protocols_map = {}
for i, (prop, count) in enumerate(protocols.items()):
    protocols_map[prop] = i
#     protocols_map[prop] = round(count/sum(protocols), 3)
df.protocol_type = df.protocol_type.map(protocols_map)
df.protocol_type

0         1
1         0
2         0
3         0
4         0
         ..
125967    0
125968    1
125969    0
125970    0
125971    0
Name: protocol_type, Length: 125972, dtype: int64

# Convert service to numeric value

In [13]:
df.service.unique()

array(['other', 'private', 'http', 'remote_job', 'ftp_data', 'name',
       'netbios_ns', 'eco_i', 'mtp', 'telnet', 'finger', 'domain_u',
       'supdup', 'uucp_path', 'Z39_50', 'smtp', 'csnet_ns', 'uucp',
       'netbios_dgm', 'urp_i', 'auth', 'domain', 'ftp', 'bgp', 'ldap',
       'ecr_i', 'gopher', 'vmnet', 'systat', 'http_443', 'efs', 'whois',
       'imap4', 'iso_tsap', 'echo', 'klogin', 'link', 'sunrpc', 'login',
       'kshell', 'sql_net', 'time', 'hostnames', 'exec', 'ntp_u',
       'discard', 'nntp', 'courier', 'ctf', 'ssh', 'daytime', 'shell',
       'netstat', 'pop_3', 'nnsp', 'IRC', 'pop_2', 'printer', 'tim_i',
       'pm_dump', 'red_i', 'netbios_ssn', 'rje', 'X11', 'urh_i',
       'http_8001', 'aol', 'http_2784', 'tftp_u', 'harvest'], dtype=object)

In [14]:
services = df.service.value_counts()
service_map = {}
for i, (ser, count) in enumerate(services.items()):
#     service_map[ser] = round(count/sum(services), 6)
    service_map[ser] = i
df.service = df.service.map(service_map)
df.service

0          6
1          1
2          0
3          0
4          1
          ..
125967     1
125968     1
125969     3
125970    40
125971     4
Name: service, Length: 125972, dtype: int64

# Convert flag to numeric value

In [15]:
df.flag.unique()

array(['SF', 'S0', 'REJ', 'RSTR', 'SH', 'RSTO', 'S1', 'RSTOS0', 'S3',
       'S2', 'OTH'], dtype=object)

In [16]:
flag_map = {}
flags = df.flag.value_counts()
for i, (flag, count) in enumerate(flags.items()):
#     flag_map[flag] = round(count/sum(flags), 3)
    flag_map[flag] = i
df.flag = df.flag.map(flag_map)
df.flag

0         0
1         1
2         0
3         0
4         2
         ..
125967    1
125968    0
125969    0
125970    1
125971    0
Name: flag, Length: 125972, dtype: int64

# Convert labels to numerical values as targets

In [27]:
df.labels.unique()
df['targets'] = df.labels.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})

In [28]:
df.labels.value_counts()

normal             67342
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: labels, dtype: int64

In [196]:
labels_map = {}
for i, label in enumerate(df.labels.unique()):
    labels_map[label] = i
df['targets'] = df.labels.map(labels_map)
df.targets

0         0
1         1
2         0
3         0
4         1
         ..
125967    1
125968    0
125969    0
125970    1
125971    0
Name: targets, Length: 125972, dtype: int64

In [197]:
len(df.targets.unique())

23

# Check the data sets

In [198]:
df.difficulty.unique(), len(df.difficulty.unique())

(array([15, 19, 21, 18, 20, 17, 16, 12, 14, 11,  2, 13, 10,  9,  8,  7,  3,
         5,  1,  6,  0,  4]), 22)

In [266]:
df.values.astype(np.float32)

array([[ 0.,  1.,  6., ...,  0., 15.,  0.],
       [ 0.,  0.,  1., ...,  1., 19.,  1.],
       [ 0.,  0.,  0., ...,  0., 21.,  0.],
       ...,
       [ 0.,  0.,  3., ...,  0., 18.,  0.],
       [ 0.,  0., 40., ...,  1., 20.,  1.],
       [ 0.,  0.,  4., ...,  0., 21.,  0.]], dtype=float32)

# Check is there any null vlaue or missing value

In [200]:
df.isna().count()

duration                       125972
protocol_type                  125972
service                        125972
flag                           125972
src_bytes                      125972
dst_bytes                      125972
land                           125972
wrong_fragment                 125972
urgent                         125972
hot                            125972
num_failed_logins              125972
logged_in                      125972
num_compromised                125972
root_shell                     125972
su_attempted                   125972
num_root                       125972
num_file_creations             125972
num_shells                     125972
num_access_files               125972
num_outbound_cmds              125972
is_host_login                  125972
is_guest_login                 125972
count                          125972
srv_count                      125972
serror_rate                    125972
srv_serror_rate                125972
rerror_rate 

In [29]:
df.columns

Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'labels', 'difficulty', 'targets'],
      dtype='object')

In [31]:
scaler = StandardScaler()
before_scale = df.drop(['labels'], axis=1)
scaled_df = scaler.fit_transform(before_scale)
scaled_df

array([[-0.11024967,  1.32579886, -0.07506482, ..., -0.37638873,
        -1.96554725, -0.82838537],
       [-0.11024967, -0.44346853, -0.49565219, ..., -0.37638873,
        -0.21996766,  0.61232263],
       [-0.11024967, -0.44346853, -0.57976966, ..., -0.34508583,
         0.65282213, -0.82838537],
       ...,
       [-0.11024967, -0.44346853, -0.32741724, ..., -0.37638873,
        -0.65636256, -0.82838537],
       [-0.11024967, -0.44346853,  2.78492928, ..., -0.37638873,
         0.21642723,  0.61232263],
       [-0.11024967, -0.44346853, -0.24329977, ..., -0.37638873,
         0.65282213, -0.82838537]])

In [202]:
train_data = df.drop(['labels', 'difficulty', 'targets'], axis=1)
target_data = df['targets']

In [203]:
train_data.shape, target_data.shape

((125972, 41), (125972,))

In [204]:
train_data_x, train_data_y = train_data[:int(len(train_data)*0.8)], target_data[:int(len(target_data)*0.8)]
val_data_x, val_data_y = train_data[int(len(train_data)*0.8):], target_data[int(len(target_data)*0.8):]

In [205]:
train_data_x.shape, val_data_x.shape

((100777, 41), (25195, 41))

In [206]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the data to tensor form

In [242]:
train_data = TensorDataset(torch.from_numpy(train_data_x.to_numpy()).to(torch.float32), torch.from_numpy(train_data_y.to_numpy()))
val_data = TensorDataset(torch.from_numpy(val_data_x.to_numpy()), torch.from_numpy(val_data_y.to_numpy()))

TypeError: __init__() got an unexpected keyword argument 'requires_grad'

# Convert into dataloader

In [232]:
batch_size = 50

In [233]:
train_data_loader = DataLoader(train_data, batch_size=batch_size)
valid_data_loader = DataLoader(val_data, batch_size=batch_size)

In [234]:
dataiter = iter(train_data_loader)
sample_x, sample_y = dataiter.next()
print(sample_x.shape)
print(sample_y.shape)
print(sample_x)

torch.Size([50, 41])
torch.Size([50])
tensor([[0.0000, 1.0000, 6.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0100, 0.0000, 0.0100],
        ...,
        [0.0000, 0.0000, 4.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 4.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])


In [235]:
sample_y

tensor([0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 0, 3, 0, 0, 1, 1, 0, 0, 1,
        0, 1, 0, 0, 0, 3, 1, 0, 4, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 5, 0, 2,
        1, 0])

# import necessary library for model development

In [236]:
import torch.nn as nn
from torch.autograd import Variable

In [237]:
class simple_rnn(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(simple_rnn, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        
        h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size))
        
        x = x.unsqueeze(1)
        
        output, hn = self.rnn(x, h0)
        
        output = self.fc(output[:, -1, :])
        
        return output

In [238]:
input_size = 41
hidden_size = 64
num_layers = 2
output_size = 23
model = simple_rnn(input_size, hidden_size, num_layers, output_size)

In [239]:
model

simple_rnn(
  (rnn): RNN(41, 64, num_layers=2, batch_first=True)
  (fc): Linear(in_features=64, out_features=23, bias=True)
)

In [240]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr= 0.01)

In [245]:

for epoch in range(5):
    model.train()
    valid_acc = []
    train_loss = []
    for i, (X_label, y_label) in enumerate(train_data_loader):
        optimizer.zero_grad()
        r_out = model(X_label.float())
        loss = criterion(r_out, y_label)
        loss.backward()
        optimizer.step()
        train_loss.append(loss.item())
        
        
        
        if i==2014:
#             print('loss: ', loss.item())
            break
            
    print('EPOCH', epoch, 'Training LOSS: ', np.mean(train_loss))
    
    model.eval()
    for i, (X_label, y_label) in enumerate(valid_data_loader):
        
        r_out = model(X_label.float())
        pred_y = torch.max(r_out, 1)[1]
#         print(pred_y)
#         print(y_label)
        ac = sum(pred_y == y_label)
#         print(ac)
        valid_acc.append(ac)
        if i==503:
            break
    print('Valid accuracy: ', np.sum(valid_acc)/float(25150)*100)

EPOCH 0 Training LOSS:  0.24095662978498575
Valid accuracy:  92.8548707753479
EPOCH 1 Training LOSS:  0.2512122353543493
Valid accuracy:  93.85288270377734
EPOCH 2 Training LOSS:  0.22592297290491112
Valid accuracy:  93.46322067594434
EPOCH 3 Training LOSS:  0.24067920008974691
Valid accuracy:  91.60636182902584
EPOCH 4 Training LOSS:  0.23839312037845922
Valid accuracy:  93.48707753479125
