In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import random

In [2]:
columns = ["duration",
"protocol_type",
"service",
"flag",
"src_bytes",
"dst_bytes",
"land",
"wrong_fragment",
"urgent",
"hot",
"num_failed_logins",
"logged_in",
"num_compromised",
"root_shell",
"su_attempted",
"num_root",
"num_file_creations",
"num_shells",
"num_access_files",
"num_outbound_cmds",
"is_host_login",
"is_guest_login",
"count",
"srv_count",
"serror_rate",
"srv_serror_rate",
"rerror_rate",
"srv_rerror_rate",
"same_srv_rate",
"diff_srv_rate",
"srv_diff_host_rate",
"dst_host_count",
"dst_host_srv_count",
"dst_host_same_srv_rate",
"dst_host_diff_srv_rate",
"dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate",
"dst_host_srv_serror_rate",
"dst_host_rerror_rate",
"dst_host_srv_rerror_rate",
"attack_type"]

In [3]:
attack_category = {'back': 'dos', 'buffer_overflow': 'u2r', 'ftp_write': 'r2l', 'guess_passwd': 'r2l', 
                   'imap': 'r2l', 'ipsweep': 'probe', 'land': 'dos', 'loadmodule': 'u2r', 'multihop': 'r2l', 
                   'neptune': 'dos', 'nmap': 'probe', 'perl': 'u2r', 'phf': 'r2l', 'pod': 'dos', 'portsweep': 'probe', 
                   'rootkit': 'u2r', 'satan': 'probe', 'smurf': 'dos', 'spy': 'r2l', 'teardrop': 'dos', 
                   'warezclient': 'r2l', 'warezmaster': 'r2l', 'normal': 'normal'}

In [4]:
train_data_uncleaned = pd.read_csv("kddcup.data_10_percent.gz",names=columns)
test_data_uncleaned = pd.read_csv("corrected.gz",names=columns)

In [5]:
train_data_uncleaned.attack_type.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: attack_type, dtype: int64

In [6]:
def clean_data(data,attack_category,isTest):
    
    df = data
    # Since target column has a period at the end we remove it  
    df['attack_type'] = df['attack_type'].apply(lambda x:x[:-1])
    
    # Test data has some extra attacks not provided in training ,remove the extras
    if isTest:
        df = df.loc[df.attack_type.isin(list(attack_category.keys()))]
        
    # add attack_type column
    df['attack_category'] = df['attack_type'].apply(lambda x:attack_category[x])
    
    # Convert categorical data to numeric and type cast them int64
    for col in ['protocol_type','service','flag','attack_category']:
        df[col] = pd.Categorical(df[col])
        
        
    print(dict( enumerate(df['attack_category'].cat.categories ) ))
    
    df['flag'] = df.flag.cat.codes
    df['protocol_type'] = df.protocol_type.cat.codes
    df['service'] = df.service.cat.codes
    df['attack_category'] = df.attack_category.cat.codes
    
    
    
    df['service'] = df['service'].astype(np.int64)
    df['flag'] = df['flag'].astype(np.int64)
    df['protocol_type'] = df['protocol_type'].astype(np.int64)
    df['attack_category'] = df['attack_category'].astype(np.int64)
    
    # Filter data to remove attack types with less then 20 rows
    if not isTest:
        df = df.groupby('attack_type').filter(lambda x : len(x)>21)
        
    # Delete the corresponsding attack types in test data
    delete_values = ['land','ftp_write','imap','multihop','phf','spy','warezmaster','loadmodule','rootkit','perl']
    if(isTest):
        df = df.loc[~df.attack_type.isin(delete_values)]
    
    return df

In [7]:
data = clean_data(train_data_uncleaned, attack_category, False)
test_data = clean_data(test_data_uncleaned,attack_category,True)

{0: 'dos', 1: 'normal', 2: 'probe', 3: 'r2l', 4: 'u2r'}
{0: 'dos', 1: 'normal', 2: 'probe', 3: 'r2l', 4: 'u2r'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

In [8]:
data.groupby(["attack_category","attack_type" ]).size()

attack_category  attack_type    
0                back                 2203
                 neptune            107201
                 pod                   264
                 smurf              280790
                 teardrop              979
1                normal              97278
2                ipsweep              1247
                 nmap                  231
                 portsweep            1040
                 satan                1589
3                guess_passwd           53
                 warezclient          1020
4                buffer_overflow        30
dtype: int64

In [9]:
data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,attack_category
0,0,1,22,9,181,5450,0,0,0,0,...,1.0,0.0,0.11,0.00,0.00,0.00,0.0,0.0,normal,1
1,0,1,22,9,239,486,0,0,0,0,...,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal,1
2,0,1,22,9,235,1337,0,0,0,0,...,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal,1
3,0,1,22,9,219,1337,0,0,0,0,...,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal,1
4,0,1,22,9,217,2032,0,0,0,0,...,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494016,0,1,22,9,310,1881,0,0,0,0,...,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal,1
494017,0,1,22,9,282,2286,0,0,0,0,...,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal,1
494018,0,1,22,9,203,1200,0,0,0,0,...,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal,1
494019,0,1,22,9,291,1200,0,0,0,0,...,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal,1


In [10]:
data.shape

(493925, 43)

In [11]:
df = data.drop(['attack_type',], axis=1)

In [12]:
df.attack_category.value_counts()

0    391437
1     97278
2      4107
3      1073
4        30
Name: attack_category, dtype: int64

In [13]:
df.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_category
count,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,...,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0
mean,47.980465,0.467022,23.408181,7.842397,3026.167,705.2202,2e-06,0.006434,6e-06,0.034392,...,188.698887,0.753776,0.030902,0.60192,0.006656,0.176735,0.176434,0.058125,0.057422,0.220339
std,707.813437,0.575603,13.538028,2.250941,988314.1,16738.92,0.001423,0.134818,0.004269,0.781379,...,106.021683,0.410784,0.109241,0.481316,0.04187,0.380585,0.380924,0.230607,0.230161,0.449648
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,14.0,9.0,45.0,0.0,0.0,0.0,0.0,0.0,...,46.0,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,14.0,9.0,520.0,0.0,0.0,0.0,0.0,0.0,...,255.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,42.0,9.0,1032.0,0.0,0.0,0.0,0.0,0.0,...,255.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,58329.0,2.0,65.0,10.0,693375600.0,5134218.0,1.0,3.0,3.0,30.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0


In [14]:
from sklearn.preprocessing import MinMaxScaler

In [21]:
y_train = df[['attack_category']].values.ravel()
x_train = df.drop(['attack_category',], axis=1)

sc = MinMaxScaler()
x_train = sc.fit_transform(x_train)

In [29]:
df_train = pd.DataFrame(data = x_train, columns = columns[0:41])

In [31]:
df_train.describe()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
count,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,...,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0,493925.0
mean,0.000823,0.233511,0.360126,0.78424,4.364397e-06,0.000137,2e-06,0.002145,2e-06,0.001146,...,0.911784,0.739996,0.753776,0.030902,0.60192,0.006656,0.176735,0.176434,0.058125,0.057422
std,0.012135,0.287801,0.208277,0.225094,0.001425366,0.00326,0.001423,0.044939,0.001423,0.026046,...,0.253687,0.415771,0.410784,0.109241,0.481316,0.04187,0.380585,0.380924,0.230607,0.230161
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.215385,0.9,6.489989e-08,0.0,0.0,0.0,0.0,0.0,...,1.0,0.180392,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.215385,0.9,7.499542e-07,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.5,0.646154,0.9,1.488371e-06,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.04,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [108]:
population = []
populationSize = 100

for current in range(0,populationSize):
    chromosome = []
#     for col in df_train.columns:
#         r = random.randint(df_train[col].min(),df_train[col].max()+1)
#         #print(col,df[col].min(),df[col].max(),r)
#         chromosome.append(r)
    chromosome = np.random.rand(42)
    chromosome[-1] = random.randint(0,5)
    population.append(chromosome)
    #print(chromosome)
print(population)

[array([0.95243104, 0.47270005, 0.64194249, 0.53236427, 0.58274878,
       0.61064462, 0.08506484, 0.72162793, 0.33566885, 0.23872851,
       0.68700896, 0.17957024, 0.40762139, 0.31145545, 0.32121766,
       0.05441248, 0.01194316, 0.258839  , 0.05653221, 0.70428621,
       0.18638147, 0.90360981, 0.14720032, 0.74985305, 0.35047585,
       0.32219814, 0.48877191, 0.0979561 , 0.62614924, 0.88407443,
       0.42855228, 0.43458334, 0.31528153, 0.56154853, 0.94343169,
       0.61841371, 0.73116185, 0.13238624, 0.14093993, 0.49857725,
       0.11115869, 0.        ]), array([0.53617472, 0.72168015, 0.43189952, 0.31786018, 0.95455423,
       0.18283441, 0.97139304, 0.87249141, 0.81496122, 0.03707577,
       0.11308833, 0.7407536 , 0.3105892 , 0.73938535, 0.94072475,
       0.89329845, 0.85168341, 0.48644112, 0.46749384, 0.86718868,
       0.83101055, 0.06851108, 0.7078073 , 0.0803828 , 0.14195898,
       0.41776357, 0.658406  , 0.89153495, 0.95154392, 0.00995603,
       0.01757072, 0.5928603

In [114]:
def fitness(chromosome):
    A = df.loc[df["attack_category"] != 1].shape[0]
    B = df.loc[df["attack_category"] == 1].shape[0]
    
    a = 0
    b = 0
    for index,rows in df_train.iterrows(): 
        total = 0
        k = 0
#         if(index > 500):
#             break
        for col in df_train.columns:
            total = total + abs(rows[col] - chromosome[k])
            k += 1
        if total < 16:
            if y_train[index] != 1 and chromosome[-1] != 1:
                a += 1
            if y_train[index] == 1 and chromosome[-1] != 1:
                b += 1
    
    F = (a/A) - (b/B)
    return F

In [160]:
def crossOver(parentOne, parentTwo):
    
    for i in range(len(parentOne)):
        left = parentOne[i]
        right = parentTwo[i]
        
        r = random.randint(0,len(left)-1)
        left[r],right[r] = right[r],left[r]
        
    parentOne.extend(parentTwo)
    return parentOne

In [157]:
def mutate(chromosome):
    r = np.random.rand()
    #if random number is greater than 0.6, change any random gene
    if r > 0.6:
        #dividing by 2
        gene = random.randint(0,len(chromosome)-1)
        chromosome[gene] /= 2
        
    return chromosome

In [163]:
maxGenerations = 30
currentGen = 1

while currentGen < maxGenerations:
    print("Generation: ", currentGen)
    
    fitnessVals = []
    
    #calculating fitness
    for chromosome in population:
        fitnessVals.append([fitness(chromosome),chromosome])
    
    #Selecting top 10 fit chromosomes
    fitnessVals.sort(reverse=True, key = lambda x:x[0])
    topFitness = fitnessVals[0:10]
    
    print("Top Fitness: ", topFitness[0])
    topChromosomes = []
    
    for top in topFitness:
        topChromosomes.append(top[1])
    
    nextGen = []
    #Cloning 5 times
    for i in range(5):
        nextGen.extend(topChromosomes)
  
    parentOne = nextGen[0:25]
    parentTwo = nextGen[25:]
    
    #Crossover
    nextGen = crossOver(parentOne, parentTwo)
    
    #Mutation
    for chromosome in nextGen:
        chromosome = mutate(chromosome)
        
        
    currentGen += 1
    
    

Generation:  1
Top Fitness:  [0.0, array([0.53617472, 0.20342589, 0.87528456, 0.31786018, 0.95455423,
       0.69610281, 0.97139304, 0.87249141, 0.23151353, 0.28528311,
       0.89542833, 0.7407536 , 0.1552946 , 0.19694387, 0.83539719,
       0.89329845, 0.25242684, 0.48644112, 0.46749384, 0.78324287,
       0.83101055, 0.06851108, 0.33834944, 0.0803828 , 0.14195898,
       0.41776357, 0.658406  , 0.89153495, 0.95154392, 0.00995603,
       0.01757072, 0.58457668, 0.15472328, 0.26641834, 0.67148043,
       0.04036491, 0.24010132, 0.25262353, 0.40572188, 0.95707767,
       0.83193593, 3.        ])]
Generation:  2
Top Fitness:  [0.0, array([0.53617472, 0.20342589, 0.87528456, 0.31786018, 0.95455423,
       0.69610281, 0.97139304, 0.44390338, 0.23151353, 0.28528311,
       0.89542833, 0.7407536 , 0.1552946 , 0.19694387, 0.83539719,
       0.89329845, 0.25242684, 0.48644112, 0.62917606, 0.78324287,
       0.83101055, 0.79241612, 0.33834944, 0.0803828 , 0.14195898,
       0.41776357, 0.65840

Top Fitness:  [0.0, array([0.75921114, 0.05085647, 0.66947933, 0.31786018, 0.79676505,
       0.0970927 , 0.11493166, 0.44390338, 0.95116003, 0.44036681,
       0.89542833, 0.7407536 , 0.1552946 , 0.09847193, 0.26571561,
       0.75234124, 0.23060517, 0.18230154, 0.62917606, 0.22746157,
       0.41550527, 0.79241612, 0.9173111 , 0.89865452, 0.63202221,
       0.72624597, 0.65646254, 0.89153495, 0.42437303, 0.33685563,
       0.06689963, 0.21165937, 0.23818458, 0.26641834, 0.422568  ,
       0.04036491, 0.31450349, 0.99704881, 0.48894758, 0.95707767,
       0.73651456, 3.        ])]
Generation:  15
Top Fitness:  [0.0, array([0.75921114, 0.05085647, 0.66947933, 0.9671992 , 0.79676505,
       0.0970927 , 0.11493166, 0.44390338, 0.95116003, 0.44036681,
       0.89542833, 0.7407536 , 0.1552946 , 0.08432135, 0.46810962,
       0.75234124, 0.23060517, 0.18230154, 0.62917606, 0.11373079,
       0.41550527, 0.79241612, 0.9173111 , 0.89865452, 0.63202221,
       0.72624597, 0.65646254, 0.4457674

Top Fitness:  [0.0, array([0.1106021 , 0.78320508, 0.55159505, 0.13309107, 0.04441441,
       0.4269073 , 0.97139304, 0.385517  , 0.53271596, 0.74294236,
       0.16805617, 0.121801  , 0.07520461, 0.55614304, 0.42108668,
       0.96253732, 0.15245223, 0.0067245 , 0.62917606, 0.40174447,
       0.61700962, 0.16213677, 0.1950466 , 0.32686352, 0.07097949,
       0.13087057, 0.51064144, 0.42261385, 0.42437303, 0.34157827,
       0.06689963, 0.36575991, 0.21105818, 0.17008843, 0.30899813,
       0.61841371, 0.164744  , 0.07542249, 0.21567428, 0.00867844,
       0.20798398, 1.        ])]
Generation:  29
Top Fitness:  [0.0, array([0.06500631, 0.02787694, 0.87528456, 0.13839397, 0.4393429 ,
       0.52059883, 0.28169027, 0.43624571, 0.07029701, 0.03707577,
       0.61279365, 0.46897005, 0.40762139, 0.23439088, 0.17747859,
       0.0280184 , 0.42299987, 0.12161028, 0.27065121, 0.34966796,
       0.30102621, 0.65877913, 0.7078073 , 0.2980134 , 0.77593729,
       0.75975535, 0.45125487, 0.7764172