# SWARM PROJECT

## Network Intrusion Detection using Genetic Algorithm
### Team Members
Tanmaya Yadav - 2K18/CO/368
Tanuj Chandolia - 2K18/CO/369
Vagish Shanker Yagnik - 2K18/CO/381

In [1]:
#common imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Cleaning the Dataset

In [2]:
columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack_type'
]

In [3]:
attack_category = {'back': 'dos', 'buffer_overflow': 'u2r', 'ftp_write': 'r2l', 'guess_passwd': 'r2l', 'imap': 'r2l', 'ipsweep': 'probe', 'land': 'dos', 'loadmodule': 'u2r', 'multihop': 'r2l', 'neptune': 'dos', 'nmap': 'probe', 'perl': 'u2r', 'phf': 'r2l', 'pod': 'dos', 'portsweep': 'probe', 'rootkit': 'u2r', 'satan': 'probe', 'smurf': 'dos', 'spy': 'r2l', 'teardrop': 'dos', 'warezclient': 'r2l', 'warezmaster': 'r2l', 'normal': 'normal'}

In [4]:
# Reading the dataset
path = "kddcup.data_10_percent_corrected"
test_path = "corrected"
train_data_uncleaned = pd.read_csv(path,names=columns)
test_data_uncleaned = pd.read_csv(test_path,names=columns)

In [5]:
def clean_data(data,attack_category,isTest):
    
    df = data
    # Since target column has a period at the end we remove it  
    df['attack_type'] = df['attack_type'].apply(lambda x:x[:-1])
    
    # Test data has some extra attacks not provided in training ,remove the extras
    if isTest:
        df = df.loc[df.attack_type.isin(list(attack_category.keys()))]
        
    # add attack_type column
    df['attack_category'] = df['attack_type'].apply(lambda x:attack_category[x])
    
    # Convert categorical data to numeric and type cast them int64
    for col in ['protocol_type','service','flag']:
        df[col] = pd.Categorical(df[col])
        
    df['flag'] = df.flag.cat.codes
    df['protocol_type'] = df.protocol_type.cat.codes
    df['service'] = df.service.cat.codes
    
    df['service'] = df['service'].astype(np.int64)
    df['flag'] = df['flag'].astype(np.int64)
    df['protocol_type'] = df['protocol_type'].astype(np.int64)
    
    # Filter data to remove attack types with less then 20 rows
    if not isTest:
        df = df.groupby('attack_type').filter(lambda x : len(x)>21)
        
    # Delete the corresponsding attack types in test data
    delete_values = ['land','ftp_write','imap','multihop','phf','spy','warezmaster','loadmodule','rootkit','perl']
    if(isTest):
        df = df.loc[~df.attack_type.isin(delete_values)]
    
    return df

In [None]:
data = clean_data(train_data_uncleaned, attack_category, False)
test_data = clean_data(test_data_uncleaned,attack_category,True)


data.to_excel('data.xlsx')
test_data.to_excel('test_data.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentatio

In [None]:
data = data[data.attack_category != 'r2l']
data = data[data.attack_category != 'probe']
data = data[data.attack_category != 'u2r']

#Getting some statistics on the attack_type and the categories number
data.groupby(["attack_category","attack_type" ]).size()

In [101]:
test_data = test_data[test_data.attack_category != 'r2l']
test_data = test_data[test_data.attack_category != 'probe']
test_data = test_data[test_data.attack_category != 'u2r']

#Getting some statistics on the attack_type and the categories number
test_data.groupby(["attack_category","attack_type" ]).size()

attack_category  attack_type
dos              back             1098
                 neptune         58001
                 pod                87
                 smurf          164091
                 teardrop           12
normal           normal          60593
dtype: int64

In [102]:
attack_category_map = {
    'dos': 1,
    'normal': 2,
}
attack_type_map = {
    'back': 1,
    'neptune': 2, 
    'pod': 3,
    'smurf': 4,
    'teardrop': 5,
    'normal': 6
}

In [103]:
data['attack_category'] = data['attack_category'].map(attack_category_map)
data['attack_type'] = data['attack_type'].map(attack_type_map)

# Delete coulmns with all 0 value
data.drop('num_outbound_cmds',axis='columns', inplace=True)
data.drop('is_host_login',axis='columns', inplace=True)

# Normalize data
data = (data-data.mean())/data.std()

# Shuffle the training_dataset
data = data.sample(frac = 1)

data.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,attack_category
114686,-0.070404,0.932262,1.603505,-1.290138,-0.064555,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,-1.756202,0.613253,-1.250393,-0.195372,2.151401,2.152639,-0.241669,-0.239783,-1.487445,-0.498512
432327,-0.070404,-0.805475,-0.690568,0.510747,-0.031415,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,0.826847,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,0.04023,-0.498512
484982,-0.044284,0.932262,1.973516,0.510747,0.14818,-0.020076,-0.00143,-0.047979,-0.00143,-0.043685,...,-0.09162,0.613253,-1.188076,1.789616,-0.465863,-0.465137,-0.241669,-0.239783,1.567906,2.005964
57397,-0.070404,0.932262,1.603505,-1.290138,-0.064555,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,-1.780681,0.499199,-1.250393,-0.195372,2.151401,2.152639,-0.241669,-0.239783,-1.487445,-0.498512
36482,-0.070404,0.932262,-0.098549,0.510747,-0.044225,0.003873,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,-1.208848,0.201626,-0.465863,-0.465137,-0.241669,-0.239783,1.567906,2.005964
294178,-0.070404,-0.805475,-0.690568,0.510747,0.001216,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,0.826847,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,0.04023,-0.498512
299738,-0.070404,-0.805475,-0.690568,0.510747,0.001216,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,0.826847,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,0.04023,-0.498512
485101,-0.070404,2.669998,-0.912575,0.510747,-0.062516,-0.036656,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.185126,-1.250393,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,1.567906,2.005964
409891,-0.070404,-0.805475,-0.690568,0.510747,-0.031415,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,0.826847,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,0.04023,-0.498512
398052,-0.070404,-0.805475,-0.690568,0.510747,-0.031415,-0.042242,-0.00143,-0.047979,-0.00143,-0.043685,...,0.593796,-0.29918,0.826847,-0.195372,-0.465863,-0.465137,-0.241669,-0.239783,0.04023,-0.498512


In [104]:
test_data['attack_category'] = test_data['attack_category'].map(attack_category_map)
test_data['attack_type'] = test_data['attack_type'].map(attack_type_map)

# Delete coulmns with all 0 value
test_data.drop('num_outbound_cmds',axis='columns', inplace=True)
test_data.drop('is_host_login',axis='columns', inplace=True)

# Normalize data
test_data = (test_data-test_data.mean())/test_data.std()
    
# Shuffle the test_dataset
test_data = test_data.sample(frac = 1)

test_data.head(20)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type,attack_category
12752,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
109423,-0.009159,0.867961,1.640815,-2.312803,-0.053504,-0.047497,,-0.019759,,-0.03713,...,-1.928375,1.119334,-1.220134,-0.194018,-0.258141,-0.256819,2.450191,2.447617,-1.536055,-0.520927
270301,-0.009159,-0.79188,-0.687215,0.486107,-0.021332,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
168867,-0.009159,0.867961,1.640815,-2.312803,-0.053504,-0.047497,,-0.019759,,-0.03713,...,-2.030856,1.119334,-1.220134,-0.194018,-0.258141,-0.256819,2.450191,2.447617,-1.536055,-0.520927
48817,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
33631,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
178256,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
185861,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
221107,-0.009159,-0.79188,-0.687215,0.486107,0.010345,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927
274717,-0.009159,-0.79188,-0.687215,0.486107,-0.021332,-0.047497,,-0.019759,,-0.03713,...,0.531166,-0.406652,0.839681,-0.194018,-0.258141,-0.256819,-0.412137,-0.410389,-0.004895,-0.520927


In [105]:
data.shape, test_data.shape

((488715, 41), (283882, 41))

# Genetic Algorithm

In [106]:
# Maximum and Minimum Value
data_range = [data.min(), data.max()]
test_data_range = [test_data.min(), test_data.max()]

print(data_range)

[duration                      -0.070404
protocol_type                 -0.805475
service                       -1.726600
flag                          -3.541244
src_bytes                     -0.064555
dst_bytes                     -0.042242
land                          -0.001430
wrong_fragment                -0.047979
urgent                        -0.001430
hot                           -0.043685
num_failed_logins             -0.003973
logged_in                     -0.416148
num_compromised               -0.005555
root_shell                    -0.006860
su_attempted                  -0.004516
num_root                      -0.005524
num_file_creations            -0.010373
num_shells                    -0.009380
num_access_files              -0.027440
is_guest_login                -0.027563
count                         -1.576975
srv_count                     -1.204182
serror_rate                   -0.465614
srv_serror_rate               -0.465473
rerror_rate                   -0.237424

In [None]:
# 2. Class to carry out Genetic Algorithm

class GeneticAlgorithm {
    # default constructor
    def __init__(self):
        self.population = "GeekforGeeks"
}