# Data Preprocessing
In this notebook, we preprocess the data. The following conventions are followed:
1. An example labeled as normal will be labeled as 1.
2. An example labeled as attack will be labeled as 0.
3. Columns that contain categorical data will be split into multiple columns using one-hot encoding.

In [1]:
## INCLUDING LIBRARIES
import pandas as pd
import time
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# LOADING THE DATASET
# Both CSV files are read from the current working directory.
dataset1 = pd.read_csv('attack.csv')
dataset2 = pd.read_csv('normal.csv')
# To experiment on a subset, slice each frame before concatenating,
# e.g. dataset1.iloc[:150000, :] and dataset2.iloc[:150000, :].
dataset = pd.concat([dataset1, dataset2], ignore_index=True)  # concatenating the two datasets
# Shuffle the rows so the two sources are interleaved. The fixed random_state
# makes the row order (and everything derived from it) reproducible on re-run.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
dataset.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,duration,protocol,Plength,flag,Mlength,HoP,LifeTime,MsgType,DSN,Sno,...,land,Tmode,Neighbors,Hflow,AvgFlow,Lflow,AvgHopCount,failedConnection,Failed Rate,Label
0,0.00706,AODV,76,0,20,-1,-1,Route Error,0,3,...,2,1,10,6142,1072.4,2,0.237377,3101,62.91337,normal
1,0.000999,ICMP,92,-1,28,-1,-1,-1,-1,10,...,2,0,12,7389,1722.5,303,0.22392,4209,58.466454,attack
2,0.001714,AODV,76,0,20,-1,-1,Route Error,0,4,...,2,1,14,7759,1416.428571,4,0.186595,4556,69.324407,normal
3,0.000931,AODV,76,0,20,-1,-1,Route Error,0,3,...,2,1,20,8145,1483.4,27,0.197428,5624,63.049327,normal
4,0.00015,AODV,84,0,28,0,2000,Route Reply,0,7,...,0,1,14,7758,1414.5,1,0.198336,4515,68.357305,normal


## Removing unwanted or unnecessary features

In [3]:
# Discard the columns considered unnecessary for the rest of the notebook.
# Not idempotent: re-running this cell after the columns are gone raises KeyError.
dataset = dataset.drop(columns=['flag', 'DSN', 'LifeTime'])

### Label encoding of Categorical data

In [4]:
# Integer-encode the protocol names and keep the code -> name lookup for display.
le = LabelEncoder()
protocol_labels = le.fit(dataset['protocol']).transform(dataset['protocol'])
# enumerate() pairs each integer code with the class name stored at that position.
protocol_mappings = dict(enumerate(le.classes_))
protocol_mappings

{0: 'AODV', 1: 'ICMP', 2: 'UDP'}

In [5]:
# Replace the protocol names with their integer codes (column position is kept).
dataset = dataset.assign(protocol=protocol_labels)

In [6]:
# One-hot encode the integer protocol codes: one indicator column per protocol.
pro_ohe = OneHotEncoder()
pro_encoded = pro_ohe.fit_transform(dataset[['protocol']])
pro_feature_arr = pro_encoded.toarray()  # densify so it can back a DataFrame
# Column names come from `le`, which at this point must still be the encoder
# that was fitted on the 'protocol' column.
pro_feature_labels = list(le.classes_)
pro_features = pd.DataFrame(data=pro_feature_arr, columns=pro_feature_labels)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
# Append the protocol indicator columns side by side (rows are matched on the index).
dataset = pd.concat(objs=[dataset, pro_features], axis='columns')

In [8]:
# The integer-coded 'protocol' column is superseded by the indicator columns.
dataset = dataset.drop(columns='protocol')
dataset.head()

Unnamed: 0,duration,Plength,Mlength,HoP,MsgType,Sno,Sindex,land,Tmode,Neighbors,Hflow,AvgFlow,Lflow,AvgHopCount,failedConnection,Failed Rate,Label,AODV,ICMP,UDP
0,0.00706,76,20,-1,Route Error,3,1213,2,1,10,6142,1072.4,2,0.237377,3101,62.91337,normal,1.0,0.0,0.0
1,0.000999,92,28,-1,-1,10,1125,2,0,12,7389,1722.5,303,0.22392,4209,58.466454,attack,0.0,1.0,0.0
2,0.001714,76,20,-1,Route Error,4,683,2,1,14,7759,1416.428571,4,0.186595,4556,69.324407,normal,1.0,0.0,0.0
3,0.000931,76,20,-1,Route Error,3,211,2,1,20,8145,1483.4,27,0.197428,5624,63.049327,normal,1.0,0.0,0.0
4,0.00015,84,28,0,Route Reply,7,838,0,1,14,7758,1414.5,1,0.198336,4515,68.357305,normal,1.0,0.0,0.0


In [9]:
# Integer-encode MsgType with a fresh encoder; this rebinds `le`, so the
# protocol encoder fitted earlier is no longer reachable through that name.
le = LabelEncoder()
msgtype_labels = le.fit(dataset['MsgType']).transform(dataset['MsgType'])
# Despite its name, `genre_mappings` holds the MsgType code -> name lookup.
genre_mappings = dict(enumerate(le.classes_))
genre_mappings

{0: '-1',
 1: 'Route Error',
 2: 'Route Reply',
 3: 'Route Reply Acknowledgment',
 4: 'Route Request'}

In [10]:
# Store the integer MsgType codes in a new 'labels' column.
# NOTE(review): this column is never dropped, so it is exported in X next to the
# one-hot MsgType columns - confirm that keeping this ordinal copy is intended.
dataset = dataset.assign(labels=msgtype_labels)

In [11]:
# One-hot encode the MsgType strings: one indicator column per message type.
msg_ohe = OneHotEncoder()
msg_feature_arr = msg_ohe.fit_transform(dataset[['MsgType']]).toarray()
# Take the column names from the encoder that produced the columns rather than
# from the separately fitted LabelEncoder `le`: the names then always match the
# column order, even if `le` has been refitted on something else in the meantime.
msg_feature_labels = list(msg_ohe.categories_[0])
msg_features = pd.DataFrame(msg_feature_arr,
                            columns=msg_feature_labels)

In [12]:
# Append the MsgType indicator columns side by side (rows are matched on the index).
dataset_transformed = pd.concat(objs=[dataset, msg_features], axis='columns')

In [13]:
# Preview the first rows after adding the MsgType indicator columns.
dataset_transformed.head(n=5)

Unnamed: 0,duration,Plength,Mlength,HoP,MsgType,Sno,Sindex,land,Tmode,Neighbors,...,Label,AODV,ICMP,UDP,labels,-1,Route Error,Route Reply,Route Reply Acknowledgment,Route Request
0,0.00706,76,20,-1,Route Error,3,1213,2,1,10,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
1,0.000999,92,28,-1,-1,10,1125,2,0,12,...,attack,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0
2,0.001714,76,20,-1,Route Error,4,683,2,1,14,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
3,0.000931,76,20,-1,Route Error,3,211,2,1,20,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,0.00015,84,28,0,Route Reply,7,838,0,1,14,...,normal,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0,0.0


In [14]:
# Give the "-1" indicator column a readable name, then drop the raw MsgType text.
# The spelling of the new column name is left untouched because it is written
# to X-data.csv further down.
dataset_transformed = (
    dataset_transformed
    .rename(columns={"-1": "Unkown MsgType"})
    .drop(columns=['MsgType'])
)

In [15]:
# Preview the first rows after the rename and the removal of 'MsgType'.
dataset_transformed.head(n=5)

Unnamed: 0,duration,Plength,Mlength,HoP,Sno,Sindex,land,Tmode,Neighbors,Hflow,...,Label,AODV,ICMP,UDP,labels,Unkown MsgType,Route Error,Route Reply,Route Reply Acknowledgment,Route Request
0,0.00706,76,20,-1,3,1213,2,1,10,6142,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
1,0.000999,92,28,-1,10,1125,2,0,12,7389,...,attack,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0
2,0.001714,76,20,-1,4,683,2,1,14,7759,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
3,0.000931,76,20,-1,3,211,2,1,20,8145,...,normal,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,0.00015,84,28,0,7,838,0,1,14,7758,...,normal,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0,0.0


In [16]:
# Split the frame into the target (Y) and the features (X), then persist both.
Y = dataset_transformed[['Label']].copy()
# Encode the target as integers. This refits the shared encoder `le`.
Y['Label'] = le.fit_transform(Y['Label'])
# The notebook's convention is attack -> 0 and normal -> 1. The encoder assigns
# codes by class position, so fail loudly if the classes are not what we expect
# instead of silently exporting labels with a different meaning.
assert list(le.classes_) == ['attack', 'normal'], (
    "unexpected label classes: {}".format(list(le.classes_))
)
X = dataset_transformed.drop(columns=['Label'])
# index=False keeps the row index out of the exported files.
X.to_csv('X-data.csv', index=False)
Y.to_csv('Y-data.csv', index=False)
X.head()

Unnamed: 0,duration,Plength,Mlength,HoP,Sno,Sindex,land,Tmode,Neighbors,Hflow,...,Failed Rate,AODV,ICMP,UDP,labels,Unkown MsgType,Route Error,Route Reply,Route Reply Acknowledgment,Route Request
0,0.00706,76,20,-1,3,1213,2,1,10,6142,...,62.91337,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
1,0.000999,92,28,-1,10,1125,2,0,12,7389,...,58.466454,0.0,1.0,0.0,0,1.0,0.0,0.0,0.0,0.0
2,0.001714,76,20,-1,4,683,2,1,14,7759,...,69.324407,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
3,0.000931,76,20,-1,3,211,2,1,20,8145,...,63.049327,1.0,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0
4,0.00015,84,28,0,7,838,0,1,14,7758,...,68.357305,1.0,0.0,0.0,2,0.0,0.0,1.0,0.0,0.0


In [17]:
# Plain NumPy arrays for scikit-learn. Y is a one-column DataFrame, so Y_ is 2-D.
X_, Y_ = X.values, Y.values

In [18]:
# Hold out 20% of the rows for testing. The fixed random_state makes the split
# reproducible, so results can be compared across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_, Y_, test_size=0.2, random_state=42
)

In [19]:
# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both splits so no test-set information enters the fit.
# Not idempotent: X_train / X_test are overwritten with their scaled versions.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [20]:
# Display the scaled training matrix as a sanity check.
X_train

array([[-0.13535943,  0.07382882,  0.06821165, ..., -0.47738449,
        -0.0538048 , -0.08951045],
       [-0.13280692,  0.07382882,  0.06821165, ..., -0.47738449,
        -0.0538048 , -0.08951045],
       [-0.11916594,  0.07382882, -0.17697658, ..., -0.47738449,
        -0.0538048 , -0.08951045],
       ...,
       [-0.13577279,  0.07382882,  0.06821165, ..., -0.47738449,
        -0.0538048 , -0.08951045],
       [-0.13578313,  0.07382882,  0.06821165, ..., -0.47738449,
        -0.0538048 , -0.08951045],
       [-0.13294126,  0.07382882,  0.06821165, ..., -0.47738449,
        -0.0538048 , -0.08951045]])