<a href="https://colab.research.google.com/github/veleronie/NetworkFlowAnomalyDetection/blob/main/preprocessing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing

# The dataset folder is addressed relative to the current working directory.
path = os.getcwd()
path_to_datasets = r'/drive/MyDrive/AnomalyDetection/IDS2017/TrafficLabelling'

# Keep only the CSV entries of the directory listing; the result holds bare
# file names (no directory part), in whatever order os.listdir returns them.
all_files = os.listdir(path + path_to_datasets)
csv_files = [file_name for file_name in all_files if file_name.endswith('.csv')]
csv_files

['Monday-WorkingHours.pcap_ISCX.csv',
 'Wednesday-workingHours.pcap_ISCX.csv',
 'Tuesday-WorkingHours.pcap_ISCX.csv',
 'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
 'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
 'Friday-WorkingHours-Morning.pcap_ISCX.csv',
 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv']

The aim of this notebook is to clean the CSV files of errors: drop missing or meaningless values, handle specific character encodings, etc.

In [2]:
# Load one previously pickled frame to prototype the cleaning steps on, then
# preview its first rows.
# NOTE: pickle.load can run arbitrary code while deserialising, so it should
# only be pointed at files that come from a trusted source.
with open('/content/drive/MyDrive/AnomalyDetection/pickled_wednesday_data.pkl', 'rb') as pickle_handle:
    test_file = pickle.load(pickle_handle)
test_file.head()

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.14-209.48.71.168-49459-80-6,192.168.10.14,49459,209.48.71.168,80,6,5/7/2017 8:42,38308,1,1,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.3-192.168.10.17-389-49453-6,192.168.10.17,49453,192.168.10.3,389,6,5/7/2017 8:42,479,11,5,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.3-192.168.10.17-88-46124-6,192.168.10.17,46124,192.168.10.3,88,6,5/7/2017 8:42,1095,10,6,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.3-192.168.10.17-389-49454-6,192.168.10.17,49454,192.168.10.3,389,6,5/7/2017 8:42,15206,17,12,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.3-192.168.10.17-88-46126-6,192.168.10.17,46126,192.168.10.3,88,6,5/7/2017 8:42,1092,9,6,...,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [3]:
# Strip surrounding whitespace from the column names before using them as keys.
test_file.columns = test_file.columns.str.strip()
main_labels = set(test_file.columns)
print(main_labels)

# Collect the columns that contain at least one missing value.
# The list is derived from the frame's own column index rather than from the
# unordered set above, so its order is the same on every run and positional
# access such as labels_with_missing_values[0] is reproducible.
labels_with_missing_values = test_file.columns[test_file.isnull().any().to_numpy()].tolist()

{'Bwd Packet Length Max', 'Down/Up Ratio', 'Average Packet Size', 'Total Fwd Packets', 'Avg Bwd Segment Size', 'min_seg_size_forward', 'Label', 'Idle Min', 'Flow IAT Max', 'Fwd Header Length', 'Packet Length Mean', 'Source IP', 'Fwd URG Flags', 'Fwd Packets/s', 'Fwd Packet Length Min', 'Bwd URG Flags', 'Fwd Packet Length Max', 'Fwd IAT Min', 'Bwd Avg Bulk Rate', 'Active Std', 'Protocol', 'Idle Max', 'Flow Packets/s', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'Flow Duration', 'Min Packet Length', 'Bwd IAT Std', 'Bwd Packet Length Std', 'Avg Fwd Segment Size', 'Active Mean', 'Bwd IAT Min', 'Bwd IAT Mean', 'URG Flag Count', 'Fwd IAT Std', 'Bwd Packet Length Mean', 'Bwd IAT Max', 'Destination IP', 'Fwd PSH Flags', 'Idle Std', 'SYN Flag Count', 'ECE Flag Count', 'Bwd Packet Length Min', 'Fwd Avg Bytes/Bulk', 'Destination Port', 'FIN Flag Count', 'Total Length of Fwd Packets', 'Fwd Header Length.1', 'Packet Length Variance', 'Fwd Packet Length Std', 'Subflow Fwd Bytes', 'Flow Bytes/s', 

In [4]:
# Fill missing values with 0 in every column that has any, not just the first
# one in the list. The guard keeps the cell working when no column has gaps.
# The result is assigned back because fillna(..., inplace=True) on a column
# selected with test_file[...] operates on an intermediate object and is not
# guaranteed to update test_file itself.
if labels_with_missing_values:
  test_file[labels_with_missing_values] = test_file[labels_with_missing_values].fillna(0)

# Numeric columns are determined after filling, and reused by the later cells.
numeric_columns = test_file.select_dtypes(include=np.number).columns

In [5]:
# Replace +inf / -inf with the sentinel value -1 in all numeric columns.
# The replacement is assigned back to the frame: calling
# replace(..., inplace=True) on test_file[column] acts on an intermediate
# object and is not guaranteed to modify test_file.
if len(numeric_columns) > 0:
  test_file[numeric_columns] = test_file[numeric_columns].replace([np.inf, -np.inf], -1)

In [6]:
# Whatever is not numeric still needs handling; show the first five values of
# each such column to see what kind of data it holds.
non_numeric_columns = set(main_labels) - set(numeric_columns)
for column in non_numeric_columns:
    preview = test_file[column].head(5)
    print(preview)

0    192.168.10.14-209.48.71.168-49459-80-6
1    192.168.10.3-192.168.10.17-389-49453-6
2     192.168.10.3-192.168.10.17-88-46124-6
3    192.168.10.3-192.168.10.17-389-49454-6
4     192.168.10.3-192.168.10.17-88-46126-6
Name: Flow ID, dtype: object
0    BENIGN
1    BENIGN
2    BENIGN
3    BENIGN
4    BENIGN
Name: Label, dtype: object
0    5/7/2017 8:42
1    5/7/2017 8:42
2    5/7/2017 8:42
3    5/7/2017 8:42
4    5/7/2017 8:42
Name: Timestamp, dtype: object
0    192.168.10.14
1    192.168.10.17
2    192.168.10.17
3    192.168.10.17
4    192.168.10.17
Name: Source IP, dtype: object
0    209.48.71.168
1     192.168.10.3
2     192.168.10.3
3     192.168.10.3
4     192.168.10.3
Name: Destination IP, dtype: object


In [7]:
# Integer-encode every non-numeric column except the target column 'Label'.
labelencoder_X = preprocessing.LabelEncoder()
non_numeric_columns = set(main_labels).difference(set(numeric_columns))

# Show the distinct classes present in this file before touching anything.
print(test_file['Label'].unique())

# 'Label' is the target and is left as text.
non_numeric_columns.remove('Label')
for column in non_numeric_columns:
  series = labelencoder_X.fit_transform(test_file[column])
  # Store the encoded values back in the frame; previously the result was
  # overwritten on each iteration and never used.
  test_file[column] = series

['BENIGN' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye'
 'Heartbleed']




```
# Apply the cleaning steps prototyped above to every CSV file of the dataset.
# csv_files holds bare file names, so each one is joined back onto the
# directory it was listed from.
dataset_dir = path + path_to_datasets
for i in range(len(csv_files)):
  # NOTE(review): pass an explicit encoding= to read_csv if any file is not
  # UTF-8 - confirm against the data.
  df = pd.read_csv(os.path.join(dataset_dir, csv_files[i]))
  df.columns = df.columns.str.strip()
  main_features = set(df.columns)

  # Missing values become 0 in every column.
  df = df.fillna(0)

  # Infinite values in numeric columns become the sentinel -1. The whole
  # numeric block is handled at once, so no per-column check against a stale
  # loop variable is needed.
  numeric_index = df.select_dtypes(include=np.number).columns
  numeric_columns = set(numeric_index)
  if len(numeric_index) > 0:
    df[numeric_index] = df[numeric_index].replace([np.inf, -np.inf], -1)

  # Normalise the dash character in text columns and keep the result.
  # Casting to str first means cells that were filled with 0 above are kept
  # as '0' instead of being turned back into NaN by the .str accessor.
  non_numeric_columns = main_features.difference(numeric_columns)
  for column in non_numeric_columns:
    df[column] = df[column].astype(str).str.replace('–', '-', regex=False)

  # Integer-encode every text column except the target column 'Label',
  # using the encoder created for this file.
  labelencoder = preprocessing.LabelEncoder()
  non_numeric_columns.remove('Label')
  for column in non_numeric_columns:
    df[column] = labelencoder.fit_transform(df[column])

  # NOTE(review): df is rebound on every iteration and nothing is written out
  # here; persist the cleaned frame at this point if it is needed later.
```





```
# Pickle each CSV file as '<index>.pkl' in the current working directory.
for i in range(len(csv_files)):
  # csv_files holds bare file names; rebuild the full path before reading.
  csv_path = os.path.join(path + path_to_datasets, csv_files[i])
  # Read first, so a failed read does not leave an empty .pkl file behind.
  frame = pd.read_csv(csv_path)
  with open('{}.pkl'.format(i), 'wb') as f:
    # The file handle belongs to pickle.dump, not to read_csv.
    pickle.dump(frame, f)
    
```

