In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

from enum import Enum

In [2]:
class DATA_TYPE(Enum):
    """Selector for which class of flows to keep when loading a file.

    Each member's value is the same text as its name.
    """

    BENIGN = 'BENIGN'        # keep rows whose label is 'BENIGN'
    MALICIOUS = 'MALICIOUS'  # keep rows whose label is anything else

# Directory and file name of the pickled, relabelled DataFrame.
PROCESSED_FILES_PATH = '../Data/Processed-CIC-IDS2017/'
UPDATED_LABEL_DF_FILE_NAME = 'updated_label_df.pkl'

# Full path used by the pickle save/load helpers.
file_path = f"{PROCESSED_FILES_PATH}{UPDATED_LABEL_DF_FILE_NAME}"

In [3]:
# Full list of column names, ending with the target column ' Label'.
# Many names start with a space; that space is part of the name (the filter
# helpers index the frame with ' Label'), so do not strip or normalise them.
# NOTE(review): presumably mirrors the header of the raw CSV files - confirm.
ALL_COLUMNS = [
    'Flow ID',
    ' Source IP',
    ' Source Port',
    ' Destination IP',
    ' Destination Port',
    ' Protocol',
    ' Timestamp',
    ' Flow Duration',
    ' Total Fwd Packets',
    ' Total Backward Packets',
    'Total Length of Fwd Packets',
    ' Total Length of Bwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Min',
    ' Fwd Packet Length Mean',
    ' Fwd Packet Length Std',
    'Bwd Packet Length Max',
    ' Bwd Packet Length Min',
    ' Bwd Packet Length Mean',
    ' Bwd Packet Length Std',
    'Flow Bytes/s',
    ' Flow Packets/s',
    ' Flow IAT Mean',
    ' Flow IAT Std',
    ' Flow IAT Max',
    ' Flow IAT Min',
    'Fwd IAT Total',
    ' Fwd IAT Mean',
    ' Fwd IAT Std',
    ' Fwd IAT Max',
    ' Fwd IAT Min',
    'Bwd IAT Total',
    ' Bwd IAT Mean',
    ' Bwd IAT Std',
    ' Bwd IAT Max',
    ' Bwd IAT Min',
    'Fwd PSH Flags',
    ' Bwd PSH Flags',
    ' Fwd URG Flags',
    ' Bwd URG Flags',
    ' Fwd Header Length',
    ' Bwd Header Length',
    'Fwd Packets/s',
    ' Bwd Packets/s',
    ' Min Packet Length',
    ' Max Packet Length',
    ' Packet Length Mean',
    ' Packet Length Std',
    ' Packet Length Variance',
    'FIN Flag Count',
    ' SYN Flag Count',
    ' RST Flag Count',
    ' PSH Flag Count',
    ' ACK Flag Count',
    ' URG Flag Count',
    ' CWE Flag Count',
    ' ECE Flag Count',
    ' Down/Up Ratio',
    ' Average Packet Size',
    ' Avg Fwd Segment Size',
    ' Avg Bwd Segment Size',
    ' Fwd Header Length.1',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
    'Subflow Fwd Packets',
    ' Subflow Fwd Bytes',
    ' Subflow Bwd Packets',
    ' Subflow Bwd Bytes',
    'Init_Win_bytes_forward',
    ' Init_Win_bytes_backward',
    ' act_data_pkt_fwd',
    ' min_seg_size_forward',
    'Active Mean',
    ' Active Std',
    ' Active Max',
    ' Active Min',
    'Idle Mean',
    ' Idle Std',
    ' Idle Max',
    ' Idle Min',
    ' Label',
]

# Columns removed by filter_labeled_df before any numeric processing.
# Leading spaces are part of the column names and must be kept.
DROP_COLUMNS = ['Unnamed: 0', 'Flow ID', ' Source IP', ' Destination IP', 'Flow Bytes/s']

In [4]:
# Directory holding the raw per-day CSV files.
FILES_PATH = '../Data/CIC-IDS2017/'

# One inner list per day, indexed 0 (Monday) to 4 (Friday); a day may span several files.
FILE_NAME_LIST = [
    # 0: Monday
    ['Monday-WorkingHours.pcap_ISCX.csv'],
    # 1: Tuesday
    ['Tuesday-WorkingHours.pcap_ISCX.csv'],
    # 2: Wednesday
    ['Wednesday-workingHours.pcap_ISCX.csv'],
    # 3: Thursday
    [
        'Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv',
        'Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv',
    ],
    # 4: Friday
    [
        'Friday-WorkingHours-Morning.pcap_ISCX.csv',
        'Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv',
        'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv',
    ],
]

In [5]:
def read_file(file_path: str):
    """Load one CSV file into a DataFrame with pandas' default parser settings.

    Args:
        file_path: Location of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        The parsed ``pd.DataFrame``.
    """
    frame = pd.read_csv(file_path)
    return frame

In [6]:
def filter_benign_data(file_df: pd.DataFrame):
    """Return only the rows whose ' Label' value equals 'BENIGN'.

    Args:
        file_df: Frame containing a ' Label' column (note the leading space).

    Returns:
        The matching rows, with their original index labels.
    """
    is_benign = file_df[' Label'].eq('BENIGN')
    return file_df.loc[is_benign]

def filter_malicious_data(file_df: pd.DataFrame):
    """Return every row whose ' Label' value is not 'BENIGN'.

    Rows with a missing label also compare unequal to 'BENIGN' and are
    therefore included in the result.

    Args:
        file_df: Frame containing a ' Label' column (note the leading space).

    Returns:
        The matching rows, with their original index labels.
    """
    not_benign = file_df[' Label'].ne('BENIGN')
    return file_df.loc[not_benign]

In [7]:
def get_processed_df(file_path: str, type: "DATA_TYPE | str | None" = None):
    """Read one CSV file and optionally keep only its benign or malicious rows.

    Args:
        file_path: Path of the CSV file to load.
        type: ``DATA_TYPE.BENIGN`` or ``DATA_TYPE.MALICIOUS``, or the matching
            string value ('BENIGN' / 'MALICIOUS'). ``None`` (the default)
            returns every row.

    Returns:
        A DataFrame with the rows of the file, filtered as requested.

    Raises:
        ValueError: If ``type`` is a string that is not a ``DATA_TYPE`` value.
    """
    # The parameter used to be annotated ``str`` while only being compared with
    # enum members, so passing 'BENIGN' silently skipped the filter. Convert
    # strings to the enum so both spellings select the same rows.
    if isinstance(type, str):
        type = DATA_TYPE(type)

    file_df = read_file(file_path)
    if type == DATA_TYPE.BENIGN:
        return filter_benign_data(file_df)
    if type == DATA_TYPE.MALICIOUS:
        return filter_malicious_data(file_df)
    return file_df

In [8]:
def get_file_by_day(day: int, type: str = None):
    """Load every CSV file recorded for one day and stack them row-wise.

    Args:
        day: Position in ``FILE_NAME_LIST`` (0 is the first entry).
        type: Optional row filter, forwarded unchanged to ``get_processed_df``.

    Returns:
        A single DataFrame holding the rows of all files of that day, in the
        order the files are listed.
    """
    # Collect the frames first and concatenate once; concatenating inside the
    # loop would re-copy the accumulated rows on every iteration.
    day_frames = [
        get_processed_df(FILES_PATH + file_name, type)
        for file_name in FILE_NAME_LIST[day]
    ]
    return pd.concat(day_frames, axis=0)

In [9]:
def get_complete_df():
    """Load all days listed in ``FILE_NAME_LIST`` into one DataFrame.

    Returns:
        The row-wise concatenation of every day's data, in list order.
        ``pd.concat`` is called without ``ignore_index``, so each file's own
        row labels are kept and index labels can repeat across files.
    """
    # Derive the day range from the list itself instead of a hard-coded 5, so
    # adding or removing a day cannot silently drop data or raise IndexError.
    daily_frames = [get_file_by_day(day) for day in range(len(FILE_NAME_LIST))]
    # Single concat: avoids re-copying the growing frame once per day.
    return pd.concat(daily_frames, axis=0)

In [10]:
def update_target_label(file_df: pd.DataFrame):
    """Return a copy of ``file_df`` with ' Label' encoded as 0 / 1.

    'BENIGN' becomes 0; every other value, including a missing label,
    becomes 1. The input frame is not modified.

    Args:
        file_df: Frame containing a ' Label' column (note the leading space).

    Returns:
        A new DataFrame with the same index and columns, where ' Label' holds
        integers.
    """
    df = file_df.copy()
    # Vectorised and positional on purpose. The previous row loop wrote through
    # ``df.at[index, ...]``; when index labels repeat (as they do after
    # concatenating several files) that assignment hits every row sharing the
    # label, so rows could end up with another row's class.
    df[' Label'] = file_df[' Label'].ne('BENIGN').astype(int).to_numpy()
    return df

In [11]:
def save_updated_labeled_data(file_df: pd.DataFrame):
    """Pickle ``file_df`` to the module-level ``file_path`` and print a confirmation.

    Args:
        file_df: The DataFrame to serialise.
    """
    with open(file_path, mode='wb') as sink:
        pickle.dump(file_df, sink)
        print(f"Successfully saved at: {file_path}")

In [None]:
### No need to re-run this cell once the data has been processed and saved as a pickle
# df = get_complete_df()
# df = update_target_label(df)
# save_updated_labeled_data(df)

In [12]:
def read_updated_labeled_data():
    """Load and return the object pickled at the module-level ``file_path``.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook (or another trusted source) produced.
    with open(file_path, mode='rb') as source:
        return pickle.load(source)

In [13]:
def filter_labeled_df(file_df: pd.DataFrame):
    """Return ``file_df`` without the columns named in ``DROP_COLUMNS``.

    Args:
        file_df: Frame that contains every column listed in ``DROP_COLUMNS``.

    Returns:
        A new DataFrame with those columns removed; the input is left as is.
    """
    trimmed = file_df.drop(columns=DROP_COLUMNS)
    return trimmed

In [14]:
# Load the cached, relabelled dataset and strip the columns listed in DROP_COLUMNS.
df = filter_labeled_df(read_updated_labeled_data())
df.shape

(2830743, 81)

In [15]:
# Show the remaining column names (header line first, then the list).
columns_names = list(df.columns)
print("Columns names:", columns_names, sep="\n")

Columns names:
[' Source Port', ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', 'FIN Flag Count', ' SYN Flag Count', ' RST Flag Count', ' PSH F

In [None]:
# Preview the first rows. ``head`` has to be called; the bare attribute only
# shows the bound method instead of a five-row table.
df.head()

In [16]:
# Cast every column to float in one call.
# NOTE(review): ' Timestamp' appears in the column listing printed earlier;
# confirm its values are convertible to float.
df = df.astype(float)

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Display the pairwise correlation matrix of all columns.
df.corr()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
correlation = df.corr()

# Explicit figure/axes instead of implicit pyplot state, so the heatmap and
# its title are guaranteed to land on the same axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='cubehelix', ax=ax)

# Title typo fixed ("fearures" -> "features"); trailing ';' hides the Text repr.
ax.set_title('Correlation between different features');

In [None]:
# Snapshot of the current column order as a plain list (edited in the next cells).
cols = list(df.columns)
cols

In [None]:
# Move ' Label' to the front of the column list.
cols.remove(' Label')
cols.insert(0, ' Label')

In [None]:
# Show the reordered column list (' Label' should now be first).
cols

In [None]:
# Apply the new column order to the frame.
df = df.reindex(cols, axis="columns")

In [None]:
# Preview the first rows after reordering. ``head`` has to be called; the bare
# attribute only shows the bound method.
df.head()

In [None]:
# Missing-value count per column, printed as "<count> <column name>".
a = df.isna().sum().tolist()
b = df.columns
# sum(a)

for missing, column in zip(a, b):
    print(missing, column)


In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values with each column's median.
imp = SimpleImputer(strategy="median")

# - SimpleImputer rejects infinite input, so +/-inf is treated as missing first.
# - fit_transform returns a bare NumPy array; rebuild the DataFrame so the
#   positional ``.iloc`` slicing in the following cell keeps working.
# NOTE(review): if a column is entirely missing the imputer may drop it, and the
# constructor below will then raise on the column-count mismatch - confirm that
# no such column exists.
df = pd.DataFrame(
    imp.fit_transform(df.replace([np.inf, -np.inf], np.nan)),
    columns=df.columns,
    index=df.index,
)

In [None]:
# Split by position: column 0 is the target, the remaining columns are features.
# Assumes ``df`` is a DataFrame whose first column is ' Label' (set by the
# earlier reordering cell).
y = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:].to_numpy()
print(X.shape)
print(y.shape)

In [None]:
# Display the target vector.
y

In [None]:
# Shape of the feature matrix.
X.shape

In [None]:
# Shape of the target vector.
y.shape

In [None]:
# True if at least one feature value is NaN.
np.isnan(X).any()

In [None]:
# True only if every feature value is finite (neither NaN nor +/-inf).
np.isfinite(X).all()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column: fit the scaler on X, then transform X with it.
X_std = StandardScaler().fit(X).transform(X)