## Network Intrusion EDA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Attributes

### Load the data

In [None]:
#Read files
# All three CSVs are read with header=None, so no row is treated as a header and
# each frame starts out with integer column labels (0, 1, 2, ...).
# col_names holds the field names; its column 0 is used later to rename the
# feature columns of the train/test frames.
col_names = pd.read_csv('../../cyber-security-project/Field Names.csv', header=None)
train_feature_df = pd.read_csv('../../cyber-security-project/KDDTrain+.csv', header=None)
test_feature_df = pd.read_csv('../../cyber-security-project/KDDTest+.csv', header=None)

In [None]:
#Rename columns
# col_names[0] is a Series indexed 0..N-1, so it acts as an
# {integer label -> field name} mapping for both frames.
for _frame in (train_feature_df, test_feature_df):
    _frame.rename(columns=col_names[0], inplace=True)

In [None]:
#Rename column 41 and drop column 42
# Column 41 carries the attack name; column 42 is discarded
# (presumably the NSL-KDD difficulty score -- TODO confirm against the data docs).
for _frame in (train_feature_df, test_feature_df):
    _frame.rename(columns={41: 'attack_name'}, inplace=True)
    _frame.drop(columns=[42], inplace=True)

This dataset contains 4 main categories of attacks: DOS, R2L, U2R, Probing

In [None]:
#Create new labels for data exploration
# Attack names grouped by category; the flat {attack_name: category} lookup
# is derived from these groups, keeping the listed order.
_attacks_by_category = [
    ('normal', ['normal']),
    #DOS
    ('DoS', ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop',
             'mailbomb', 'apache2', 'processtable', 'udpstorm']),
    #Probing
    ('Probe', ['ipsweep', 'nmap', 'portsweep', 'satan', 'mscan', 'saint']),
    #R2L
    ('R2L', ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
             'warezclient', 'warezmaster', 'sendmail', 'named',
             'snmpgetattack', 'snmpguess', 'xlock', 'xsnoop', 'worm']),
    #U2R
    ('U2R', ['buffer_overflow', 'loadmodule', 'perl', 'rootkit',
             'httptunnel', 'ps', 'sqlattack', 'xterm']),
]

attack_dict = {}
for _category, _attack_names in _attacks_by_category:
    for _attack_name in _attack_names:
        attack_dict[_attack_name] = _category

def mal_identifier(labels):
    """Collapse a category label into a binary normal/malicious label.

    Returns 'normal' only when ``labels`` equals the string 'normal';
    every other value (any attack category, or a missing value such as
    NaN) is reported as 'malicious'.
    """
    return 'normal' if labels == 'normal' else 'malicious'
    
#def mal_identifier(labels):    
#    mal = ['DoS', 'Probe', 'R2L', 'U2R']
#    if labels in mal:
#        return 'malicious'
#    else:
#        return 'normal'

In [None]:
#Write the attack-name -> category dictionary to disk as JSON
import json

attack_dict_path = './data/attack_dictionary.json'
# The ./data directory must already exist; open() does not create it.
with open(attack_dict_path, 'w') as json_out:
    json.dump(attack_dict, json_out)

In [None]:
# Translate each attack name into its category via attack_dict.
# Names missing from attack_dict come out as NaN.
for _frame in (train_feature_df, test_feature_df):
    _frame['labels_5cat'] = _frame['attack_name'].map(attack_dict)

In [None]:
# Derive the binary normal/malicious label from the category label.
for _frame in (train_feature_df, test_feature_df):
    _frame['labels_2cat'] = _frame['labels_5cat'].apply(mal_identifier)

## Examine the data

In [None]:
train_feature_df.head(5)

### Use .info() to see length and dtypes


In [None]:
train_feature_df.info()

In [None]:
test_feature_df.info()

### Check for duplicates

In [None]:
train_feature_df.duplicated().sum()

In [None]:
test_feature_df.duplicated().sum()

### Identify numerical and categorical variables

In [None]:
train_feature_df.columns

In [None]:
# String-valued features
categorical_cols = [
    'protocol_type',
    'service',
    'flag',
]

# Features treated as 0/1 indicators
binary_cols = [
    'land',
    'logged_in',
    'root_shell',
    'su_attempted',
    'is_host_login',
    'is_guest_login',
]

# Remaining features, treated as numeric. The order matters: later cells
# index correlation results by position in this list.
numeric_cols = [
    # per-connection basics
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent',
    # content-related counters
    'hot', 'num_failed_logins', 'num_compromised',
    'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds',
    # connection counts and rates
    'count', 'srv_count',
    'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # destination-host counts and rates
    'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

### Summarize categorical, numerical, and binary variables

In [None]:
train_feature_df[categorical_cols].describe()

In [None]:
train_feature_df[numeric_cols].describe().round(2).transpose()

``num_outbound_cmds`` contains only 0 values, so it carries no information and will be dropped as redundant.

In [None]:
pd.crosstab(train_feature_df.num_outbound_cmds, train_feature_df.labels_2cat)

In [None]:
#Drop num_outbound_cmds from train and test
for _frame in (train_feature_df, test_feature_df):
    _frame.drop(columns='num_outbound_cmds', inplace=True)
# Keep the numeric column list in sync with the frames.
numeric_cols.remove('num_outbound_cmds')

In [None]:
train_feature_df[binary_cols].describe().round(2)

su_attempted shows **3** distinct values [0,1,2] but it should be a binary feature [0,1]

**note for su_attempted**: ``1 if su root command attempted or used; 0 otherwise``


In [None]:
pd.crosstab(train_feature_df.su_attempted, train_feature_df.labels_2cat)

In [None]:
pd.crosstab(test_feature_df.su_attempted, test_feature_df.labels_2cat)

In [None]:
# The value 2 is replaced by 0 in both the train and test datasets.
# The result is assigned back to the column instead of calling
# `df.su_attempted.replace(..., inplace=True)`: that form mutates a Series
# obtained by attribute access, which recent pandas flags as chained
# assignment and which leaves the frame unchanged under Copy-on-Write.
train_feature_df['su_attempted'] = train_feature_df['su_attempted'].replace(2, 0)
test_feature_df['su_attempted'] = test_feature_df['su_attempted'].replace(2, 0)

#### Count connection types (normal, malicious) and attack categories

In [None]:
from __future__ import division
from collections import Counter

def class_info(classes):
    """Print the count and percentage share of each distinct value.

    ``classes`` is any iterable of hashable labels. One line is printed
    per distinct label: the label, its count, and its share of the total
    as a percentage with two decimals. Nothing is printed for an empty
    iterable.
    """
    tally = Counter(classes)
    n_total = sum(tally.values())
    row_template = '{:10}: {:7} = {:5.2f}%'
    for label, n_label in tally.items():
        print(row_template.format(label, n_label, n_label/n_total*100))

In [None]:
class_info(train_feature_df.labels_2cat)

In [None]:
class_info(train_feature_df.labels_5cat)

## Visualizing Normal and Malicious connections

### By connection protocols

Most connections use ```TCP```, which accounts for the majority of both the normal and the malicious connections

```UDP``` connections are predominantly normal 

```ICMP``` connections are predominantly malicious and are commonly used for probing and DoS attacks

In [None]:
def c_graph(x, y, title, **kwargs):
    """Bar-plot the cross-tabulation of ``x`` (rows) against ``y`` (columns).

    Rows are ordered by their 'normal' count, largest first, so ``y`` must
    contain the value 'normal'. ``title`` becomes the plot title and any
    extra keyword arguments (e.g. ``ax``, ``figsize``) are forwarded to
    ``DataFrame.plot``. Returns whatever ``DataFrame.plot`` returns.
    """
    return (
        pd.crosstab(x, y)
        .sort_values(by='normal', ascending=False)
        .plot(kind='bar', colormap='Set1', title=title, **kwargs)
    )

In [None]:
fig, ax_array = plt.subplots(2, 2, figsize=(16,14), sharey=True)
(ax1, ax2), (ax3, ax4) = ax_array
# Top row: binary labels; bottom row: attack categories.
# Left column: train set; right column: test set.
_panels = [
    (ax1, train_feature_df, 'labels_2cat', 'Normal vs Malicious by Protocol Type (train)'),
    (ax2, test_feature_df, 'labels_2cat', 'Normal vs Malicious by Protocol Type (test)'),
    (ax3, train_feature_df, 'labels_5cat', 'Attack Categories by Protocol Type (train)'),
    (ax4, test_feature_df, 'labels_5cat', 'Attack Categories by Protocol Type (test)'),
]
for _ax, _frame, _label_col, _title in _panels:
    c_graph(_frame.protocol_type, _frame[_label_col], _title, ax=_ax)

### By Connection Flags (status)

The plot below shows the connection type(normal or malicious) by status of the connection.

```SF``` - Most connections show a normal establishment and termination. Most are normal. (SYN/FIN)

```S0``` - This flag shows a connection attempt but no reply. Most are malicious

There are predominant malicious connections on the following flag states:

```REJ``` - Connection attempt rejected.

```RSTO``` - Connection established, originator aborted (sent a RST).

```RSTOS0``` - Originator sent a SYN followed by a RST, we never saw a SYN-ACK from the responder.

```RSTR``` - Responder sent a RST

```SH``` - Originator sent a SYN followed by a FIN, but never saw a SYN ACK from the responder (hence the connection was “half” open)

reference conn_state (flags): https://www.bro.org/sphinx/scripts/base/protocols/conn/main.bro.html?highlight=connection%20attempt%20rejected

In [None]:
# Flag (rows) vs. binary label (columns); computed once and reused for plot and table.
flag_by_label = pd.crosstab(train_feature_df.flag, train_feature_df.labels_2cat)
# Ascending order for the horizontal stacked bars ...
c = flag_by_label.sort_values('malicious')
c.plot(kind='barh', colormap='Set1', figsize=(16,14), stacked=True)
# ... and descending order for the printed table.
print(flag_by_label.sort_values('malicious', ascending=False))

#### Further analysis on flags by attack category

**DOS attacks** are dominant on ```REJ, RSTO, S0``` flags

**Probing** is dominant on ```OTH, RSTOS0, RSTR, SH``` flags

In [None]:
# Cross-tabulate connection flag (rows) against the 5-category label (columns).
flags_df = pd.crosstab(train_feature_df.flag, train_feature_df.labels_5cat)
# axis=1 draws the in-cell bars relative to each row, so every flag shows how
# its connections are split across the five categories.
flags_df.style.bar(subset=['DoS','Probe','R2L','U2R', 'normal'], color=['#5fba7d'], axis=1)

### By network service

```private, eco_i, ecr_i``` - malicious attacks are dominant on these services

```http``` - most of the connections are normal, but malicious activity exists

```smtp, domain_u, urp_i``` - are primarily normal with very little malicious activity

There's a long tail of network services that are used for malicious activities 


In [None]:
# Cross-tabulate network service (rows) against the binary label (columns),
# sorted ascending by the 'malicious' count.
ns = pd.crosstab(train_feature_df.service, train_feature_df.labels_2cat).sort_values('malicious')
# Horizontal stacked bars, one per service; the trailing ';' suppresses the Axes repr.
ns.plot(kind='barh', colormap='Set1', figsize=(16,14), stacked=True);
# Optional: print the same table in descending order of malicious connections.
#print(pd.crosstab(train_feature_df.service, train_feature_df.labels_2cat).sort_values('malicious', ascending=False))

In [None]:
# Cross-tabulate network service (rows) against the 5-category label (columns).
service_df = pd.crosstab(train_feature_df.service, train_feature_df.labels_5cat)
# axis=1 draws the in-cell bars relative to each row (per service); no subset is
# given, so every column of the table gets bars.
service_df.style.bar( color=['#5fba7d'], axis=1)

-------------------

## Visualizing Numeric Values

In [None]:
fig = plt.figure(figsize=(30, 25))
# Select the numeric columns in the frame's own column order, then take the
# absolute correlation so strong negative relations show up as dark cells too.
is_numeric_col = train_feature_df.columns.isin(numeric_cols)
numeric_col_corr = train_feature_df.loc[:, is_numeric_col].corr().abs()
sns.heatmap(numeric_col_corr, cmap='Blues', annot=True)
plt.show()

#### Examining the top correlated values


In [None]:
# Pairwise correlation matrix of the numeric features, in numeric_cols order.
d = train_feature_df[numeric_cols].corr().values
# Walk the upper triangle only (j > i) so each pair is reported once and the
# diagonal is skipped. The threshold is applied to the absolute value, in line
# with the heatmap above (which plots .corr().abs()), so strongly negative
# pairs are reported as well; the printed value keeps its sign.
for i in range(len(numeric_cols)):
    for j in range(i+1, len(numeric_cols)):
        if abs(d[i,j]) > .7:
            print('{:20} {:30} {:}'.format(numeric_cols[i], numeric_cols[j], d[i,j]))

#### Related to established connections and connection attempts without reply
serror_rate
- the percentage of connections that have activated FLAG s0,s1,s2,s3, among the connections that have the same destination host IP in the past two seconds

dst_host_serror_rate
- the percentage of connections that have activated FLAG s0,s1,s2,s3, among the connections that have the same destination host IP

srv_serror_rate
- the percentage of connections that have activated FLAG s0,s1,s2,s3, among the connections that have the same Port number as the current connection in the past two seconds

dst_host_srv_serror_rate
- the percentage of connections that have activated FLAG s0,s1,s2,s3, among the connections that have the same Port number


#### Related to connection attempts but rejected
rerror_rate
- the percentage of connections that have activated FLAG REJ among the connections that have the same destination host IP in the past two seconds

dst_host_rerror_rate
- the percentage of connections that have activated FLAG REJ among the connections that have the same destination host IP

srv_rerror_rate
- the percentage of connections that have activated FLAG REJ among the connections that have the same Port number as the current connection in the past two seconds

dst_host_srv_rerror_rate
- the percentage of connections that have activated FLAG REJ among the connections that have the same Port number

#### Related to connections with the same destination host IP and same port number (service name)
dst_host_same_srv_rate
- the percentage of connections that were to the same service among the number of connections having the same destination host IP address

dst_host_srv_count
- number of connections having the same port number (service name)

same_srv_rate
- percentage of connections that were to the same service, among the number of connections to the same destination host IP in the past two seconds

#### relation between compromised connections and number of root operations in the connections
num_compromised
- number of compromised conditions

num_root
- number of root accesses or number of operations performed as root in the connections


In [None]:
# Plot a variable factor map (PCA)