# ANALYSING CIC 2017 DATASET AND SELECTING BEST FEATURES FOR SLOWLORIS ATTACK


## Index
    1. Data Preparation
    2. Data Split
    3. Feature Selection
    4. PCA

In [1]:
import os
import subprocess
import seaborn as sns  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from __future__ import print_function
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
def get_data(path="slow_loris_filtered.csv"):
    """Load the filtered Slowloris flow records from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to ``slow_loris_filtered.csv``
        relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents, read with ``index_col=False``.

    Notes
    -----
    A missing file is not handled here; ``pd.read_csv`` raises its own error.
    """
    if os.path.exists(path):
        # Report the file that is actually read; the previous message named
        # a different file than the one being loaded.
        print("-- " + path + " found locally")
    df = pd.read_csv(path, index_col=False)
    return df

In [3]:
def encode_target(df, target_column):
    """Add an integer-coded ``Target`` column derived from a label column.

    Labels are numbered in order of first appearance (0, 1, 2, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, not modified.
    target_column : str
        Name of the column holding the class labels.

    Returns
    -------
    tuple
        ``(df_mod, targets)`` where ``df_mod`` is the copy with the extra
        ``Target`` column and ``targets`` is the array of distinct labels;
        the position of a label in ``targets`` is its integer code.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # ``map`` looks every label up directly and yields an integer column,
    # whereas ``replace`` with a dict depends on pandas downcasting the
    # replaced object column afterwards.
    df_mod["Target"] = df_mod[target_column].map(map_to_int)
    return (df_mod, targets)

In [4]:
# Load the dataset once; the later cells work on this frame or on frames derived from it.
data = get_data()

IOError: File slow_loris_filtered.csv does not exist

# 1.0 Data processing

### We initially drop variables that won't contribute to knowledge of the slowloris attack because they change from instance to instance

In [None]:
# Flow identifiers and addressing fields are removed from `data` in place,
# then the remaining column names and the frame dimensions are printed.
columns = [
    'Flow ID',
    ' Source IP',
    ' Source Port',
    ' Destination IP',
    ' Destination Port',
    ' Protocol',
    ' Timestamp',
]
data.drop(columns, axis=1, inplace=True)
print(data.keys())
n_rows, n_cols = data.shape
print(n_rows)
print(n_cols)

### We also need to convert labels into numeric values for classification

In [None]:
# Encode the " Label" column as integers in a new "Target" column;
# `targets` keeps the distinct original label values.
df2, targets = encode_target(data, " Label")



### We drop any rows that contain NaN

In [None]:
# Remove every row that has at least one missing value.
df2 = df2.dropna(axis=0, how='any')
# Sanity check: total count of missing cells left in the frame.
df2.isnull().sum().sum()

### We also drop columns that have 0 variance

In [None]:
# Boolean flag per column: True unless the column holds exactly one distinct value.
list_unq = (df2.apply(pd.Series.nunique) != 1)
# Show the columns flagged False, i.e. the constant columns.
list_unq[list_unq == 0]

In [None]:
# Keep only the columns whose number of distinct values is not 1.
df2 = df2.loc[:, df2.apply(pd.Series.nunique) != 1]

# 2.0 Data split
## We split the data into features and labels

In [None]:
# Every column except the raw label and its integer encoding is a feature.
# Selecting by name (instead of slicing off the last two columns) stays
# correct even if the column order changes or one of the two is missing.
features = [name for name in df2.columns if name not in (" Label", "Target")]
y = df2["Target"]
X = df2[features]

In [None]:
# Display the feature matrix.
X

In [None]:
# Bar chart of the number of rows per target class. The series is passed as
# `x=` explicitly: the first positional parameter of countplot is not the
# x variable in newer seaborn releases.
graph = sns.countplot(x=y, label="Count")
# Exact per-class row counts.
df2["Target"].value_counts()

### Heat map

In [None]:
# Draw the pairwise feature correlation matrix as an annotated heat map
# (values shown with one decimal) on a 50x30 figure.
f, ax = plt.subplots(figsize=(50, 30))
sns_plot = sns.heatmap(X.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)

In [None]:
# Write the heat map figure to disk.
sns_plot.figure.savefig("output_50_30_slowloris.png")

### df2 is the original data frame with the necessary processing done. From now on we make variants of the data frames X and y








# 3.0 Feature Selection


### Index
     3.1 High Correlation
     3.2 K best features
     3.3 Recursive feature elimination with random forest
     3.4 Recursive feature elimination with cross validation and random forest
     3.5 Tree based feature selection

## 3.1 High Correlation



In [None]:
# Absolute pairwise correlations between the features.
corr_matrix = X.corr().abs()
# Keep only the strict upper triangle (k=1 skips the diagonal) so each pair
# appears once; every other cell becomes NaN. The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)


In [None]:
# Display the upper-triangle correlation table.
upper

In [None]:
# A column is marked for removal when the upper triangle shows a correlation
# of exactly 1.0 between it and some other column.
to_drop = []
for column in upper.columns:
    if (upper[column] == 1.0).any():
        to_drop.append(column)
print(len(to_drop))
print(to_drop)

### 3.1.2 To get an idea of which features were perfectly correlated, i.e. correlation 1.0

In [None]:
# Start from a frame with the feature labels on both axes, then overwrite
# its values with the strict lower triangle (k=-1) of the absolute
# correlation matrix; everything on or above the diagonal becomes 0.
corrMatrix = X.corr()
corrMatrix.loc[:, :] = np.tril(corr_matrix, k=-1)

# Collect groups of perfectly correlated features. A feature that already
# appeared as a partner in an earlier group does not start a group itself.
already_in = set()
result_1 = []
for feature in corrMatrix:
    column_values = corrMatrix[feature]
    partners = column_values[column_values == 1].index.tolist()
    if not partners or feature in already_in:
        continue
    already_in.update(partners)
    result_1.append(partners + [feature])

In [None]:
# Print each group of perfectly correlated features on its own line.
for element in result_1:
    print(element)

### 3.1.4 Dropping features with 1 correlation

In [None]:
# Drop the perfectly correlated columns by name. The label list is passed
# directly; building the sub-frame X[to_drop] only to have pandas iterate
# over its column names is unnecessary.
X_new = X.drop(to_drop, axis=1)

In [None]:
# Preview the first rows of the reduced feature matrix.
X_new.head()

In [None]:
# Total count of missing cells in the reduced feature matrix.
X_new.isnull().sum().sum()

In [None]:
def fix_data_frame(x_train):
    """Convert a feature frame into a purely numeric NumPy array.

    Columns of dtype ``object`` are parsed with ``pd.to_numeric``; entries
    that cannot be parsed become NaN. ``np.nan_to_num`` then replaces NaN
    with 0 and infinities with large finite numbers.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature frame. It is left untouched; the work happens on a copy.

    Returns
    -------
    numpy.ndarray
        The cleaned values.
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is never written to.
    frame = x_train.copy()
    col = frame.columns[frame.dtypes.eq(object)]
    if len(col) > 0:
        # Column-wise conversion gives the same values as converting row by
        # row, without one Python-level call per row.
        frame[col] = frame[col].apply(pd.to_numeric, errors='coerce')
    return np.nan_to_num(frame)

In [None]:
def generate_data(x, y, split, seed):
    """Split features and labels into train/test parts and clean the features.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature frame.
    y : pandas.Series
        Labels aligned with ``x``.
    split : float or int
        Passed to ``train_test_split`` as ``test_size``. ``0`` means "no
        test set": all rows go to the training part.
    seed : int
        Random state used for shuffling.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. The two feature parts have
        been passed through ``fix_data_frame``; the label parts are returned
        as produced by the split.
    """
    if split == 0:
        # train_test_split rejects an empty test set in current scikit-learn,
        # so the "use everything for training" case is handled here: shuffle
        # all rows with the given seed and return empty test parts.
        # Assumes x and y are pandas objects (positional .iloc indexing).
        order = np.random.RandomState(seed).permutation(len(x))
        x_train, x_test = x.iloc[order], x.iloc[:0].copy()
        y_train, y_test = y.iloc[order], y.iloc[:0]
    else:
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=split, random_state=seed)
    x_train = fix_data_frame(x_train)
    x_test = fix_data_frame(x_test)
    return x_train, x_test, y_train, y_test

In [None]:
def ranking(ranks, names, order=1):
    """Rescale raw scores to the [0, 1] range and key them by feature name.

    Parameters
    ----------
    ranks : sequence of float
        One raw score per feature.
    names : iterable
        Feature names, in the same order as ``ranks``.
    order : int, optional
        Multiplier applied to the scores before scaling; ``-1`` reverses
        the ordering.

    Returns
    -------
    dict
        Feature name -> scaled score rounded to two decimals.
    """
    # MinMaxScaler works column-wise, so the scores are shaped as one column
    # and flattened back to a 1-D array afterwards.
    score_column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(score_column).T[0]
    return {name: round(value, 2) for name, value in zip(names, scaled)}

## 3.2 K-best

In [None]:
# Container for the scaled scores of every feature selection method.
ranks = {}
# split=0: no rows are held out, so the selectors below see the full data set.
x_train, x_test, y_train, y_test = generate_data(X_new, y, split=0, seed=42)
# Class balance of the training labels.
y_train.value_counts()

In [None]:
# Fit a univariate selector that scores every feature with f_classif and keeps the 5 best.
select_feature_5 = SelectKBest(f_classif, k=5).fit(x_train, y_train.values)

In [None]:
# Report the names of the selected features: the integer positions returned
# by get_support(indices=True) are used to index X_new.columns.
print("Using f_classif with Kbest returns : \n")

print("5 Features : ", X_new.columns[select_feature_5.get_support(indices=True)])
print('\n')

#### Data feature ranking on ANOVA F-based at 30% training data

In [None]:
# Per-feature scores from the fitted selector, as plain Python floats.
scores = [float(score) for score in select_feature_5.scores_]
features = X_new.columns

# sorted() accepts the zip iterator directly; calling .sort() on the result
# of zip() only works where zip returns a list (Python 2).
k_best_f_classif = sorted(zip(scores, features),
                          key=lambda pair: pair[0], reverse=True)
# Feature names from the highest to the lowest score.
for element in k_best_f_classif:
    print(element[1].lstrip())

ranks["K-Best"] = ranking(scores, X_new.columns)

## 3.3 Recursive Feature Elimination using Random Forest

In [None]:
# Recursive feature elimination driven by a random forest, configured to
# remove one feature per step (step=1) and to keep 10 features.
random_forest = RandomForestClassifier()      
rfe = RFE(estimator=random_forest, n_features_to_select=10, step=1)

In [None]:
# Run the elimination on the training data.
rfe = rfe.fit(x_train, y_train.values)

In [None]:
# Print the selection mask and its length.
print(rfe.support_)
print(len(rfe.support_))

In [None]:
# Names of the columns RFE kept; stored first, then reported.
rfe_features_correlated_10 = X_new.columns[rfe.support_]
print('Chosen best 10 feature by rfe:', rfe_features_correlated_10)

In [None]:
# RFE rank of every feature (smaller rank = kept longer by the elimination).
scores = list(rfe.ranking_)
features = X_new.columns

# sorted() accepts the zip iterator directly; calling .sort() on the result
# of zip() only works where zip returns a list (Python 2).
rfe_rankings = sorted(zip(scores, features), key=lambda pair: pair[0])
for element in rfe_rankings:
    print(element)

In [None]:
# Refit RFE down to a single remaining feature; presumably so that
# rfe.ranking_ assigns a distinct rank to every feature (it feeds the
# "RFE" score table in the next cell).
random_forest = RandomForestClassifier() 
rfe = RFE(estimator=random_forest, n_features_to_select=1)
rfe.fit(x_train, y_train)

In [None]:
# order=-1 negates the ranks before min-max scaling, so the smallest rank
# ends up with the highest scaled score.
ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), X_new.columns, order=-1)

In [None]:
# Display the score tables collected so far.
ranks

## 3.4 Recursive Feature Elimination with Cross Validation

In [None]:
# Recursive feature elimination with cross-validation: a random forest is
# the estimator, candidates are compared by accuracy, and the number of
# features to keep is chosen by RFECV rather than fixed in advance.
clf_rf = RandomForestClassifier() 
rfecv = RFECV(estimator=clf_rf, step=1, cv=10, scoring='accuracy')   #10-fold cross-validation
rfecv = rfecv.fit(x_train, y_train.values)

# Report how many features were selected and which ones.
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_new.columns[rfecv.support_])

In [None]:
# Mean cross-validation score per number of selected features. Newer
# scikit-learn exposes it as cv_results_["mean_test_score"]; the older
# grid_scores_ attribute is used only where cv_results_ is unavailable.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

plt.figure(1, figsize=(30, 23))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
# Entry i of cv_scores belongs to i + 1 features, hence the 1-based x axis.
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.savefig("REFCV_slowloris_100.png")

In [None]:
# Mean cross-validation score per number of selected features. Newer
# scikit-learn exposes it as cv_results_["mean_test_score"]; the older
# grid_scores_ attribute is used only where cv_results_ is unavailable.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Print "<number of features> <score>" for every subset size.
for n_features, score in enumerate(cv_scores, start=1):
    print(n_features, score)
    

In [None]:
# Mean cross-validation score per number of selected features. Newer
# scikit-learn exposes it as cv_results_["mean_test_score"]; the older
# grid_scores_ attribute is used only where cv_results_ is unavailable.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_

# Look the scores up by feature count (entry k - 1 holds the score for k
# features, since RFECV runs with step=1) instead of negative offsets that
# are only right for one particular total number of features.
for n_features in (8, 25, 14):
    print(cv_scores[n_features - 1])
print(X_new.shape)

for n_features in (28, 15):
    print(cv_scores[n_features - 1])


### Tree based feature selection

In [None]:
# Fit a random forest on the training data and read its per-feature importances.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so clr_rf
# and clf_rf should refer to the same fitted model - confirm.
clf_rf = RandomForestClassifier()      
clr_rf = clf_rf.fit(x_train, y_train.values)
importances = clr_rf.feature_importances_

In [None]:
# Spread of each feature's importance across the individual trees
# (kept for inspection; not used further in this cell).
std = np.std([tree.feature_importances_ for tree in clf_rf.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]
# x_train was built from X_new, so the names are read from X_new.columns
# directly rather than from the global `features`, which other cells rebind.
feature_names = X_new.columns

print("Feature ranking:")

ranking_tree_based = []
for position, index in enumerate(indices, start=1):
    print(position, feature_names[index])
    ranking_tree_based.append(feature_names[index])

# sorted() accepts the zip iterator directly; calling .sort() on the result
# of zip() only works where zip returns a list (Python 2).
ranking_tree_zip = sorted(zip(feature_names[indices], importances[indices]),
                          key=lambda pair: pair[1], reverse=True)

for element in ranking_tree_zip:
    print(element[1])

ranks["RF"] = ranking(clf_rf.feature_importances_, X_new.columns)

In [None]:
# Bar chart of the random forest importances, most important feature first,
# with the feature names as rotated x tick labels; saved to disk.
plt.figure(1, figsize=(30, 23))
plt.title("Feature importances")
plt.bar(range(x_train.shape[1]), importances[indices], 0.6,
       color="g", align="edge")
plt.xticks(range(x_train.shape[1]), X_new.columns[indices],rotation=90)
plt.savefig("rf_features_slowloris_100.png")

## PCA

In [None]:
# Mean-normalise every feature separately. x_train is a NumPy array here,
# so the statistics need axis=0; without it mean/max/min collapse to a
# single scalar for the whole matrix and the features keep their very
# different scales.
feature_range = x_train.max(axis=0) - x_train.min(axis=0)
feature_range[feature_range == 0] = 1  # constant column: avoid dividing by zero
x_train_N = (x_train - x_train.mean(axis=0)) / feature_range

# Cumulative share of variance explained as components are added.
pca = PCA().fit(x_train_N)
plt.figure(1, figsize=(30, 23))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.savefig("pca_cumulative_slowloris_100.png")

In [None]:
# Second look at the variance distribution, this time after standardising
# the training features with StandardScaler.
z_scaler = StandardScaler()

z_data = z_scaler.fit_transform(x_train)
pca_trafo = PCA().fit(z_data);

# Log-scale plot of the per-component and the cumulative explained variance ratio.
plt.figure(figsize=(20,10))
plt.semilogy(pca_trafo.explained_variance_ratio_, '--o');
plt.semilogy(pca_trafo.explained_variance_ratio_.cumsum(), '--o');
plt.savefig("pca_cumulative_individual_100_slowloris.png")

In [None]:
# Fractions of variance the PCA has to retain.
variance = [0.999, 0.99, 0.98, 0.97, 0.96, 0.95, 0.90, 0.85, 0.80]
print("Original shape:   ", x_train.shape)

for i in variance:
    # Fit and project the same normalised matrix. Fitting on the raw
    # x_train and then transforming x_train_N would apply components and
    # a centring learned on differently scaled data.
    pca_var = PCA(i)
    x_var = pca_var.fit_transform(x_train_N)
    print("Transformed shape for variance " + str(i) + " is ", x_var.shape)
    

## Summary

### 1. Eliminate features with 0 variance
### 2. Eliminate features with 1 correlation
### 3. Estimate feature rankings (various data set sizes) with
        K best
        Recursive feature elimination
        Recursive feature elimination with cross validation
        Random Forest
### 4. Apply PCA just to check how variance is distributed
### 5. Make a classifier with PCA and check accuracy, this will be ML solution for slowloris
### 6. Find minimal set of features for rule generation with highest accuracy
### 7. Generate rules

## Obtained Feature rankings

### 1.0 100% Split

In [None]:
# Repeat the RFECV outcome: number of selected features and their names.
print("RFECV at 100% train:")
print('Optimal number of features :', rfecv.n_features_)
print('Best features :', X_new.columns[rfecv.support_])

In [None]:
#rfecv.score(x_test, y_test)

In [None]:
# Display the score tables of all methods.
ranks

In [None]:
# Average only over the individual methods. Skipping an existing "Mean"
# entry keeps the cell safe to re-run: otherwise the previous mean would be
# averaged in and the "Mean" column would be listed twice.
methods = sorted(method for method in ranks if method != "Mean")

# Mean scaled score per feature, rounded to two decimals.
r = {}
for name in X_new.columns:
    r[name] = round(np.mean([ranks[method][name] for method in methods]), 2)

ranks["Mean"] = r
methods.append("Mean")

# Tab-separated table: one row per feature, one column per method plus the mean.
print("\t\t\t\t%s" % "\t".join(methods))
for name in X_new.columns:
    print("%s\t\t\t%s" % (name, "\t".join(map(str,
                         [ranks[method][name] for method in methods]))))


In [None]:
# Mean score per feature as a two-column frame, best feature first.
meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
# `catplot` and `height` are the current seaborn names for the former
# `factorplot` function and its `size` argument.
y2k = sns.catplot(x="Mean Ranking", y="Feature", data=meanplot, kind="bar",
                  height=14, aspect=1.9, palette='coolwarm')
y2k.savefig("mean_ranking_100_slowloris.png")

In [None]:
# Display the sorted mean-score table.
meanplot

## Testing

### Three highest grid scores were obtained at 8, 14 and 25 features. So we use these

In [79]:
#YOU CAN GENERATE DIFFERENT TRAINING SAMPLES HERE at #here. x_train, y_train is 70% by default. 

### RFE

In [80]:
# For every target feature count: select features with RFE, then train and
# score a fresh random forest on splits of increasing test size.
for i in [4, 5, 6, 7, 8, 9, 10]:
    print("\n-------------------------------------------------------")
    random_forest = RandomForestClassifier()
    rfe = RFE(estimator=random_forest, n_features_to_select=i, step=1)
    rfe = rfe.fit(x_train, y_train)
    print("RFE features at size " + str(i) + " are \n\n")
    #here
    for el in X_new.columns[rfe.support_]:
        print(el.lstrip())

    # `size` is the held-out test fraction, so the classifier is trained on
    # the remaining 1 - size of the rows.
    for size in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        training_x, testing_x, training_y, testing_y = generate_data(X_new, y, size, 42)
        # Keep only the columns RFE selected.
        training_x = rfe.transform(training_x)
        testing_x = rfe.transform(testing_x)
        print(training_y.value_counts())
        print(training_x.shape)

        clf_rf_2 = RandomForestClassifier()
        clr_rf_2 = clf_rf_2.fit(training_x, training_y)
        # Predict once and reuse the result for both metrics.
        predictions = clf_rf_2.predict(testing_x)
        ac_2 = accuracy_score(testing_y, predictions)
        ac_3 = classification_report(testing_y, predictions)
        print("Accuracy at " + str(size) + " for " + str(i) + " is: ", ac_2)
        print(ac_3)

    

RFE features at size 4 are 


Flow Packets/s
Flow IAT Mean
Average Packet Size
Init_Win_bytes_backward


0    22926
1     5202
Name: Target, dtype: int64
(28128, 4)
Accuracy at 0.1 for 4 is:  0.9987204094689699


0    20408
1     4595
Name: Target, dtype: int64
(25003, 4)
Accuracy at 0.2 for 4 is:  0.9987202047672372


0    17849
1     4028
Name: Target, dtype: int64
(21877, 4)
Accuracy at 0.3 for 4 is:  0.9986136290924603


0    15314
1     3438
Name: Target, dtype: int64
(18752, 4)
Accuracy at 0.4 for 4 is:  0.9984802431610942


0    12744
1     2883
Name: Target, dtype: int64
(15627, 4)
Accuracy at 0.5 for 4 is:  0.9987201638190312


0    10187
1     2314
Name: Target, dtype: int64
(12501, 4)
Accuracy at 0.6 for 4 is:  0.9984535807604117


0    7632
1    1744
Name: Target, dtype: int64
(9376, 4)
Accuracy at 0.7 for 4 is:  0.9985830514672274


0    5093
1    1157
Name: Target, dtype: int64
(6250, 4)
Accuracy at 0.8 for 4 is:  0.9975203967365222


0    2556
1     569
Name: Target, dty

### K Best

In [81]:
# For every target feature count: select features with SelectKBest, then
# train and score a fresh random forest on splits of increasing test size.
for i in [4, 5, 6, 7, 8, 9, 10]:
    print("\n-----------------------------------------")
    select_feature = SelectKBest(f_classif, k=i).fit(x_train, y_train)
    print("Top " + str(i) + " features :", X_new.columns[select_feature.get_support(indices=True)])

    # `size` is the held-out test fraction, so the classifier is trained on
    # the remaining 1 - size of the rows.
    for size in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        training_x, testing_x, training_y, testing_y = generate_data(X_new, y, size, 42)

        # Keep only the columns the selector chose.
        training_x = select_feature.transform(training_x)
        testing_x = select_feature.transform(testing_x)
        print(training_y.value_counts())
        print(training_x.shape)

        clf_rf_2 = RandomForestClassifier()
        clr_rf_2 = clf_rf_2.fit(training_x, training_y)
        # Predict once and reuse the result for both metrics.
        predictions = clf_rf_2.predict(testing_x)
        ac_2 = accuracy_score(testing_y, predictions)
        ac_3 = classification_report(testing_y, predictions)
        print("Accuracy at " + str(size) + " for " + str(i) + " is: ", ac_2)
        print(ac_3)


Top 4 features : Index([u' Flow IAT Max', u' Fwd IAT Max', u' Bwd IAT Mean', u' Bwd IAT Max'], dtype='object')


0    22926
1     5202
Name: Target, dtype: int64
(28128, 4)
Accuracy at 0.1 for 4 is:  0.9817658349328215


0    20408
1     4595
Name: Target, dtype: int64
(25003, 4)
Accuracy at 0.2 for 4 is:  0.9792033274676052


0    17849
1     4028
Name: Target, dtype: int64
(21877, 4)
Accuracy at 0.3 for 4 is:  0.9761117628239309


0    15314
1     3438
Name: Target, dtype: int64
(18752, 4)
Accuracy at 0.4 for 4 is:  0.9756039033754599


0    12744
1     2883
Name: Target, dtype: int64
(15627, 4)
Accuracy at 0.5 for 4 is:  0.9768989569335125


0    10187
1     2314
Name: Target, dtype: int64
(12501, 4)
Accuracy at 0.6 for 4 is:  0.9748306937556658


0    7632
1    1744
Name: Target, dtype: int64
(9376, 4)
Accuracy at 0.7 for 4 is:  0.973992138221044


0    5093
1    1157
Name: Target, dtype: int64
(6250, 4)
Accuracy at 0.8 for 4 is:  0.9728443449048152


0    2556
1     569
Name: Targ

### Tree based

In [56]:
# For every target feature count: take the top-i features of the random
# forest importance ranking, then train and score a fresh random forest on
# splits of increasing test size.
for i in [4, 5, 6, 7, 8, 9, 10]:
    print("\n-------------------------")
    features_selected = ranking_tree_based[:i]
    print(features_selected)
    x_trial = X_new[features_selected]
    print(x_trial.shape)

    # `size` is the held-out test fraction, so the classifier is trained on
    # the remaining 1 - size of the rows.
    for size in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        training_x, testing_x, training_y, testing_y = generate_data(x_trial, y, size, 42) #here
        clf_rf_2 = RandomForestClassifier()
        clr_rf_2 = clf_rf_2.fit(training_x, training_y)
        # Predict once and reuse the result for both metrics.
        predictions = clf_rf_2.predict(testing_x)
        ac_2 = accuracy_score(testing_y, predictions)
        ac_3 = classification_report(testing_y, predictions)
        print("Accuracy at " + str(size) + " for " + str(i) + " is: ", ac_2)
        print(ac_3)



-------------------------
[' Bwd IAT Mean', ' Flow Packets/s', ' Avg Bwd Segment Size', ' Flow IAT Min']
(31254, 4)
Accuracy at 0.1 for 4 is:  0.9798464491362764
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      2540
          1       0.96      0.93      0.95       586

avg / total       0.98      0.98      0.98      3126

Accuracy at 0.2 for 4 is:  0.9816029435290353
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      5058
          1       0.98      0.92      0.95      1193

avg / total       0.98      0.98      0.98      6251


-------------------------
[' Bwd IAT Mean', ' Flow Packets/s', ' Avg Bwd Segment Size', ' Flow IAT Min', ' Fwd IAT Mean']
(31254, 5)
Accuracy at 0.1 for 5 is:  0.981445937300064
             precision    recall  f1-score   support

          0       0.99      0.99      0.99      2540
          1       0.96      0.95      0.95       586

avg / total       0.9