In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [2]:
import os
# load preprocessed dataset from a specific location on the computer
datapath = os.path.join('E:\DATASETS','IDS2017data.csv')
full_data=pd.read_csv(datapath)

##check if data_train has both attack (1) and normal (0) flows 
print(full_data['Label'].value_counts())
full_data.head(2)

0    667378
1    379737
Name: Label, dtype: int64


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3,2,0,12,0,6,6,6.0,0.0,0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0
1,109,1,1,6,6,6,6,6.0,0.0,6,...,20,0.0,0.0,0,0,0.0,0.0,0,0,0


In [4]:
full_data.columns[0:76]

Index([' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s',
       ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max',
       ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std',
       ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean',
       ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags',
       ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags',
       'Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s',
       ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length',
       ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
       'FIN Flag Count', ' SYN

In [5]:
X = full_data.iloc[:,0:76]  #independent columns
y = full_data.iloc[:,-1]    #target column


In [6]:
#chi square method for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
X_norm = MinMaxScaler().fit_transform(X)
num_feats=30
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(str(len(chi_feature)), 'selected features')

30 selected features


In [7]:
#Recursive Feature Elimination method for feature selection
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.loc[:,rfe_support].columns.tolist()
print(str(len(rfe_feature)), 'selected features')

Fitting estimator with 76 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 66 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 56 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 46 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fitting estimator with 36 features.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


30 selected features


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
#Logistic regression method for feature selection using L2 regularization
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
num_feats=30
embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l2"), max_features=num_feats)
embeded_lr_selector.fit(X_norm, y)

embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
print(str(len(embeded_lr_feature)), 'selected features')

27 selected features


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [9]:
#Random forest classifier method for feature selection

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
num_feats=30
embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
embeded_rf_selector.fit(X_norm, y)

embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
print(str(len(embeded_rf_feature)), 'selected features')


25 selected features


In [10]:
#Light GBM method for feature selection

from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

num_feats=30

lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
embeded_lgb_selector.fit(X_norm, y)

embeded_lgb_support = embeded_lgb_selector.get_support()
embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
print(str(len(embeded_lgb_feature)), 'selected features')

23 selected features


In [11]:
#Linear SVC method for feature selection

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
num_feats=30
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_norm, y)
model = SelectFromModel(lsvc, prefit=True, max_features=num_feats)
model_support = model.get_support()
model_feature = X.loc[:,model_support].columns.tolist()
print(str(len(model_feature)), 'selected features')

30 selected features




In [12]:
#Extra Tree classifier method for feature selection

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
num_feats=30
et_classifier= SelectFromModel(ExtraTreesClassifier(n_estimators=100), max_features=num_feats)
et_classifier.fit(X_norm, y)
et_classifier_support = et_classifier.get_support()
et_classifier_feature = X.loc[:,et_classifier_support].columns.tolist()
print(str(len(et_classifier_feature)), 'selected features')

24 selected features


In [13]:
# put all selection together

feature_name = full_data.columns[0:76]

feature_selection_df = pd.DataFrame({'Feature Name':feature_name,'Chi-2 test':chi_support, 'RFE':rfe_support, 'Logistic Regression':embeded_lr_support,
                                    'Random Forest':embeded_rf_support,'LightGBM':embeded_lgb_support, 'Extra Tree':et_classifier_support, 'Linear SVC':model_support})
# count the selected times for each feature
feature_selection_df['Total Count'] = np.sum(feature_selection_df, axis=1)
# display the top features
feature_selection_df = feature_selection_df.sort_values(['Total Count','Feature Name'] , ascending=False)
feature_selection_df.index = range(1, len(feature_selection_df)+1)
feature_selection_df.head(num_feats)

Unnamed: 0,Feature Name,Chi-2 test,RFE,Logistic Regression,Random Forest,LightGBM,Extra Tree,Linear SVC,Total Count
1,Average Packet Size,True,True,True,True,True,True,True,7
2,Idle Mean,True,True,True,True,False,True,True,6
3,Packet Length Variance,True,True,True,True,False,True,True,6
4,Max Packet Length,True,True,True,True,False,True,True,6
5,Flow IAT Max,True,True,True,True,True,True,False,6
6,Bwd Packet Length Std,True,True,True,True,False,True,True,6
7,Bwd Packet Length Mean,True,True,True,True,False,True,True,6
8,Avg Bwd Segment Size,True,True,True,True,False,True,True,6
9,Init_Win_bytes_forward,True,False,False,True,True,True,True,5
10,Packet Length Std,True,True,False,True,True,True,False,5


In [3]:
# select 24 features from above
final_data=full_data.iloc[:, [5,7,9,10,11,12,15,16,17,19,20,21,22,36,38,39,40,41,51,53,64,65,72,74,76]]
final_data.head()


Unnamed: 0,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Fwd IAT Total,...,Packet Length Mean,Packet Length Std,Packet Length Variance,Average Packet Size,Avg Bwd Segment Size,Init_Win_bytes_forward,Init_Win_bytes_backward,Idle Mean,Idle Max,Label
0,6,6.0,0,0,0.0,0.0,3.0,0.0,3,3,...,6.0,0.0,0.0,9.0,0.0,33,0,0.0,0,0
1,6,6.0,6,6,6.0,0.0,109.0,0.0,109,0,...,6.0,0.0,0.0,9.0,6.0,29,256,0.0,0,0
2,6,6.0,6,6,6.0,0.0,52.0,0.0,52,0,...,6.0,0.0,0.0,9.0,6.0,29,256,0.0,0,0
3,6,6.0,6,6,6.0,0.0,34.0,0.0,34,0,...,6.0,0.0,0.0,9.0,6.0,31,329,0.0,0,0
4,6,6.0,0,0,0.0,0.0,3.0,0.0,3,3,...,6.0,0.0,0.0,9.0,0.0,32,0,0.0,0,0


In [4]:
# iterating the columns with their indices
for idx1 ,col1 in enumerate(final_data.columns): 
    print("Feature"+ " " +str(idx1) + " " + str(col1))

Feature 0  Fwd Packet Length Max
Feature 1  Fwd Packet Length Mean
Feature 2 Bwd Packet Length Max
Feature 3  Bwd Packet Length Min
Feature 4  Bwd Packet Length Mean
Feature 5  Bwd Packet Length Std
Feature 6  Flow IAT Mean
Feature 7  Flow IAT Std
Feature 8  Flow IAT Max
Feature 9 Fwd IAT Total
Feature 10  Fwd IAT Mean
Feature 11  Fwd IAT Std
Feature 12  Fwd IAT Max
Feature 13  Bwd Packets/s
Feature 14  Max Packet Length
Feature 15  Packet Length Mean
Feature 16  Packet Length Std
Feature 17  Packet Length Variance
Feature 18  Average Packet Size
Feature 19  Avg Bwd Segment Size
Feature 20 Init_Win_bytes_forward
Feature 21  Init_Win_bytes_backward
Feature 22 Idle Mean
Feature 23  Idle Max
Feature 24 Label


In [5]:
final_data.to_csv("IDS2017.csv",index=False, header=True)