In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

In [2]:
# Load the input table and split it into predictors and target.
data = pd.read_csv("Input_data.csv")
features = data.iloc[:, :38]   # independent columns: the first 38 by position
labels = data.iloc[:, -1]      # target column: the last column

In [3]:
# Feature selection 1/7: chi-square scoring (SelectKBest keeps the k best-scoring columns)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

num_feats = 20

# Min-max scale the predictors; X_norm is reused by every selector below.
X_norm = MinMaxScaler().fit_transform(features)

# fit() returns the fitted selector, so construction and fitting can be chained.
chi_sel = SelectKBest(score_func=chi2, k=num_feats).fit(X_norm, labels)
chi_supp = chi_sel.get_support()                   # boolean mask, one entry per column
chi_feature = features.columns[chi_supp].tolist()  # names of the kept columns
print(len(chi_feature), 'selected features')

20 selected features


In [4]:
# Feature selection 2/7: recursive feature elimination driven by logistic regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_sel = RFE(
    estimator=LogisticRegression(),
    n_features_to_select=num_feats,
    step=10,      # number of features removed per elimination round
    verbose=5,    # log each refit
).fit(X_norm, labels)

rfe_supp = rfe_sel.get_support()                   # boolean mask, one entry per column
rfe_feature = features.columns[rfe_supp].tolist()  # names of the kept columns
print(len(rfe_feature), 'selected features')

Fitting estimator with 38 features.




Fitting estimator with 28 features.




20 selected features


In [5]:
# Feature selection 3/7: logistic regression with L1 regularization
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
num_feats = 20

# The solver is pinned explicitly: scikit-learn's default solver has been
# 'lbfgs' since 0.22, and 'lbfgs' rejects penalty="l1". 'liblinear' (the
# earlier default) supports the L1 penalty, so the cell runs on both old
# and current releases.
lr_sel = SelectFromModel(
    LogisticRegression(penalty="l1", solver="liblinear"),
    max_features=num_feats,
)
lr_sel.fit(X_norm, labels)
lr_supp = lr_sel.get_support()                          # boolean mask, one entry per column
lr_feature = features.loc[:, lr_supp].columns.tolist()  # names of the kept columns
print(str(len(lr_feature)), 'selected features')



20 selected features


In [6]:
# Feature selection 4/7: random forest feature importances

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
num_feats = 20

# random_state is fixed so the importance ranking (and therefore the selected
# feature set) is the same on every Restart & Run All.
# NOTE: max_features is only an upper bound. SelectFromModel still applies its
# default importance threshold, so fewer than num_feats features may be kept.
rf_sel = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    max_features=num_feats,
)
rf_sel.fit(X_norm, labels)
rf_supp = rf_sel.get_support()                          # boolean mask, one entry per column
rf_feature = features.loc[:, rf_supp].columns.tolist()  # names of the kept columns
print(str(len(rf_feature)), 'selected features')


13 selected features


In [7]:
# Feature selection 5/7: LightGBM feature importances

from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

num_feats = 20

# Booster configuration, one hyper-parameter per line for readability.
lgbc = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=32,
    colsample_bytree=0.2,
    reg_alpha=3,
    reg_lambda=1,
    min_split_gain=0.01,
    min_child_weight=40,
)

# fit() returns the fitted selector, so construction and fitting can be chained.
lgbm_sel = SelectFromModel(lgbc, max_features=num_feats).fit(X_norm, labels)
lgbm_supp = lgbm_sel.get_support()                   # boolean mask, one entry per column
lgbm_feature = features.columns[lgbm_supp].tolist()  # names of the kept columns
print(len(lgbm_feature), 'selected features')

14 selected features


In [8]:
# Feature selection 6/7: linear SVC with an L1 penalty

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

num_feats = 20

# Fit the classifier first, then hand the already-fitted model to the selector.
lsvc = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc.fit(X_norm, labels)

# prefit=True: the selector reads the fitted model directly, no selector.fit() call.
lsvc_sel = SelectFromModel(lsvc, max_features=num_feats, prefit=True)
lsvc_supp = lsvc_sel.get_support()                   # boolean mask, one entry per column
lsvc_feature = features.columns[lsvc_supp].tolist()  # names of the kept columns
print(len(lsvc_feature), 'selected features')

20 selected features




In [9]:
# Feature selection 7/7: extra-trees feature importances

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
num_feats = 20

# random_state is fixed so the selected feature set is reproducible across runs.
# NOTE: max_features is only an upper bound. SelectFromModel still applies its
# default importance threshold, so fewer than num_feats features may be kept.
et_clf = SelectFromModel(
    ExtraTreesClassifier(n_estimators=100, random_state=42),
    max_features=num_feats,
)
et_clf.fit(X_norm, labels)
et_clf_supp = et_clf.get_support()  # boolean mask, one entry per column
# Use the mask defined on the line above. The previous code referenced an
# undefined name (`et_clf_support`) and raised NameError on a fresh kernel.
et_clf_feature = features.loc[:, et_clf_supp].columns.tolist()
print(str(len(et_clf_feature)), 'selected features')

14 selected features


In [10]:
# Combine the seven selection masks into one table and rank features by votes.

# Display names for the 38 predictor columns, in the same order as `features`.
feat_name = [
    'Flow Duration', 'Total Fwd Packets', 'Total Length of Fwd Packets',
    'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean',
    'Fwd Packet Length Std', 'Flow Packets/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Fwd Header Length', 'Fwd Packets/s',
    'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
    'Packet Length Std', 'Packet Length Variance',
    'Down/Up Ratio', 'Average Packet Size',
    'init_Win_bytes_forward', 'act_data_pkt_fwd', 'min_seg_size_forward',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min',
]

# One boolean mask per selection method; dict order fixes the column order.
support_masks = {
    'Chi-2 test': chi_supp,
    'RFE': rfe_supp,
    'Logistic Regression': lr_supp,
    'Random Forest': rf_supp,
    'LightGBM': lgbm_supp,
    'Extra Tree': et_clf_supp,
    'Linear SVC': lsvc_supp,
}
feat_sel_df = pd.DataFrame({'Feature Name': feat_name, **support_masks})

# count of num of times each feature is selected
# Sum only the boolean method columns. Summing the whole frame also pulls in
# the string 'Feature Name' column, which raises TypeError on current pandas
# (older releases silently dropped it).
feat_sel_df['Total Count'] = feat_sel_df[list(support_masks)].sum(axis=1)

# display the top 20 features
feat_sel_df = feat_sel_df.sort_values(['Total Count', 'Feature Name'], ascending=False)
feat_sel_df.index = range(1, len(feat_sel_df) + 1)  # 1-based rank as the index
feat_sel_df.head(num_feats)

Unnamed: 0,Feature Name,Chi-2 test,RFE,Logistic Regression,Random Forest,LightGBM,Extra Tree,Linear SVC,Total Count
1,Packet Length Variance,True,True,True,True,False,True,True,6
2,Packet Length Std,True,True,True,True,True,True,False,6
3,Max Packet Length,True,True,True,True,False,True,True,6
4,Fwd Packets/s,True,True,True,False,True,True,True,6
5,Fwd IAT Max,True,True,True,False,True,True,True,6
6,Flow IAT Max,True,True,True,True,True,True,False,6
7,init_Win_bytes_forward,True,False,False,True,True,True,True,5
8,Fwd Packet Length Mean,True,True,True,True,False,True,False,5
9,Fwd IAT Std,True,True,False,True,False,True,True,5
10,Fwd IAT Mean,True,True,True,False,True,False,True,5


In [12]:
# Print every column of `data` next to its positional index.
for position, column_name in enumerate(data.columns):
    print(" ".join([str(position), str(column_name)]))

0  Flow Duration
1  Total Fwd Packets
2 Total Length of Fwd Packets
3  Fwd Packet Length Max
4  Fwd Packet Length Min
5  Fwd Packet Length Mean
6  Fwd Packet Length Std
7  Flow Packets/s
8  Flow IAT Mean
9  Flow IAT Std
10  Flow IAT Max
11  Flow IAT Min
12 Fwd IAT Total
13  Fwd IAT Mean
14  Fwd IAT Std
15  Fwd IAT Max
16  Fwd IAT Min
17 Bwd IAT Total
18  Bwd IAT Mean
19  Bwd IAT Std
20  Bwd IAT Max
21  Bwd IAT Min
22 Fwd Header Length
23 Fwd Packets/s
24  Min Packet Length
25  Max Packet Length
26  Packet Length Mean
27  Packet Length Std
28  Packet Length Variance
29  Down/Up Ratio
30  Average Packet Size
31 Init_Win_bytes_forward
32  act_data_pkt_fwd
33  min_seg_size_forward
34 Active Mean
35  Active Std
36  Active Max
37  Active Min
38 Label


In [14]:
# Keep all rows, restricted to 20 chosen feature columns plus the label.
# Positions are column indices into `data`; the last entry (38) is the Label column.
selected_positions = [
    28, 27, 25, 23, 15, 10, 31, 5, 14, 13,
    9, 26, 24, 8, 0, 30, 6, 4, 3, 12,
    38,
]
final_data = data.iloc[:, selected_positions]
final_data.head()

Unnamed: 0,Packet Length Variance,Packet Length Std,Max Packet Length,Fwd Packets/s,Fwd IAT Max,Flow IAT Max,Init_Win_bytes_forward,Fwd Packet Length Mean,Fwd IAT Std,Fwd IAT Mean,...,Packet Length Mean,Min Packet Length,Flow IAT Mean,Flow Duration,Average Packet Size,Fwd Packet Length Std,Fwd Packet Length Min,Fwd Packet Length Max,Fwd IAT Total,Label
0,0.0,0.0,6,666666.6667,3,3,33,6.0,0.0,3.0,...,6.0,6,3.0,3,9.0,0.0,6,6,3,0
1,0.0,0.0,6,9174.311927,0,109,29,6.0,0.0,0.0,...,6.0,6,109.0,109,9.0,0.0,6,6,0,0
2,0.0,0.0,6,19230.76923,0,52,29,6.0,0.0,0.0,...,6.0,6,52.0,52,9.0,0.0,6,6,0,0
3,0.0,0.0,6,29411.76471,0,34,31,6.0,0.0,0.0,...,6.0,6,34.0,34,9.0,0.0,6,6,0,0
4,0.0,0.0,6,666666.6667,3,3,32,6.0,0.0,3.0,...,6.0,6,3.0,3,9.0,0.0,6,6,3,0


In [15]:
# Write the reduced dataset to disk with a header row and without the index column.
final_data.to_csv("final_data.csv", header=True, index=False)