In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
import imblearn
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE

In [79]:
def drop_infinity(X):
    """Replace +/-infinity with NaN and report how many values were replaced.

    Despite the name, nothing is dropped: rows and columns are kept and every
    infinite entry becomes NaN.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` in which ``np.inf`` and ``-np.inf`` are replaced by NaN.
    """
    cleaned = X.replace([np.inf, -np.inf], np.nan)
    # Report only the values that really were infinite: NaNs that were already
    # present in X must not inflate the printed number.
    n_infinite = cleaned.isna().sum().sum() - X.isna().sum().sum()
    print("the number of infinity:", n_infinite)
    return cleaned

In [80]:
# Column name -> NumPy dtype mapping for the flow-feature columns.
# It is passed as `dtype=` to pd.read_csv when the Kafka test CSV is loaded.
# The keys are used verbatim as column names. Many start with a single space,
# so they presumably mirror the raw CSV header exactly -- do not strip or
# normalise them without also changing the CSV.
ids_datatypes = {
    ' Destination Port': np.int64,
    ' Flow Duration': np.int64,
    ' Total Fwd Packets': np.int64,
    ' Total Backward Packets': np.int64,
    'Total Length of Fwd Packets': np.int64,
    ' Total Length of Bwd Packets': np.int64,
    ' Fwd Packet Length Max': np.int64,
    ' Fwd Packet Length Min': np.int64,
    ' Fwd Packet Length Mean': np.float64,
    ' Fwd Packet Length Std': np.float64,
    'Bwd Packet Length Max': np.int64,
    ' Bwd Packet Length Min': np.int64,
    ' Bwd Packet Length Mean': np.float64,
    ' Bwd Packet Length Std': np.float64,
    'Flow Bytes/s': np.float64,
    ' Flow Packets/s': np.float64,
    ' Flow IAT Mean': np.float64,
    ' Flow IAT Std': np.float64,
    ' Flow IAT Max': np.int64,
    ' Flow IAT Min': np.int64,
    'Fwd IAT Total': np.int64,
    ' Fwd IAT Mean': np.float64,
    ' Fwd IAT Std': np.float64,
    ' Fwd IAT Max': np.int64, 
    ' Fwd IAT Min': np.int64,
    'Bwd IAT Total': np.int64,
    ' Bwd IAT Mean': np.float64,
    ' Bwd IAT Std': np.float64,
    ' Bwd IAT Max': np.int64,
    ' Bwd IAT Min': np.int64,
    'Fwd PSH Flags': np.int64,
    ' Bwd PSH Flags': np.int64,
    ' Fwd URG Flags': np.int64,
    ' Bwd URG Flags': np.int64,
    ' Fwd Header Length': np.int64,
    ' Bwd Header Length': np.int64,
    'Fwd Packets/s': np.float64,
    ' Bwd Packets/s': np.float64,
    ' Min Packet Length': np.int64,
    ' Max Packet Length': np.int64,
    ' Packet Length Mean': np.float64,
    ' Packet Length Std': np.float64,
    ' Packet Length Variance': np.float64,
    'FIN Flag Count': np.int64,
    ' SYN Flag Count': np.int64,
    ' RST Flag Count': np.int64,
    ' PSH Flag Count': np.int64,
    ' ACK Flag Count': np.int64,
    ' URG Flag Count': np.int64,
    ' CWE Flag Count': np.int64,
    ' ECE Flag Count': np.int64,
    ' Down/Up Ratio': np.int64,
    ' Average Packet Size': np.float64,
    ' Avg Fwd Segment Size': np.float64,
    ' Avg Bwd Segment Size': np.float64,
    # NOTE(review): the '.1' suffix looks like pandas' renaming of a repeated
    # ' Fwd Header Length' header column -- confirm against the CSV.
    ' Fwd Header Length.1': np.int64,
    'Fwd Avg Bytes/Bulk': np.int64,
    ' Fwd Avg Packets/Bulk': np.int64,
    ' Fwd Avg Bulk Rate': np.int64,
    ' Bwd Avg Bytes/Bulk': np.int64,
    ' Bwd Avg Packets/Bulk': np.int64,
    'Bwd Avg Bulk Rate': np.int64,
    'Subflow Fwd Packets': np.int64,
    ' Subflow Fwd Bytes': np.int64,
    ' Subflow Bwd Packets': np.int64,
    ' Subflow Bwd Bytes': np.int64,
    'Init_Win_bytes_forward': np.int64, 
    ' Init_Win_bytes_backward': np.int64,
    ' act_data_pkt_fwd': np.int64,
    ' min_seg_size_forward': np.int64,
    'Active Mean': np.float64,
    ' Active Std': np.float64,
    ' Active Max': np.int64,
    ' Active Min': np.int64,
    'Idle Mean': np.float64,
    ' Idle Std': np.float64,
    ' Idle Max': np.int64,
    ' Idle Min': np.int64
}
# Key view of the mapping above; passed as `usecols=` to the same read_csv
# call so that exactly the typed columns are selected from the file.
used_cols = (ids_datatypes.keys())

In [81]:
# Load the input CSV files (per the original note: data after feature extraction).
# Original workflow note -- train model: aftcat, CV(Pipeline(imbalanced, MinMaxScaler, selector, model))

# Training features and their labels.
data_aftcat = pd.read_csv(r"./2-1 5388 data_aftcat.csv")
data_aftcat_y = pd.read_csv(r"./2-1 5388 data_aftcat label.csv")

# Kafka test features: restricted to the columns listed in `ids_datatypes`
# and parsed with the dtypes declared there.
testdata_kafka = pd.read_csv(
    './2-3 5388 data_kafka_1.csv',
    usecols=used_cols,
    dtype=ids_datatypes,
    low_memory=False,
)
# Labels belonging to the Kafka test features.
testdata_kafka_label = pd.read_csv(r"./2-3 5388 data_kafka_1 label.csv")

In [82]:
# Step 1: drop features -------------------------------------------------------------
# The columns are named `cols_to_drop` rather than `list` so the builtin
# `list` stays usable in every later cell.
# NOTE(review): presumably these columns carry no information in this data
# set (e.g. constant values) -- confirm the reason for removing them.
cols_to_drop = [
    ' Bwd PSH Flags',
    ' Bwd URG Flags',
    'Fwd Avg Bytes/Bulk',
    ' Fwd Avg Packets/Bulk',
    ' Fwd Avg Bulk Rate',
    ' Bwd Avg Bytes/Bulk',
    ' Bwd Avg Packets/Bulk',
    'Bwd Avg Bulk Rate',
]
# Rebind instead of mutating in place. A missing column still raises KeyError,
# so re-running this cell requires re-running the loading cell first.
testdata_kafka = testdata_kafka.drop(columns=cols_to_drop)

# Step 2: replace +/-infinity with NaN (drop_infinity prints a count and
# returns the cleaned frame; no rows are removed).
testdata_kafka = drop_infinity(testdata_kafka)

the number of infinity: 164


In [83]:
# ---------------- transform all features with MinMaxScaler() ----------------
# The scaler is fitted on the training frame only; the fitted instance is then
# applied to both the training frame and the Kafka test frame.
# NOTE(review): fitting happens on the full training set, outside the
# cross-validation folds used later -- confirm this is intended.
mm = MinMaxScaler().fit(data_aftcat)

# Re-wrap the scaled arrays as DataFrames labelled with the scaler's feature names.
data_aftcat_aftmm = pd.DataFrame(
    mm.transform(data_aftcat),
    columns=mm.get_feature_names_out(),
)
testdata_kafka_aftmm = pd.DataFrame(
    mm.transform(testdata_kafka),
    columns=mm.get_feature_names_out(),
)

In [84]:
# -------------------------functions--------------
# function score_mean_std: calculate the average and std of scores
def score_mean_std(score_sf):
    """Summarise every column of a score table by its mean and standard deviation.

    Parameters
    ----------
    score_sf : pandas.DataFrame
        One column per metric, one row per run (e.g. per cross-validation fold).

    Returns
    -------
    pandas.DataFrame
        Two rows labelled ``"avg"`` and ``"std"`` with the same columns as
        ``score_sf``. A frame without columns yields an empty 2 x 0 frame.
    """
    # np.average / np.std are applied column by column; np.std is called with
    # its default ddof=0, i.e. the population standard deviation.
    averages = [np.average(column) for _, column in score_sf.items()]
    deviations = [np.std(column) for _, column in score_sf.items()]
    return pd.DataFrame(
        [averages, deviations],
        index=["avg", "std"],
        columns=score_sf.columns,
    )

In [85]:
# -------------- train predictive model with pipeline: SMOTE(), VarianceThreshold(), cross_validate() --------
def train_model(sm, se, model, X, y, cv=10):
    """Cross-validate a sampler -> selector -> model pipeline.

    Parameters
    ----------
    sm : sampler
        Resampling step placed first in the pipeline (the caller passes SMOTE).
    se : transformer
        Feature-selection step (the caller passes VarianceThreshold).
    model : estimator
        Final classifier.
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.DataFrame, pandas.Series or array-like
        Labels; flattened to one dimension before use.
    cv : int or cross-validation splitter, default=10
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    score_sf : dict
        Raw ``cross_validate`` output (per-fold timings and test scores).
    score_sf_re : pandas.DataFrame
        Mean / standard deviation of every entry of ``score_sf``, as produced
        by ``score_mean_std``.
    """
    # Function-scope import: makes the submodule dependency explicit instead of
    # relying on attribute access through the bare `imblearn` package.
    from imblearn.pipeline import Pipeline

    id_pipeline = Pipeline(steps=[('smo', sm), ('selector', se), ('model', model)])

    # A list, not a set: `scoring` is documented as str / callable / list /
    # tuple / dict, and a list keeps the score columns in the same order on
    # every run (set iteration order of strings is not stable across runs).
    scoring = ['f1', 'precision', 'accuracy', 'recall', 'roc_auc']

    score_sf = cross_validate(
        id_pipeline,
        X,
        np.asarray(y).ravel(),
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )
    score_sf_re = score_mean_std(pd.DataFrame(score_sf))

    return score_sf, score_sf_re

In [86]:
# -------------- evaluate on new data with pipeline: VarianceThreshold(), model, cross_validate() --------
def pred_kafka(se, model, X, y, cv=10):
    """Cross-validate a selector -> model pipeline (no resampling step).

    Note that this does not apply an already trained model: ``cross_validate``
    fits the pipeline again on the training part of every fold of ``X``.

    Parameters
    ----------
    se : transformer
        Feature-selection step (the caller passes VarianceThreshold).
    model : estimator
        Final classifier.
    X : pandas.DataFrame or array-like
        Feature matrix.
    y : pandas.DataFrame, pandas.Series or array-like
        Labels; flattened to one dimension before use.
    cv : int or cross-validation splitter, default=10
        Forwarded unchanged to ``cross_validate``.

    Returns
    -------
    score_sf : dict
        Raw ``cross_validate`` output (per-fold timings and test scores).
    score_sf_re : pandas.DataFrame
        Mean / standard deviation of every entry of ``score_sf``, as produced
        by ``score_mean_std``.
    """
    # Function-scope import: a bare `import sklearn` does not by itself
    # guarantee that the `sklearn.pipeline` submodule has been loaded.
    from sklearn.pipeline import Pipeline

    id_pipeline = Pipeline(steps=[('selector', se), ('model', model)])

    # A list, not a set: `scoring` is documented as str / callable / list /
    # tuple / dict, and a list keeps the score columns in the same order on
    # every run (set iteration order of strings is not stable across runs).
    scoring = ['f1', 'precision', 'accuracy', 'recall', 'roc_auc']

    score_sf = cross_validate(
        id_pipeline,
        X,
        np.asarray(y).ravel(),
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
    )
    score_sf_re = score_mean_std(pd.DataFrame(score_sf))

    return score_sf, score_sf_re

In [88]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ train models: RF, GRB, MLP, DT ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# repeat several times for parameters tuning
# author:           Kun Yan
# student number:   300259303
# date:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Seed the oversampler as well: the forest below is already seeded, but an
# unseeded SMOTE would still make the cross-validation scores vary per run.
smo = SMOTE(random_state=90)
# Threshold = median of the per-feature variances of the scaled training data,
# so approximately the lower-variance half of the features is filtered out.
selector = VarianceThreshold(np.median(data_aftcat_aftmm.var().values))
# Random Forest model by parameters tuning
model_rf = RandomForestClassifier(
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=17,
    max_depth=23,
    n_estimators=115,
    n_jobs=-1,
    random_state=90,
)
# Tuned gradient-boosting configuration, kept for reference (not trained here):
#model_grb = GradientBoostingClassifier(max_features=3,learning_rate=0.1,n_estimators=130,min_samples_split=100,min_samples_leaf=7,max_depth=15,random_state = 10)


# -------------------- model: RF (repeat several times for parameters tuning)-----------------------
print(model_rf)
score_sf_rf, score_sf_re_rf = train_model(smo, selector, model_rf, data_aftcat_aftmm, data_aftcat_y)
# score_sf_re_rf is already a DataFrame (avg / std per metric); print it as is.
print("score_sf_re_rf: \n", score_sf_re_rf)
# Per-fold F1 scores of the random forest, as a one-column frame named "RF".
RF = pd.DataFrame(score_sf_rf['test_f1'], columns=["RF"])

RandomForestClassifier(criterion='entropy', max_depth=23, max_features=17,
                       n_estimators=115, n_jobs=-1, random_state=90)
score_sf_re_rf: 
       fit_time  score_time  test_precision  test_accuracy   test_f1  \
avg  35.943109    0.182465        0.990226       0.998505  0.991753   
std   7.966647    0.017226        0.004332       0.000886  0.004891   

     test_recall  test_roc_auc  
avg     0.993313      0.999650  
std     0.007516      0.000704  


In [89]:
#  ----------------------------- Prediction method 1: kafka data + cv -----------------------------
# Cross-validate the selector + random-forest pipeline on the scaled Kafka
# frame and keep both the raw per-fold scores and their avg/std summary.
print("= = = = = = = = = prediction score for kafka data = = = = = = = =")
(
    pred_score_sf_kafka_rf,
    pred_score_sf_re_kafka_rf,
) = pred_kafka(
    se=selector,
    model=model_rf,
    X=testdata_kafka_aftmm,
    y=testdata_kafka_label,
)
print("pred_score_sf_re_kafka_rf: \n", pd.DataFrame(pred_score_sf_re_kafka_rf))

# Per-fold F1 scores on the Kafka data, as a one-column frame named "pred_RF".
pred_RF = pd.DataFrame({"pred_RF": pred_score_sf_kafka_rf['test_f1']})

= = = = = = = = = prediction score for kafka data = = = = = = = =
pred_score_sf_re_kafka_rf: 
       fit_time  score_time  test_precision  test_accuracy   test_f1  \
avg  92.763388    0.245371        0.993116        0.99881  0.993967   
std  23.549272    0.044421        0.001546        0.00037  0.001885   

     test_recall  test_roc_auc  
avg     0.994826      0.999892  
std     0.003408      0.000155  
