In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_selection import VarianceThreshold
import imblearn
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils.multiclass import type_of_target

In [73]:
# write the print content to file 
f1 = open("2-2-5 IoT predictive scores for an static model by target names.log",'w+')
f2 = open("2-2-5 IoT predictive scores for an static model with cv.log",'w+')
f3 = open("2-2-5 IoT predictive scores for an static model on kafka data with cv.log",'w+')
f4 = open("2-2-5 IoT predictive scores for an static model on kafka data by target names.log",'w+')

In [74]:
def drop_infinity(X):
    # check for values approaching infinity
    X = X.replace([np.inf, -np.inf], np.nan)
    print("the number of infinity:", X.isna().sum().sum())
    return X

In [75]:
ids_datatypes = {
    'MI_dir_L5_weight': np.float64,
    'MI_dir_L5_mean': np.float64,
    'MI_dir_L5_variance': np.float64,
    'MI_dir_L3_weight': np.float64,
    'MI_dir_L3_mean': np.float64,
    'MI_dir_L3_variance': np.float64,
    'MI_dir_L1_weight': np.float64,
    'MI_dir_L1_mean': np.float64,
    'MI_dir_L1_variance': np.float64,
    'MI_dir_L0.1_weight': np.float64,
    'MI_dir_L0.1_mean': np.float64,
    'MI_dir_L0.1_variance': np.float64,
    'MI_dir_L0.01_weight': np.float64,
    'MI_dir_L0.01_mean': np.float64,
    'MI_dir_L0.01_variance': np.float64,
    'H_L5_weight': np.float64,
    'H_L5_mean': np.float64,
    'H_L5_variance': np.float64,
    'H_L3_weight': np.float64,
    'H_L3_mean': np.float64,
    'H_L3_variance': np.float64,
    'H_L1_weight': np.float64,
    'H_L1_mean': np.float64,
    'H_L1_variance': np.float64,
    'H_L0.1_weight': np.float64,
    'H_L0.1_mean': np.float64,
    'H_L0.1_variance': np.float64,
    'H_L0.01_weight': np.float64,
    'H_L0.01_mean': np.float64,
    'H_L0.01_variance': np.float64,
    'HH_L5_weight': np.float64,
    'HH_L5_mean': np.float64,
    'HH_L5_std': np.float64,
    'HH_L5_magnitude': np.float64,
    'HH_L5_radius': np.float64,
    'HH_L5_covariance': np.float64,
    'HH_L5_pcc': np.float64,
    'HH_L3_weight': np.float64,
    'HH_L3_mean': np.float64,
    'HH_L3_std': np.float64,
    'HH_L3_magnitude': np.float64,
    'HH_L3_radius': np.float64,
    'HH_L3_covariance': np.float64,
    'HH_L3_pcc': np.float64,
    'HH_L1_weight': np.float64,
    'HH_L1_mean': np.float64,
    'HH_L1_std': np.float64,
    'HH_L1_magnitude': np.float64,
    'HH_L1_radius': np.float64,
    'HH_L1_covariance': np.float64,
    'HH_L1_pcc': np.float64,
    'HH_L0.1_weight': np.float64,
    'HH_L0.1_mean': np.float64,
    'HH_L0.1_std': np.float64,
    'HH_L0.1_magnitude': np.float64,
    'HH_L0.1_radius': np.float64,
    'HH_L0.1_covariance': np.float64,
    'HH_L0.1_pcc': np.float64,
    'HH_L0.01_weight': np.float64,
    'HH_L0.01_mean': np.float64,
    'HH_L0.01_std': np.float64,
    'HH_L0.01_magnitude': np.float64,
    'HH_L0.01_radius': np.float64,
    'HH_L0.01_covariance': np.float64,
    'HH_L0.01_pcc': np.float64,
    'HH_jit_L5_weight': np.float64,
    'HH_jit_L5_mean': np.float64,
    'HH_jit_L5_variance': np.float64,
    'HH_jit_L3_weight': np.float64,
    'HH_jit_L3_mean': np.float64,
    'HH_jit_L3_variance': np.float64,
    'HH_jit_L1_weight': np.float64,
    'HH_jit_L1_mean': np.float64,
    'HH_jit_L1_variance': np.float64,
    'HH_jit_L0.1_weight': np.float64,
    'HH_jit_L0.1_mean': np.float64,
    'HH_jit_L0.1_variance': np.float64,
    'HH_jit_L0.01_weight': np.float64,
    'HH_jit_L0.01_mean': np.float64,
    'HH_jit_L0.01_variance': np.float64,
    'HpHp_L5_weight': np.float64,
    'HpHp_L5_mean': np.float64,
    'HpHp_L5_std': np.float64,
    'HpHp_L5_magnitude': np.float64,
    'HpHp_L5_radius': np.float64,
    'HpHp_L5_covariance': np.float64,
    'HpHp_L5_pcc': np.float64,
    'HpHp_L3_weight': np.float64,
    'HpHp_L3_mean': np.float64,
    'HpHp_L3_std': np.float64,
    'HpHp_L3_magnitude': np.float64,
    'HpHp_L3_radius': np.float64,
    'HpHp_L3_covariance': np.float64,
    'HpHp_L3_pcc': np.float64,
    'HpHp_L1_weight': np.float64,
    'HpHp_L1_mean': np.float64,
    'HpHp_L1_std': np.float64,
    'HpHp_L1_magnitude': np.float64,
    'HpHp_L1_radius': np.float64,
    'HpHp_L1_covariance': np.float64,
    'HpHp_L1_pcc': np.float64,
    'HpHp_L0.1_weight': np.float64,
    'HpHp_L0.1_mean': np.float64,
    'HpHp_L0.1_std': np.float64,
    'HpHp_L0.1_magnitude': np.float64,
    'HpHp_L0.1_radius': np.float64,
    'HpHp_L0.1_covariance': np.float64,
    'HpHp_L0.1_pcc': np.float64,
    'HpHp_L0.01_weight': np.float64,
    'HpHp_L0.01_mean': np.float64,
    'HpHp_L0.01_std': np.float64,
    'HpHp_L0.01_magnitude': np.float64,
    'HpHp_L0.01_radius': np.float64,
    'HpHp_L0.01_covariance': np.float64,
    'HpHp_L0.01_pcc': np.float64,
    #'Source':object,
    'x0_0': np.float64,
    'x0_1': np.float64,
    'x0_2': np.float64,
    'x0_3': np.float64,
    'x0_4': np.float64,
    'x0_5': np.float64,
    'x0_6': np.float64,
    'x0_7': np.float64,
    'x0_8': np.float64,
    #'Class':object
}
used_cols = (ids_datatypes.keys())

In [76]:
#read data from dataset (after feature extraction )
# train model: aftcat, CV(Pipeline(imbalanced, MinMaxScaler, selector, model))
data_aftcat = pd.read_csv(r"./2-2-1 5388 data_aftcat.csv")
data_aftcat_y = pd.read_csv(r"./2-2-1 5388 data_aftcat class.csv")

In [77]:
testdata_kafka = pd.read_csv('./2-2-4 5388 data_kafka_2.csv',dtype=ids_datatypes, usecols=used_cols, low_memory=False)
testdata_kafka_label = pd.read_csv(r"./2-2-4 5388 data_kafka_2 label.csv")

In [78]:
# Step 1: drop infinity
testdata_kafka = drop_infinity(testdata_kafka)
#data = drop_na(testdata_kafka)
#print("after drop infinity:", testdata_kafka.shape)

the number of infinity: 0


In [79]:
# ---------------- transform all feature with MinMaxScaler()---------------- 
mm = StandardScaler().fit(data_aftcat)

data_aftcat_aftmm = pd.DataFrame(mm.transform(data_aftcat))
data_aftcat_aftmm.columns = mm.get_feature_names_out()

testdata_kafka_aftmm = pd.DataFrame(mm.transform(testdata_kafka))
testdata_kafka_aftmm.columns = mm.get_feature_names_out()

In [80]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ train models: RF, GRB, MLP, DT ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# repeat several times for parameters tuning
# author:           Kun Yan
# student number:   300259303
# data:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# ---------------- Split train datasets and test ---------------
# split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(data_aftcat_aftmm, data_aftcat_y, test_size=0.3, random_state=42)

In [81]:
# ---------------- train model and get score by attack types ---------------
#mm = MinMaxScaler()
smo = SMOTE()
selector = VarianceThreshold(np.median(X_train.var().values))
# Random Forest model by parameters tuning
model_rf = RandomForestClassifier(criterion='entropy',min_samples_split=2, min_samples_leaf=1, max_features=17, max_depth=23, n_estimators=115,n_jobs=-1,random_state = 90)
#model_grb = GradientBoostingClassifier(max_features=3,learning_rate=0.1,n_estimators=130,min_samples_split=100,min_samples_leaf=7,max_depth=15,random_state = 10)


In [82]:
pipe_steps = [('smo',smo),('selector',selector),('model',model_rf)]
mul_pipe = imblearn.pipeline.Pipeline(steps=pipe_steps)

mul_pipe.fit(X_train, y_train.values.ravel())

Pipeline(steps=[('smo', SMOTE()),
                ('selector', VarianceThreshold(threshold=0.9987156517276344)),
                ('model',
                 RandomForestClassifier(criterion='entropy', max_depth=23,
                                        max_features=17, n_estimators=115,
                                        n_jobs=-1, random_state=90))])

In [83]:
mul_y_pred = mul_pipe.predict(X_test)
#mul_pipe.score(X_test, y_test)
mul_score = classification_report(y_test,mul_y_pred,target_names=['BENIGN','mirai_udp_attack', 'mirai_ack_attack', 
    'gafgyt_scan_attack', 'mirai_scan_attack', 'gafgyt_tcp_attack',
    'gafgyt_udp_attack', 'gafgyt_junk_attack', 'gafgyt_combo_attack',
    'mirai_syn_attack', 'mirai_udpplain_attack'],output_dict=True)
print("= = = = = = = = = = prediction score for static model by target names")
print("= = = = = = = = = = prediction score for static model by target names",file=f1)
print(mul_score,file=f1)
print(mul_score)

= = = = = = = = = = prediction score for static model by target names
{'BENIGN': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 6639}, 'mirai_udp_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 4}, 'mirai_ack_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 104}, 'gafgyt_scan_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 53}, 'mirai_scan_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 51}, 'gafgyt_tcp_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 126}, 'gafgyt_udp_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 30}, 'gafgyt_junk_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 26}, 'gafgyt_combo_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 28}, 'mirai_syn_attack': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 522}, 'mirai_udpplain_attack': {'precision': 1.0, 'recall': 1

# train model with cv 

In [84]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ train models using CV ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# repeat several times for parameters tuning
# author:           Kun Yan
# student number:   300259303
# data:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

def train_model(sm, se, model, X, y):
    #print(model, "\n")
    #pipe_steps = [('smo',sm),('mm',m),('selector',se),('model',model)]
    pipe_steps = [('smo',sm),('selector',se),('model',model)]
    id_pipeline = imblearn.pipeline.Pipeline(steps=pipe_steps)

    
    # evaluate the pipeline using the crossvalidation technique defined in cv

    # ------------------- score ------------------------------------
    
    scoring = {'f1_micro', 'precision_micro', 'balanced_accuracy','recall_micro'}
    score_sf = cross_validate(id_pipeline, X, y.values.ravel(),cv=10,scoring=scoring, n_jobs=-1)
    #sorted(score_sf.keys())
    #score_sf_re = score_mean_std(pd.DataFrame(score_sf))
    return score_sf

In [85]:
# -------------- train predictive model with pipeline: SMOTE(),VarianceThreshold(), cross_validate()--------
#mm = MinMaxScaler()
smo = SMOTE()
selector = VarianceThreshold(np.median(data_aftcat_aftmm.var().values))
# Random Forest model by parameters tuning
model_rf = RandomForestClassifier(criterion='entropy',min_samples_split=2, min_samples_leaf=1, max_features=17, max_depth=23, n_estimators=115,n_jobs=-1,random_state = 90)
#model_grb = GradientBoostingClassifier(max_features=3,learning_rate=0.1,n_estimators=130,min_samples_split=100,min_samples_leaf=7,max_depth=15,random_state = 10)


In [86]:
# -------------------- model: RF (repeat several times for parameters tuning)-----------------------
print("= = = = = = = = = = = = = = prediction score for a static model RF with CV = = = = = ")
print("= = = = = = = = = = = = = = prediction score for a static model RF with CV = = = = = ",file=f2)
print(model_rf)
print(model_rf,file=f2)

#score_sf_rf,score_sf_re_rf = train_model(smo, selector, model_rf, data_aftcat_aftmm, data_aftcat_y)
score_sf_rf = train_model(smo, selector, model_rf, data_aftcat_aftmm, data_aftcat_y)

print(score_sf_rf)
print(score_sf_rf,file=f2)

= = = = = = = = = = = = = = prediction score for a static model RF with CV = = = = = 
RandomForestClassifier(criterion='entropy', max_depth=23, max_features=17,
                       n_estimators=115, n_jobs=-1, random_state=90)
{'fit_time': array([261.41063595, 263.53414512, 263.64361501, 260.66043496,
       257.13157296, 254.11399722, 255.23360205, 256.5883882 ,
       132.99567604, 131.89912009]), 'score_time': array([0.08044219, 0.11874795, 0.11082697, 0.07692003, 0.07893705,
       0.07331371, 0.09712911, 0.08731198, 0.07222176, 0.06180501]), 'test_recall_micro': array([0.99960459, 1.        , 0.99960459, 0.99960459, 0.99960443,
       0.99920886, 0.99920886, 0.99920886, 1.        , 0.99960443]), 'test_f1_micro': array([0.99960459, 1.        , 0.99960459, 0.99960459, 0.99960443,
       0.99920886, 0.99920886, 0.99920886, 1.        , 0.99960443]), 'test_precision_micro': array([0.99960459, 1.        , 0.99960459, 0.99960459, 0.99960443,
       0.99920886, 0.99920886, 0.99920886, 

In [87]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ prediction with kafka data +cv ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# repeat several times for parameters tuning
# author:           Kun Yan
# student number:   300259303
# data:             2021-10-03
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# -------------- prediction with pipeline: SMOTE(),VarianceThreshold(), cross_validate()--------
def pred_kafka(se, model, X, y):
    #print(model, "\n")
    #pipe_steps = [('mm',m),('selector',se),('model',model)]
    pipe_steps = [('selector',se),('model',model)]
    id_pipeline = sklearn.pipeline.Pipeline(steps=pipe_steps)

    # evaluate the pipeline using the crossvalidation technique defined in cv

    # ------------------- score ------------------------------------
    scoring = {'f1_micro', 'precision_micro', 'balanced_accuracy','recall_micro'}
    score_sf = cross_validate(id_pipeline, X, y.values.ravel(),cv=10,scoring=scoring, n_jobs=-1)
    return score_sf


print("= = = = = = = = = prediction score for kafka data = = = = = = = =")
print("= = = = = = = = = prediction score for kafka data = = = = = = = =",file=f3)
pred_score_sf_kafka_rf = pred_kafka(selector, model_rf, testdata_kafka_aftmm, testdata_kafka_label)
print(pred_score_sf_kafka_rf)
print(pred_score_sf_kafka_rf,file=f3)

= = = = = = = = = prediction score for kafka data = = = = = = = =
{'fit_time': array([202.76798391, 191.77870178, 179.03235507, 193.20384288,
       167.80275702, 157.58115101, 144.62086105, 149.98726892,
        83.72340894,  77.20401812]), 'score_time': array([0.13840199, 0.16613913, 0.18430591, 0.14023304, 0.18555593,
       0.17454219, 0.13685918, 0.13795304, 0.13421106, 0.12044597]), 'test_recall_micro': array([0.99070093, 0.99110089, 0.99080092, 0.9911    , 0.991     ,
       0.9997    , 0.9909    , 0.9903    , 0.9905    , 0.9906    ]), 'test_f1_micro': array([0.99070093, 0.99110089, 0.99080092, 0.9911    , 0.991     ,
       0.9997    , 0.9909    , 0.9903    , 0.9905    , 0.9906    ]), 'test_precision_micro': array([0.99070093, 0.99110089, 0.99080092, 0.9911    , 0.991     ,
       0.9997    , 0.9909    , 0.9903    , 0.9905    , 0.9906    ]), 'test_balanced_accuracy': array([0.84936742, 0.90908045, 0.88358789, 0.90791495, 0.90779441,
       0.99775865, 0.90573998, 0.81938479, 0.

In [90]:
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ prediction with kafka by target names ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# repeat several times for parameters tuning
# author:           Kun Yan
# student number:   300259303
# data:             2021-10-30
# Python version:   3.9.7
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#target_names = ['class 0', 'class 1', 'class 2']
#print(classification_report(y_true, y_pred, target_names=target_names))
print("= = = = = = = =  prediction score for kafka data by target names = = = = =  ")
print("= = = = = = = =  prediction score for kafka data by target names = = = = =  ",file=f4)


#pre_pipe_steps = [('selector',selector),('model',model_rf)]
pre_pipe_steps = [('model',model_rf)]
pre_id_pipeline = sklearn.pipeline.Pipeline(steps=pre_pipe_steps)
pre_id_pipeline.fit(data_aftcat_aftmm, data_aftcat_y.values.ravel())
pred_kafka_y = pre_id_pipeline.predict(testdata_kafka_aftmm)

mul_score_kafka = classification_report(testdata_kafka_label,pred_kafka_y,target_names=['BENIGN','mirai_udp_attack', 'mirai_ack_attack', 
    'gafgyt_scan_attack', 'mirai_scan_attack', 'gafgyt_tcp_attack',
    'gafgyt_udp_attack', 'gafgyt_junk_attack', 'gafgyt_combo_attack',
    'mirai_syn_attack', 'mirai_udpplain_attack'],output_dict=True)
print(mul_score_kafka,file=f4)
print(mul_score_kafka)

= = = = = = = =  prediction score for kafka data by target names = = = = =  
{'BENIGN': {'precision': 0.9998619150086878, 'recall': 0.9999884914606638, 'f1-score': 0.9999251992289767, 'support': 86892}, 'mirai_udp_attack': {'precision': 0.95, 'recall': 0.5135135135135135, 'f1-score': 0.6666666666666667, 'support': 37}, 'mirai_ack_attack': {'precision': 0.9885568976478067, 'recall': 0.9987154784842646, 'f1-score': 0.9936102236421726, 'support': 1557}, 'gafgyt_scan_attack': {'precision': 1.0, 'recall': 0.9974424552429667, 'f1-score': 0.998719590268886, 'support': 782}, 'mirai_scan_attack': {'precision': 0.9988518943742825, 'recall': 0.9965635738831615, 'f1-score': 0.9977064220183487, 'support': 873}, 'gafgyt_tcp_attack': {'precision': 0.9994089834515366, 'recall': 0.9988186650915535, 'f1-score': 0.9991137370753324, 'support': 1693}, 'gafgyt_udp_attack': {'precision': 1.0, 'recall': 0.996996996996997, 'f1-score': 0.9984962406015038, 'support': 333}, 'gafgyt_junk_attack': {'precision': 1.0