In [1]:
import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [None]:
def load_data_binary():
    # Default values.
    train_set = 'data/UNSW_NB15_training-set.csv'
    test_set = 'data/UNSW_NB15_testing-set.csv'
    train = pd.read_csv(train_set, index_col='id')  # 指定“id”这一列数据作为行索引
    test = pd.read_csv(test_set, index_col='id')  # 指定“id”这一列数据作为行索引

    # 二分类数据
    training_label = train['label']  # 将train的“label”这一列的值单独取出来
    testing_label = test['label']  # 将test的“label”这一列的值单独取出来

    # Creates new dummy columns from each unique string in a particular feature 创建新的虚拟列
    labels = pd.concat([training_label, testing_label])  # 将train和test拼接在一起
    unsw = pd.concat([train, test])  # 将train和test拼接在一起
    unsw = pd.get_dummies(data=unsw,
                          columns=['proto', 'service', 'state'])  # 将'proto', 'service', 'state'这三列使用one-hot-encoder转变
    # Normalising all numerical features:
    unsw.drop(['label', 'attack_cat'], axis=1,
              inplace=True)  # 删除'label', 'attack_cat'这两列，其中(inplace=True)是直接对原dataFrame进行操作
    unsw_value = unsw.values

    scaler = MinMaxScaler(feature_range=(0, 1))  # 初始化MinMaxScaler
    unsw_value = scaler.fit_transform(unsw_value)  # 将待处理数据矩阵进行归一化

    return unsw_value, labels

def load_data_multi():
    # Default values.
    train_set = 'data/UNSW_NB15_training-set.csv'
    test_set = 'data/UNSW_NB15_testing-set.csv'
    train = pd.read_csv(train_set, index_col='id')  # 指定“id”这一列数据作为行索引
    test = pd.read_csv(test_set, index_col='id')  # 指定“id”这一列数据作为行索引

    # 多分类数据
    training_label = train['attack_cat']  # 将train的“attack_cat”这一列的值单独取出来
    testing_label = test['attack_cat']  # 将test的“attack_cat”这一列的值单独取出来
    labels = pd.concat([training_label, testing_label])  # 将train和test拼接在一起
    le = LabelEncoder()
    le.fit(["Normal","Generic","Analysis","Reconnaissance","Fuzzers","DoS",
            "Exploits","Shellcode","Backdoor","Worms"])
    labels = np.array(le.transform(labels)).reshape(-1,1)
    scaler = MinMaxScaler(feature_range=(0, 1))  # 初始化MinMaxScaler
    labels = scaler.fit_transform(labels)  # 将待处理数据矩阵进行归一化
    return labels

In [17]:
# load data
print("Load data...")
train, train_label_binary = load_data_binary()
train_label_multi = load_data_multi()
print("train shape: ", train.shape)
train_label_binary = np.array(train_label_binary).reshape((-1, 1))
train_label_multi = np.array(train_label_multi).reshape((-1, 1))
print("train_label_binary shape: ", train_label_binary.shape)
print("train_label_multi shape: ", train_label_multi.shape)


np.save('data/encoded_train.npy', train)
np.save('data/train_label_binary.npy', train_label_binary)
np.save('data/train_label_multi.npy', train_label_multi)

Load data...
train shape:  (257673, 196)
train_label_binary shape:  (257673, 1)
train_label_multi shape:  (257673, 1)


In [None]:
# Load the dataset and labels

x = np.load('data/encoded_train.npy')
y = np.load('data/train_label_binary.npy')
y = y.reshape(y.shape[0])
# Calculate the rank of each feature
R = []
for h in range(x.shape[1]):
    kmeans = KMeans(init='k-means++', n_clusters=np.unique(y).shape[0], n_init=10)
    ff = kmeans.fit_predict(x[:, h].reshape(-1, 1))
    r = metrics.homogeneity_score(y, ff)  # Use the homogeneity score as a rank of the feature
    R.append(r)

# Arrange feature accroding to thier ranks
Rnk = np.argsort(np.array(R))
np.save('data/Rnk_binary.npy', Rnk)

In [None]:
# Initiate the cross-validation splitter
kfolds = StratifiedKFold(n_splits=5, shuffle=True)

# Per each set of ranks, use cross-validation to calculate accuracy.
smr = []
et=0
print(x.shape)
for j in range(Rnk.shape[0]):
    fd = x[:, Rnk[j:]]
    pp = 0
    lpa = np.zeros((0, 2))
    for train, test in kfolds.split(fd, y):
        train_t_x = fd[train]
        train_t_y = y[train]
        test_t_x=fd[test]
        test_t_y=y[test]
        model = Sequential()
        model.add(Dense(units=128, input_dim=fd.shape[1], activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(units=64, activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(units=32, activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(units=1, activation='sigmoid',
                        use_bias=True))  # The number of neurons is equal to the number of classes
        # model.summary()
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        history = model.fit(train_t_x,train_t_y, epochs=40, verbose=2,validation_data=(test_t_x,test_t_y))
        st = time.time()
        ff=model.predict(test_t_x)
        et+=time.time()-st
        ts = np.array(list(map(lambda x: x[0], ff.reshape(-1, 1))))
        pp = pp + metrics.accuracy_score(y[test].reshape(-1, 1), (ts >= 0.5).astype(int))
        lpa = np.vstack((lpa, np.hstack((y[test].reshape(-1, 1), ts.reshape(-1, 1)))))

    pp = pp / kfolds.n_splits
    np.savetxt('F%d_binary.csv' % j, lpa, delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

In [10]:
#Per each set of ranks, use cross-validation to calculate accuracy.
kfolds = StratifiedKFold(n_splits=5, shuffle=True)
smr=[]
et=0
for j in range(Rnk.shape[0]):
    fd=x[:,Rnk[j:]]
    pp=0
    lpa=np.zeros((0,2))
    for train,test in kfolds.split(fd,y):
        train_t_x = fd[train]
        train_t_y = y[train]
        test_t_x=fd[test]
        test_t_y=y[test]
        model = GradientBoostingClassifier()
        model.fit(train_t_x,train_t_y)
        st = time.time()
        predictions = model.predict(test_t_x)
        labelsAndPredictions = np.array([test_t_y,predictions])
        lpa=np.vstack((lpa,labelsAndPredictions.reshape(-1,2)))
        et+=time.time()-st
        num=0
        si = labelsAndPredictions.shape[1]
        label_t = labelsAndPredictions[0]
        label_p = labelsAndPredictions[1]
        for i in range(si):
            if label_t[i] == label_p[i]:
                num += 1
        acc = num / float(si)
        pp=pp+acc
    pp=pp/kfolds.n_splits
    np.savetxt('F%d_GBT_B.csv'%j,lpa,delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

In [10]:
#Initiate the cross-validation splitter
kfolds=StratifiedKFold(n_splits=5,shuffle=True)
#Per each set of ranks, use cross-validation to calculate accuracy.
smr=[]
et=0
for j in range(Rnk.shape[0]):
    fd=x[:,Rnk[j:]]
    pp=0
    lpa=np.zeros((0,2))
    for train,test in kfolds.split(fd,y):
        train_t_x = fd[train]
        train_t_y = y[train]
        test_t_x=fd[test]
        test_t_y=y[test]
        model = RandomForestClassifier(n_estimators=100) #The number of classes in the dataset
        model.n_outputs_=np.unique(y).shape[0]
        model.fit(train_t_x,train_t_y)
        st = time.time()
        predictions = model.predict(test_t_x)
        labelsAndPredictions = np.array([test_t_y,predictions])
        lpa=np.vstack((lpa,labelsAndPredictions.reshape(-1,2)))
        et+=time.time()-st
        num=0
        si = labelsAndPredictions.shape[1]
        label_t = labelsAndPredictions[0]
        label_p = labelsAndPredictions[1]
        for i in range(si):
            if label_t[i] == label_p[i]:
                num += 1
        acc = num / float(si)
        print(acc)
        pp=pp+acc
    pp=pp/kfolds.n_splits
    np.savetxt('F%d_RF_B.csv'%j,lpa,delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

0.9504026389832153
0.9501115746579994
0.9496846803143495
0.9505375092172158
0.9526138083595296


In [19]:
# Load the dataset and labels

x = np.load('data/encoded_train.npy')
y = np.load('data/train_label_multi.npy')
y=y.reshape(y.shape[0])
# Calculate the rank of each feature
R = []
for h in range(x.shape[1]):
    kmeans = KMeans(init='k-means++', n_clusters=np.unique(y).shape[0], n_init=10)
    ff = kmeans.fit_predict(x[:, h].reshape(-1, 1))
    r = metrics.homogeneity_score(y, ff)  # Use the homogeneity score as a rank of the feature
    R.append(r)
# Arrange feature accroding to thier ranks
Rnk = np.argsort(np.array(R))
np.save('data/Rnk_multi.npy', Rnk)

  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.fit(X, sample_weight=sample_weight).labels_
  return self.

In [13]:
x = np.load('data/encoded_train.npy')
y = np.load('data/train_label_multi.npy')
y = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1,1))

Rnk=np.load('data/Rnk_multi.npy')
# Initiate the cross-validation splitter
kfolds = KFold(n_splits=5, shuffle=True)
et=0
# Per each set of ranks, use cross-validation to calculate accuracy.
smr = []
print(x.shape)
features_count=1
for j in range(Rnk.shape[0]):
    print('Processing feature #',features_count)
    features_count+=1
    fd = x[:, Rnk[j:]]
    pp = 0
    lpa = np.zeros((0, 2))
    fold=1
    for train, test in kfolds.split(fd, y):
        print('Processing fold #',fold)
        fold+=1
        train_t_x = fd[train]
        train_t_y = y[train]
        test_t_x=fd[test]
        test_t_y=y[test]
        model = Sequential()
        model.add(Dense(units=128, input_dim=fd.shape[1], activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(units=64, activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(units=32, activation='relu', use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(10,activation='softmax',use_bias=True)) #The number of neurons is equal to the number of classes
        #model.summary()
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
        history = model.fit(train_t_x,train_t_y, epochs=1, verbose=2,validation_data=(test_t_x,test_t_y))
        st = time.time()
        ff=model.predict(test_t_x)
        et+=time.time()-st
        ts = np.array(list(map(lambda x: x[0], ff.reshape(-1, 1))))
        pp = pp + metrics.accuracy_score(y[test].reshape(-1, 1), (ts >= 0.5).astype(int))
        lpa = np.vstack((lpa, np.hstack((y[test].reshape(-1, 1), ts.reshape(-1, 1)))))
    pp = pp / kfolds.n_splits
    np.savetxt('F%d_multi.csv' % j, lpa, delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate



(257673, 196)
Processing feature # 1
Processing fold # 1
6442/6442 - 18s - loss: 0.7521 - categorical_accuracy: 0.7186 - val_loss: 0.5973 - val_categorical_accuracy: 0.7603
0.9558144950033958
Processing fold # 2
6442/6442 - 17s - loss: 0.7552 - categorical_accuracy: 0.7164 - val_loss: 0.5932 - val_categorical_accuracy: 0.7575
1.9113476278257495
Processing fold # 3
6442/6442 - 18s - loss: 0.7624 - categorical_accuracy: 0.7147 - val_loss: 0.6032 - val_categorical_accuracy: 0.7588
2.866550887746192
Processing fold # 4
6442/6442 - 18s - loss: 0.7572 - categorical_accuracy: 0.7184 - val_loss: 0.6005 - val_categorical_accuracy: 0.7597
3.8213787683686933
Processing fold # 5
6442/6442 - 18s - loss: 0.7552 - categorical_accuracy: 0.7168 - val_loss: 0.6116 - val_categorical_accuracy: 0.7587
4.77506177376319
Processing feature # 2
Processing fold # 1


KeyboardInterrupt: 

In [16]:
x = np.load('data/encoded_train.npy')
y = np.load('data/train_label_multi.npy')
y = OneHotEncoder(sparse=False).fit_transform(y.reshape(-1,1))

Rnk=np.load('data/Rnk_multi.npy')
#Initiate the cross-validation splitter
kfolds=KFold(n_splits=5,shuffle=True)

#Per each set of ranks, use cross-validation to calculate accuracy.
smr=[]
et=0
print(x.shape)
features_count=1
for j in range(Rnk.shape[0]):
    print('Processing feature #',features_count)
    features_count+=1
    fd=x[:,Rnk[j:]]
    pp=0
    lpa=np.zeros((0,2))
    fold=1
    for train, test in kfolds.split(fd, y):
        print('Processing fold #',fold)
        fold+=1
        train_t_x = fd[train]
        train_t_y = y[train]
        test_t_x=fd[test]
        test_t_y=y[test]
        model = RandomForestClassifier(n_estimators=100) #The number of classes in the dataset
        model.n_outputs_=10
        model.fit(train_t_x,train_t_y)
        st = time.time()
        predictions = model.predict(test_t_x)
        labelsAndPredictions = np.array([test_t_y,predictions])
        lpa=np.vstack((lpa,labelsAndPredictions.reshape(-1,2)))
        et+=time.time()-st
        num=0
        si = labelsAndPredictions.shape[1]
        label_t = labelsAndPredictions[0]
        label_p = labelsAndPredictions[1]
        for i in range(si):
            if np.array_equal(label_t[i],label_p[i]):
                num += 1
        acc = num / float(si)
        print(acc)
        pp=pp+acc
    pp=pp/kfolds.n_splits
    np.savetxt('F%d_RF_B.csv'%j,lpa,delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

(257673, 196)
Processing feature # 1
Processing fold # 1
0.7663335597166974
Processing fold # 2
0.771786164742408
Processing fold # 3
0.769108372950422
Processing fold # 4
0.7705786471067645
Processing fold # 5
0.7692009159001824
Processing feature # 2
Processing fold # 1
0.769671097312506
Processing fold # 2
0.7676336470359949
Processing fold # 3
0.7659842825264384
Processing fold # 4
0.7704816237823573
Processing fold # 5
0.7680172313424147
Processing feature # 3
Processing fold # 1
0.7652275152808771
Processing fold # 2
0.7711846318036286
Processing fold # 3
0.76699330552052
Processing fold # 4
0.7658827182054566
Processing fold # 5
0.7686187759537393
Processing feature # 4
Processing fold # 1
0.7709711846318036
Processing fold # 2
0.769108372950422
Processing fold # 3
0.7640244494033182
Processing fold # 4
0.7705204331121202
Processing fold # 5
0.7657468855512866
Processing feature # 5
Processing fold # 1
0.7684292228582517
Processing fold # 2
0.7662559425633065
Processing fold # 3

KeyboardInterrupt: 