In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing as scale
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn import mixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [20]:
input_file = [ "biflow_sparta.csv", "biflow_scan_sU.csv", 'biflow_scan_A.csv', "biflow_mqtt_bruteforce.csv"]
columns_to_drop_bi = ['proto', 'ip_src', 'ip_dst']

The Federated Learner:

Parameters:

x: The data to be decided (array)

clfs:  The binary classifiers;(List of Classifiers)

p_nodes: The proportion of the nodes;(List)

gms: Gassian Mixture Models (List of GMMs)



In [21]:
def pred(x, clfs, p_nodes, gms):
    p_x_node = np.zeros((len(x), len(clfs))) 
    p_y_given_node = np.zeros((len(x), len(clfs))) 
    for i in range(len(clfs)):
        p_x_node[:, i] = p_nodes[i] * np.exp(gms[i].score_samples(x))
        p_y_given_node[:, i] = clfs[i].predict_proba(x)[:, 1]
    
    p_y = p_y_given_node * p_x_node / (np.sum(p_x_node, axis=-1, keepdims=True) + 1e-10)

    p_y = np.hstack([1-np.sum(p_y, axis=-1, keepdims=True), p_y])

    return np.argmax(p_y, axis=-1)

Binary=SVM

In [16]:
def Fed_SVC_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data.drop(columns='is_attack')
        data_list.append(data)

    rs = random_state




    from sklearn.svm import SVC
    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = SVC(kernel = 'rbf', random_state = 41, gamma='scale',max_iter=-1, probability=True)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))



    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


The process for SVC:

In [17]:
def test(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_SVC_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))
   


In [18]:
test(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

Binary = LR 

In [90]:
def Fed_LR_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = LogisticRegression(random_state = random_state+1)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [91]:
def test_LR(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_LR_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [92]:
test_LR(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))



Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.99900724667685
Precision std= 0.0005441799033404899
Precision mean= 0.9999553951908009
Precision std= 9.585381378880726e-05
Precision mean= 0.9998695627050864
Precision std= 0.0001309211282717683
Precision mean= 0.9996700310410496
Precision std= 0.00027019716063137383
Recall mean of  0  = 0.9997812996550225
Recall std of  0  = 8.335136205153832e-05
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


binary=kNN

In [93]:
from sklearn.neighbors import KNeighborsClassifier
def Fed_knn_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = KNeighborsClassifier()
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [94]:
def test_knn(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_knn_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [95]:
test_knn(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))



Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9990214606517028
Precision std= 0.0006641493547936209
Precision mean= 0.9999642261696978
Precision std= 9.131589347814903e-05
Precision mean= 0.9998695627050864
Precision std= 0.0001309211282717683
Precision mean= 0.9996836912400646
Precision std= 0.00027946922233560047
Recall mean of  0  = 0.9997871108252431
Recall std of  0  = 9.79181038858179e-05
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


Binary=RF

In [96]:
from sklearn.ensemble import RandomForestClassifier
def Fed_RF_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = random_state)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [98]:
def test_RF(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_RF_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [99]:
test_RF(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))



Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9989502127768637
Precision std= 0.000508913366553724
Precision mean= 0.9999644002398025
Precision std= 9.104124232673337e-05
Precision mean= 0.9998695627050864
Precision std= 0.0001309211282717683
Precision mean= 0.999518946096234
Precision std= 0.000387281299552906
Recall mean of  0  = 0.9997538965969325
Recall std of  0  = 9.451342906151792e-05
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


In [22]:
from sklearn.ensemble import RandomForestClassifier
def Centralized_RF_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = RandomForestClassifier(random_state = random_state)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [23]:
def test_RF_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_RF_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [24]:
test_RF_centralized(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))


Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9999820595622534
Precision std= 7.820055514067361e-05
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9999036448166143
Precision std= 0.00020002555094997482
Recall mean of  0  = 1.0
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 0.9999372614305095
Recall std of  2  = 0.0001300557409563111
Recall mean of  3  = 0.9999798630688683
Recall std of  3  = 8.777484783610927e-05
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


In [25]:
from sklearn.ensemble import RandomForestClassifier
def Centralized_knn_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = KNeighborsClassifier()
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_knn_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_knn_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [28]:
test_knn_centralized(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))


Precision mean= 0.999955120588093
Precision std= 4.320130644528789e-05
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9999176920181038
Precision std= 0.0001528938704623798
Recall mean of  0  = 0.9999902281361015
Recall std of  0  = 2.0957411812910564e-05
Recall mean of  1  = 0.9999716312056737
Recall std of  1  = 0.00012365670761816562
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 0.9997789836381866
Recall std of  3  = 0.00019974698760390254
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


In [29]:

def Centralized_SVC_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = SVC(kernel = 'rbf', random_state = random_state + 1, gamma='scale',max_iter=-1, probability=True)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_SVC_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_SVC_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [30]:
test_SVC_centralized(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))


Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9999799719607451
Precision std= 8.730019914961257e-05
Precision mean= 0.9941722875284078
Precision std= 0.0012816573848171001
Recall mean of  0  = 0.9999765668381739
Recall std of  0  = 2.5909502719107945e-05
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 0.9963149562509972
Recall std of  2  = 0.0008132200504967324
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


In [31]:

def Centralized_LR_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = LogisticRegression(max_iter=100)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_LR_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_LR_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [32]:
test_LR_centralized(epoch=20, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


Precision mean= 0.9999785533042159
Precision std= 2.8827375399858643e-05
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9992935519882588
Precision std= 0.0003277069990335185
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9953818282393699
Precision std= 0.0013086199317330425
Recall mean of  0  = 1.0
Recall std of  0  = 0.0
Recall mean of  1  = 0.9999571174175751
Recall std of  1  = 0.00013590384414920804
Recall mean of  2  = 0.9972451627994587
Recall std of  2  = 0.0008250962249233801
Recall mean of  3  = 0.9997290427872668
Recall std of  3  = 0.000271852892391403
Recall mean of  4  = 0.9988006005608827
Recall std of  4  = 0.0005660146490656415


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
