In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing as scale
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn import mixture
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [23]:
input_file = [ "biflow_sparta.csv", "biflow_scan_sU.csv", 'biflow_scan_A.csv', "biflow_mqtt_bruteforce.csv"]
columns_to_drop_bi = ['proto', 'ip_src', 'ip_dst']

The Federated Learner:

Parameters:

x: The data to be decided (array)

clfs:  The binary classifiers;(List of Classifiers)

p_nodes: The proportion of the nodes;(List)

gms: Gassian Mixture Models (List of GMMs)



In [24]:
def pred(x, clfs, p_nodes, gms):
    p_x_node = np.zeros((len(x), len(clfs))) 
    p_y_given_node = np.zeros((len(x), len(clfs))) 
    for i in range(len(clfs)):
        p_x_node[:, i] = p_nodes[i] * np.exp(gms[i].score_samples(x))
        p_y_given_node[:, i] = clfs[i].predict_proba(x)[:, 1]
    
    p_y = p_y_given_node * p_x_node / (np.sum(p_x_node, axis=-1, keepdims=True) + 1e-10)

    p_y = np.hstack([1-np.sum(p_y, axis=-1, keepdims=True), p_y])

    return np.argmax(p_y, axis=-1)

Binary=SVM

In [25]:
def Fed_SVC_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data = data.drop(columns='is_attack')
        data_list.append(data)

    rs = random_state

    from sklearn.svm import SVC
    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = SVC(kernel = 'rbf', random_state = 41, gamma='scale',max_iter=-1, probability=True)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))



    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


The process for SVC:

In [26]:
def test(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_SVC_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))
   


In [27]:
test(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.9999609801779303
Precision std= 0.0
Precision mean= 0.999712395743457
Precision std= 0.0
Precision mean= 0.9991109530583214
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9721456150027579
Precision std= 0.0
Recall mean of  0  = 0.9958420766301391
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 0.9998220640569395
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


Binary = LR 

In [7]:
def Fed_LR_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = LogisticRegression(random_state = random_state+1)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [8]:
def test_LR(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_LR_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [9]:
test_LR(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Precision mean= 0.9980099890744498
Precision std= 0.0
Precision mean= 0.999712395743457
Precision std= 0.0
Precision mean= 0.9987553342816501
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9721456150027579
Precision std= 0.0
Recall mean of  0  = 0.9957564431986295
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 0.9910021171489062
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


binary=kNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier
def Fed_knn_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = KNeighborsClassifier()
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [11]:
def test_knn(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_knn_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [12]:
test_knn(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.9936397690026534
Precision std= 0.0
Precision mean= 0.999424791486914
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9859349145063431
Precision std= 0.0
Recall mean of  0  = 0.9979230347205894
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 0.9563937934724451
Recall std of  4  = 0.0


Binary=RF

In [13]:
from sklearn.ensemble import RandomForestClassifier
def Fed_RF_learn(input_file, columns_to_drop_bi, random_state,  K, test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state


    clfs = [] # This is going to contain 14 different classifiers
    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    for i in range(len(input_file)):
        classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = random_state)
        classifier.fit(x_train_list[i], y_train_list[i])
        clfs.append(classifier)

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    from sklearn.mixture import GaussianMixture
    gms = []
    for i in range(len(input_file)):
        x = x_train_list[i]
        gm = GaussianMixture(n_components = K).fit(x)  
        gms.append(gm)

    prediction = pred(x_test, clfs, p_nodes, gms)

    correct = prediction == y_test
    accuracy = np.mean(correct)


    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [14]:
def test_RF(epoch, input_file, columns_to_drop_bi, K):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Fed_RF_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs, K=K)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [15]:
test_RF(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi, K=15)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.999492742313095
Precision std= 0.0
Precision mean= 0.999424791486914
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9834528405956977
Precision std= 0.0
Recall mean of  0  = 0.9975853877010554
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 0.996367700474993
Recall std of  4  = 0.0


In [31]:
from sklearn.ensemble import RandomForestClassifier
def Centralized_RF_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = RandomForestClassifier(random_state = random_state)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 


In [32]:
def test_RF_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_RF_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))

In [33]:
test_RF_centralized(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.9996878414234431
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9966905681191396
Precision std= 0.0
Recall mean of  0  = 0.9995318352059925
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 0.9977912755383765
Recall std of  4  = 0.0


In [34]:
from sklearn.ensemble import RandomForestClassifier
def Centralized_knn_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = KNeighborsClassifier()
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_knn_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_knn_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [35]:
test_knn_centralized(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.9986343062275636
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9801434087148373
Precision std= 0.0
Recall mean of  0  = 0.9971946230274693
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 0.999599278701663
Recall std of  3  = 0.0
Recall mean of  4  = 0.9908001115137999
Recall std of  4  = 0.0


In [36]:

def Centralized_SVC_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = SVC(kernel = 'rbf', random_state = random_state + 1, gamma='scale',max_iter=-1, probability=True)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_SVC_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_SVC_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [37]:
test_SVC_centralized(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 1.0
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9907539118065434
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9721456150027579
Precision std= 0.0
Recall mean of  0  = 0.9940653969977891
Recall std of  0  = 0.0
Recall mean of  1  = 1.0
Recall std of  1  = 0.0
Recall mean of  2  = 1.0
Recall std of  2  = 0.0
Recall mean of  3  = 1.0
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


In [38]:

def Centralized_LR_learn(input_file, columns_to_drop_bi, random_state,  test_size = 0.25):
    
    data_list = []
    for i in range(len(input_file)):
        data = pd.read_csv(input_file[i])
        data.drop(columns = columns_to_drop_bi, inplace = True)
        data_list.append(data)

    rs = random_state

    n_samples = []
    x_train_list = []
    y_train_list = []
    x_test_list = []
    y_test_list = []
    y = [np.ones(len(data_list[i])) * (i + 1) for i in range(len(data_list))]
    for i in range(len(input_file)): # reading the data
        data0 = pd.read_csv(input_file[i])
        y[i][data0.is_attack == 0] = 0
        n_samples.append(len(y[i])) # the number of this node
        x = data_list[i]
        x = x.drop(columns='is_attack')
        x_train, x_test, y_train, y_test = train_test_split(x, y[i], test_size = test_size, random_state = rs)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)

    x_train_full = pd.concat(x_train_list, ignore_index=True)
    scaler = scale.StandardScaler().fit(x_train_full)

    for i in range(len(input_file)):
        x_train_list[i] = scaler.transform(x_train_list[i])

    total_n_samples = np.sum(n_samples)

    p_nodes =  np.array(n_samples) / total_n_samples


    for i in range(len(input_file)):
        x_test_list[i] = scaler.transform(x_test_list[i])

    x_test = np.vstack(x_test_list)
    y_test = np.hstack(y_test_list)
    x_train = np.vstack(x_train_list)
    y_train = np.hstack(y_train_list)

    classifier = LogisticRegression(max_iter=100)
    classifier.fit(x_train, y_train)

    prediction = classifier.predict(x_test)

    correct = prediction == y_test
    accuracy = np.mean(correct)

    accs = []
    for i in range(len(input_file)+1):
        if len(correct[y_test==i]) == 0:
            accs.append(0)
        else:
            accs.append(np.mean(correct[y_test==i]))

    recalls = []
    for i in range(len(input_file)+1):
        if len(correct[prediction==i]) == 0:
            recalls.append(0)
        else:
            recalls.append(np.mean(correct[prediction==i]))

    return recalls, accs 



def test_LR_centralized(epoch, input_file, columns_to_drop_bi):

   recall , precision = np.ones((epoch,len(input_file)+1))*0 , np.ones((epoch,len(input_file)+1))*0

   print('main process:')

   for rs in tqdm(range(epoch)):
      recalls, accs = Centralized_LR_learn(input_file = input_file, columns_to_drop_bi=columns_to_drop_bi, random_state=rs)
      recall[rs,:] = recalls
      precision[rs,:] = accs

   for i in range(len(input_file)+1):
      print('Precision mean=',np.mean(precision[:,i]))
      print('Precision std=',np.std(precision[:,i]))

   for i in range(len(input_file)+1):
      print('Recall mean of ',i,' =',np.mean(recall[:,i]))
      print('Recall std of ',i,' =',np.std(recall[:,i]))


In [39]:
test_LR_centralized(epoch=1, input_file=input_file, columns_to_drop_bi=columns_to_drop_bi)

main process:


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


Precision mean= 0.9985172467613548
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9932432432432432
Precision std= 0.0
Precision mean= 1.0
Precision std= 0.0
Precision mean= 0.9721456150027579
Precision std= 0.0
Recall mean of  0  = 0.9946361940298507
Recall std of  0  = 0.0
Recall mean of  1  = 0.9997124784358827
Recall std of  1  = 0.0
Recall mean of  2  = 0.9937733499377335
Recall std of  2  = 0.0
Recall mean of  3  = 0.9993990384615384
Recall std of  3  = 0.0
Recall mean of  4  = 1.0
Recall std of  4  = 0.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
