Federated Learning using PyTorch and PySyft with Trusted FedAvg on DNS traffic datasets.

## Add libraries, define FL clients

In [None]:
# Install the pinned PySyft release this notebook is written against.
# NOTE(review): the sy.TorchHook / sy.VirtualWorker API used below presumably only exists in the
# 0.2.x line - confirm before bumping the version.
!pip install syft==0.2.9

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as sched
from torch.nn import BCELoss
import torch.utils.data as tud
import pandas as pd
from numpy.linalg import norm
from statistics import median
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
import syft as sy

# Hook PyTorch to PySyft, i.e. extend torch with the extra functionality needed for
# Federated Learning and other private-AI tools. The resulting `hook` is handed to every
# VirtualWorker created in this notebook.
hook = sy.TorchHook(torch)

In [None]:
# create the simulated FL clients; the order matters, because later cells
# address the misbehaving worker by position (clients[2])
clients = [
    sy.VirtualWorker(hook, id=worker_id)
    for worker_id in ("bob", "alice", "untrustful")
]

## Load and preprocess data

In [None]:
# load datasets (file name, number of rows to read)
# unbalanced training data is not a problem as long as both classes are well represented;
# the test set, however, should preferably stay relatively balanced
df1, df2, df3, df4, df5, df6, df7, dfg, dff = (
    pd.read_csv(csv_path, nrows=row_limit)
    for csv_path, row_limit in (
        ("booter1.csv", 160710),
        ("booter2.csv", 115340),
        ("booter3.csv", 130200),
        ("booter4.csv", 195070),
        ("booter5.csv", 30150),
        ("booter6.csv", 53050),
        ("booter7.csv", 90730),
        ("wideg.csv", 247220),
        ("widef.csv", 278370),
    )
)

In [None]:
# merge all captures into one frame, cast to float32 (pytorch requires float type values)
# and shuffle the rows so that the positional train/test split mixes all sources
concatenated = shuffle(
    pd.concat(
        [df1, df2, df3, df4, df5, df6, df7, dfg, dff],
        ignore_index=True,
    ).astype('float32')
)

In [None]:
# peek at the first rows of the shuffled frame
concatenated.head()

Unnamed: 0,ip.len,udp.length,dns.flags.authoritative,dns.flags.recdesired,dns.flags.recavail,dns.count.answers,dns.count.add_rr,dns.qry.name,dns.qry.type,target
535543,1500.0,4086.0,0.0,1.0,1.0,252.0,1.0,70575136.0,0.0,1.0
161225,1054.0,1034.0,0.0,1.0,1.0,14.0,23.0,61960000.0,1.0,1.0
459694,1500.0,4103.0,0.0,1.0,1.0,252.0,1.0,70575136.0,0.0,1.0
348837,1500.0,1742.0,0.0,1.0,1.0,17.0,23.0,61960000.0,1.0,1.0
1135511,136.0,116.0,1.0,0.0,0.0,1.0,1.0,55004640.0,0.0,0.0


In [None]:
# positional 70% / 30% split of the rows into train and test parts
split_at = len(concatenated) * 7 // 10
trainset = concatenated.iloc[:split_at, :]
testset = concatenated.iloc[split_at:, :]

In [None]:
# we need to normalize data: the first 9 columns are the features
train_features = trainset.iloc[:, :9]
scaler = StandardScaler()
scaler.fit(train_features)
train_scaled = scaler.transform(train_features)
# the test set is scaled with the statistics learned from the train set,
# otherwise we violate the rule "never use test set for training"
test_scaled = scaler.transform(testset.iloc[:, :9])

In [None]:
# transform to tensors: features come from the scaled arrays ...
features_train = torch.tensor(train_scaled)
features_test = torch.tensor(test_scaled)

# ... and labels from the 'target' column of the corresponding split
target_train = torch.tensor(trainset.loc[:, 'target'].to_numpy())
target_test = torch.tensor(testset.loc[:, 'target'].to_numpy())

In [None]:
# final (features, target) datasets for training and testing with pytorch
train_dataset, test_dataset = (
    tud.TensorDataset(features, targets)
    for features, targets in (
        (features_train, target_train),
        (features_test, target_test),
    )
)

## Define training parameters and model, send data to clients

In [None]:
# training hyper-parameters and runtime settings
args = dict(
    use_cuda=True,
    batch_size=128,
    test_batch_size=1000,
    lr=0.1,
    log_interval=200,
    epochs=10,
)

# use the GPU only when it is both requested and actually available
use_cuda = torch.cuda.is_available() if args['use_cuda'] else args['use_cuda']
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [None]:
# create a simple feedforward network
# n features as input, 2*n hidden layer neurons, 1 output for binary classification
class Net(nn.Module):
    """Single-hidden-layer feedforward network for binary classification.

    The network is ``Linear -> ReLU -> Linear -> Sigmoid`` and produces one
    value in [0, 1] per input row.

    Args:
        n_features: number of input features (default 9, the value used so far).
        n_hidden: number of hidden neurons; when ``None`` it defaults to
            ``2 * n_features``.
    """

    def __init__(self, n_features=9, n_hidden=None):
        super().__init__()

        if n_hidden is None:
            n_hidden = 2 * n_features

        # NOTE: keep the positions of the two Linear layers stable - the aggregation and
        # trust code in this notebook addresses them as layers[0] and layers[2]
        self.layers = nn.Sequential(
            nn.Linear(in_features=n_features, out_features=n_hidden),
            nn.ReLU(),
            nn.Linear(in_features=n_hidden, out_features=1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        return self.layers(x)

In [None]:
# prepare and distribute the data across workers
# normally there is no need to distribute data, since it is already at the clients
# this is more of a simulation of federated learning

# NOTE: the federate function of PySyft exhausts RAM if we use most of our datasets;
# the string literal below keeps a memory-friendlier alternative (it is not executed)
federated_dataset = train_dataset.federate(tuple(clients))

"""
# below is a memory efficient implementation based on the source code of PySyft's federate function
datasets = [sy.BaseDataset(torch.tensor([]).send(c), torch.tensor([]).send(c)) for c in clients]
data_loader = tud.DataLoader(train_dataset, batch_size=1024)
for dataset_idx, (datas, targetas) in enumerate(data_loader):
    worker = clients[dataset_idx % len(clients)]
    datas = datas.send(worker)
    targetas = targetas.send(worker)
    datasets[dataset_idx % len(clients)].data = torch.cat((datasets[dataset_idx % len(clients)].data, datas))
    datasets[dataset_idx % len(clients)].targets = torch.cat((datasets[dataset_idx % len(clients)].targets, targetas))

federated_dataset = sy.FederatedDataset(datasets)
"""

# loader over the federated (client-side) training data; batch size comes from args, shuffling is on
federated_train_loader = sy.FederatedDataLoader(federated_dataset, batch_size=args['batch_size'], shuffle=True)

# test data remains at the central entity, so a regular torch DataLoader is used for it
test_loader = tud.DataLoader(test_dataset, batch_size=args['test_batch_size'], shuffle=True)

## Train, test, aggregation, trust computation functions

In [None]:
# classic torch code for training except for the federated part
def train_locally(args, models, device, train_loader, optimizers, epoch):
    """Run one pass over the federated loader, training each client's own model.

    Args:
        args: settings dict; 'log_interval' and 'batch_size' are read here.
        models: dict mapping a worker to its model; every model is sent to its
            worker before training and fetched back at the end.
        device: torch device the batches are moved to.
        train_loader: federated loader yielding (data, target) batches whose
            ``data.location`` is the worker owning the batch.
        optimizers: dict mapping a worker to the optimizer of its model.
        epoch: epoch number, only used in the progress output.
    """
    # put every model in training mode and ship it to its worker
    for worker, model in models.items():
        model.train()
        model.send(worker)

    criterion = BCELoss()

    # iterate over federated data
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        worker = data.location
        optimizer = optimizers[worker]

        optimizer.zero_grad()
        output = models[worker](data)
        # we intentionally flip the output (1 - output) of one client's model
        # to make that client behave as untrustful
        if worker.id == 'untrustful':
            ones = torch.ones([len(output), 1], dtype=torch.float32, device=device)
            output = ones.send(worker) - output
        # loss is a ptr to the tensor loss at the remote location
        loss = criterion(output, torch.reshape(target, [len(target), 1]))
        # backward() on the loss ptr sends the command to call backward
        # on the actual loss tensor present on the remote machine
        loss.backward()
        optimizer.step()

        if batch_idx % args['log_interval'] != 0:
            continue

        # get back loss, that was created at remote worker
        loss = loss.get()

        done = batch_idx * args['batch_size']  # number of packets done
        total = len(train_loader) * args['batch_size']  # total number of packets
        percent = 100. * batch_idx / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tWorker: {}'.format(
            epoch, done, total, percent, loss, worker.id))

    # get back models for aggregation
    for model in models.values():
        model.get()

In [None]:
# classic torch code for testing
def test(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader``, print and return the metrics.

    Args:
        model: callable torch module producing one probability per sample.
        device: torch device the batches are moved to.
        test_loader: iterable of (data, target) batches exposing ``.dataset``.

    Returns:
        ``(average_loss, accuracy)`` where ``average_loss`` is the summed binary
        cross-entropy divided by the number of samples and ``accuracy`` is the
        fraction of correctly classified samples. Both are 0.0 for an empty
        dataset.
    """
    model.eval()
    criterion = BCELoss(reduction='sum')
    total_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)

            # add losses together
            total_loss += criterion(output, torch.reshape(target, [len(target), 1])).item()

            # round the predicted probability to the nearest class label (0 or 1)
            pred = torch.round(output)
            correct += pred.eq(target.view_as(pred)).sum().item()

    n_samples = len(test_loader.dataset)
    # guard against an empty dataset instead of dividing by zero
    denominator = max(n_samples, 1)
    test_loss = total_loss / denominator
    accuracy = correct / denominator

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, n_samples,
        100. * accuracy))

    return test_loss, accuracy


# this is an evaluation helper, not a test case: keep pytest-style collectors,
# which pick up callables by the "test" name prefix, from collecting it
test.__test__ = False

In [None]:
def aggregate(central_model, models, weights, trust, sample_counts=None):
    """Average the client models, weighting each by data size times trust.

    Every client contributes ``sample_count * trust`` to the weighted mean of
    the parameters of ``layers[0]`` and ``layers[2]``. The mean is written to
    all client models, to ``central_model`` and to the ``weights`` dict.

    The running sums are started from zero on every call, so a ``weights``
    dict that is reused across rounds no longer leaks the previous round's
    mean into the new average.

    Args:
        central_model: model that receives the aggregated parameters.
        models: dict mapping a client to its locally trained model.
        weights: dict that receives the aggregated tensors under the keys
            'hidden_mean_weight', 'hidden_mean_bias', 'output_mean_weight'
            and 'output_mean_bias'; its previous content is ignored.
        trust: dict mapping a client to its trust value.
        sample_counts: optional dict mapping a client to its number of
            training samples. When omitted, the sizes are taken from the
            module-level ``federated_dataset`` via ``client.id``.

    Raises:
        ValueError: if the total ``sample_count * trust`` is not positive,
            i.e. there is nothing to average.
    """
    if sample_counts is None:
        # look every shard size up once instead of once per parameter tensor
        # NOTE(review): federated_dataset[c.id] presumably yields that worker's shard - confirm
        sample_counts = {c: len(federated_dataset[c.id]) for c in models}

    # (key in `weights`, index of the layer in model.layers, parameter name)
    slots = (
        ('hidden_mean_weight', 0, 'weight'),
        ('hidden_mean_bias', 0, 'bias'),
        ('output_mean_weight', 2, 'weight'),
        ('output_mean_bias', 2, 'bias'),
    )

    with torch.no_grad():
        # firstly compute new weight values, starting from fresh zero accumulators
        sums = {
            key: torch.zeros_like(getattr(central_model.layers[idx], attr).data)
            for key, idx, attr in slots
        }
        dataXtrust = 0
        for c, model in models.items():
            coefficient = sample_counts[c] * trust[c]
            for key, idx, attr in slots:
                sums[key] += getattr(model.layers[idx], attr).data.clone() * coefficient
            dataXtrust += coefficient

        if dataXtrust <= 0:
            raise ValueError(
                "cannot aggregate: total sample_count * trust must be positive, got {}".format(dataXtrust))

        for key in sums:
            weights[key] = sums[key] / dataXtrust

        # then copy them to the local models and to the central model for the test set
        for target_model in list(models.values()) + [central_model]:
            for key, idx, attr in slots:
                getattr(target_model.layers[idx], attr).data = weights[key].data.clone()

In [None]:
def computeTrust(models, trust, r, s):
    """Update the trust value of every client from how far its model deviates.

    For each client (module-level ``clients`` list) the mean squared distance
    between its layers[0]/layers[2] weights and those of all clients is
    computed. Clients whose deviation is at most 1.3 times the median count
    as behaving normally. ``r`` and ``s`` (one entry per client, same order as
    ``clients``) and the ``trust`` dict are updated in place, with
    ``trust = (r + 1) / (r + s + 2)``.
    """
    n_clients = len(clients)

    # deviation[k] shows how the weights of client k differ from the models of all clients
    deviation = []
    for own in clients:
        total = 0
        for other in clients:
            total += norm(models[other].layers[0].weight.data.cpu() - models[own].layers[0].weight.data.cpu()) ** 2
            total += norm(models[other].layers[2].weight.data.cpu() - models[own].layers[2].weight.data.cpu()) ** 2
        deviation.append(total / n_clients)

    med = median(sorted(deviation))

    # behaves[k] = 1 if client k acts normally and 0 if it is malicious or malfunctions
    behaves = [1 if d <= 1.3 * med else 0 for d in deviation]

    for k, client in enumerate(clients):
        x = deviation[k]
        ratio = x / med

        # factor applied to the previous s value, chosen from the size of the deviation
        if ratio > 3 and x > 30:
            s_factor = ratio
        elif x > 1000:
            s_factor = x / 1000
        elif behaves[k] == 1 and s[k] > 10:
            s_factor = 0.01
        else:
            s_factor = 0.7

        r[k] = 0.5 * r[k] + behaves[k]
        s[k] = s_factor * s[k] + 1 - behaves[k]

        trust[client] = (r[k] + 1) / (r[k] + s[k] + 2)

## FL training and results

In [None]:
# central model; it only receives aggregated weights and is evaluated on the test set
central_model = Net().to(device)
# optimizer for central model not needed if model is not trained
#optimizer = optim.SGD(central_model.parameters(), lr=args['lr'])

# one model and one SGD optimizer per client
models = {worker: Net().to(device) for worker in clients}
optimizers = {
    worker: optim.SGD(model.parameters(), lr=args['lr'])
    for worker, model in models.items()
}
# optional learning-rate schedulers (disabled)
#lamda = lambda epoch: 10 if epoch < 2 else (1 if epoch < 6 else 0.1)
#schedulers = {i:sched.LambdaLR(optimizers[i], lr_lambda=lamda) for i in clients}

# zero-initialised tensors for models aggregation, shaped like the central model's parameters
weights = {
    key: torch.zeros(size=parameter.shape).to(device)
    for key, parameter in (
        ('hidden_mean_weight', central_model.layers[0].weight),
        ('hidden_mean_bias', central_model.layers[0].bias),
        ('output_mean_weight', central_model.layers[2].weight),
        ('output_mean_bias', central_model.layers[2].bias),
    )
}

# trust values and the per-client r / s counters they are derived from
trust = dict.fromkeys(clients, 0)
r = [0] * len(clients)
s = [0] * len(clients)

for epoch in range(1, args['epochs'] + 1):
    # below function is modified to simulate untrustful behavior of a client
    train_locally(args, models, device, federated_train_loader, optimizers, epoch)
    # we also scale the weights of the untrustful client (clients[2]) by 1.5
    # so that it has more influence on the aggregated model
    for layer_idx in (0, 2):
        models[clients[2]].layers[layer_idx].weight.data *= 1.5
    #for scheduler in schedulers.values():
        #scheduler.step()
    computeTrust(models, trust, r, s)
    aggregate(central_model, models, weights, trust)
    test(central_model, device, test_loader)


Test set: Average loss: 1.2456, Accuracy: 54180/390252 (14%)


Test set: Average loss: 0.0875, Accuracy: 379981/390252 (97%)


Test set: Average loss: 0.0602, Accuracy: 380418/390252 (97%)


Test set: Average loss: 0.0429, Accuracy: 382044/390252 (98%)


Test set: Average loss: 0.0368, Accuracy: 384119/390252 (98%)


Test set: Average loss: 0.0315, Accuracy: 386195/390252 (99%)


Test set: Average loss: 0.0257, Accuracy: 387604/390252 (99%)


Test set: Average loss: 0.0232, Accuracy: 387564/390252 (99%)


Test set: Average loss: 1.0139, Accuracy: 315665/390252 (81%)


Test set: Average loss: 0.0165, Accuracy: 388208/390252 (99%)



## Training without trust

In [None]:
# same experiment with the untrustful client as above, but with simple FedAvg:
# every client keeps a constant trust of 1 and no trust computation takes place

# central model
central_model = Net().to(device)

# one model and one SGD optimizer per client
models = {worker: Net().to(device) for worker in clients}
optimizers = {
    worker: optim.SGD(model.parameters(), lr=args['lr'])
    for worker, model in models.items()
}

# zero-initialised tensors for models aggregation, shaped like the central model's parameters
weights = {
    key: torch.zeros(size=parameter.shape).to(device)
    for key, parameter in (
        ('hidden_mean_weight', central_model.layers[0].weight),
        ('hidden_mean_bias', central_model.layers[0].bias),
        ('output_mean_weight', central_model.layers[2].weight),
        ('output_mean_bias', central_model.layers[2].bias),
    )
}

# trust values: all clients are weighted equally
trust = dict.fromkeys(clients, 1)

for epoch in range(1, args['epochs'] + 1):
    # below function is modified to simulate untrustful behavior of a client
    train_locally(args, models, device, federated_train_loader, optimizers, epoch)
    # we also scale the weights of the untrustful client (clients[2]) by 1.5
    # so that it has more influence on the aggregated model
    for layer_idx in (0, 2):
        models[clients[2]].layers[layer_idx].weight.data *= 1.5
    aggregate(central_model, models, weights, trust)
    test(central_model, device, test_loader)


Test set: Average loss: 0.7162, Accuracy: 191070/390252 (49%)


Test set: Average loss: 0.2269, Accuracy: 375419/390252 (96%)


Test set: Average loss: 0.2229, Accuracy: 380003/390252 (97%)


Test set: Average loss: 0.2611, Accuracy: 377319/390252 (97%)


Test set: Average loss: 0.3599, Accuracy: 308265/390252 (79%)


Test set: Average loss: 0.4762, Accuracy: 294724/390252 (76%)


Test set: Average loss: 0.4342, Accuracy: 305649/390252 (78%)


Test set: Average loss: 0.7058, Accuracy: 291269/390252 (75%)


Test set: Average loss: 0.9808, Accuracy: 279224/390252 (72%)


Test set: Average loss: 0.9856, Accuracy: 288053/390252 (74%)

