In [1]:
from google.colab import drive
drive.mount(('/content/drive'))

Mounted at /content/drive


In [26]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import glob
import copy
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [27]:
# Two-hidden-layer MLP model
class MLP(nn.Module):
    def __init__(self, dim_in, dim_hidden1, dim_hidden2, dim_out):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(dim_in, dim_hidden1)
        self.bn1 = nn.BatchNorm1d(dim_hidden1)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(dim_hidden1, dim_hidden2)
        self.bn2 = nn.BatchNorm1d(dim_hidden2)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(dim_hidden2, dim_out)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = nn.ReLU()(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = nn.ReLU()(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return torch.log_softmax(x, dim=1)

In [28]:
# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.LongTensor(labels)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [29]:
# Data quality check and preparation function
def prepare_data(file_path):
    df = pd.read_csv(file_path)
    df = df.replace([np.inf, -np.inf], np.nan).dropna() # Handling Inf and -Inf
    df['marker_encoded'] = df['marker'].map({'Attack': 1, 'Natural': 0})
    df = df.drop(columns=['marker']).dropna()
    X = df.drop(columns=['marker_encoded']).values
    y = df['marker_encoded'].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, y

In [30]:
# Function to average the weights of models
def average_weights(model_weights):
    global_weights = copy.deepcopy(model_weights[0])
    for key in global_weights.keys():
        for i in range(1, len(model_weights)):
            global_weights[key] += model_weights[i][key]
        global_weights[key] = torch.div(global_weights[key], len(model_weights))
    return global_weights

In [31]:
def federated_training(models, datasets, epochs, l1_lambda=0.001, l2_lambda=0.001):
    global_model = models[0]
    criterion = nn.NLLLoss()


    for idx, (model, dataset) in enumerate(zip(models, datasets)):
        best_loss = np.inf
        no_improve_epoch = 0
        early_stopping_thresh = 10  # Early stopping threshold

        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=l2_lambda)  # L2 regularization
        scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5, verbose=True)

        print(f"Training model for Dataset {idx + 1}/{len(datasets)}")

        for epoch in range(epochs):
            model.train()
            local_loss = 0

            loader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)
            for data, target in loader:
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                l1_regularization = sum(param.abs().sum() for param in model.parameters())  # L1 regularization
                total_loss = loss + l1_lambda * l1_regularization
                total_loss.backward()
                optimizer.step()
                local_loss += total_loss.item()

            # Early stopping
            scheduler.step(local_loss)
            if local_loss < best_loss:
                best_loss = local_loss
                no_improve_epoch = 0
            else:
                no_improve_epoch += 1
                if no_improve_epoch >= early_stopping_thresh:
                    print(f"Early stopping triggered for Dataset {idx + 1} model after epoch {epoch+1}")
                    break

            print(f'Dataset {idx + 1} model training, Epoch {epoch+1}, Loss: {local_loss / len(loader)}')

        print(50*"-")

        # Updating the global model after each model is trained for 'epochs' or early stopped
        global_model.load_state_dict(model.state_dict(), strict=False)

    local_weights = [model.state_dict() for model in models]
    global_weights = average_weights(local_weights)
    global_model.load_state_dict(global_weights)

    return global_model

In [32]:
def evaluate(model, test_dataset):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        loader = DataLoader(test_dataset, batch_size=32)
        for data, target in loader:
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)
    accuracy = 100. * correct / total
    print(f'Test set: Accuracy: {accuracy}%')

In [33]:
# Loading datasets
folder_path = '/content/drive/MyDrive/GRA/Project/dataset'
file_paths = glob.glob(os.path.join(folder_path, '*.csv'))
datasets = [CustomDataset(*prepare_data(file)) for file in file_paths]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['marker_encoded'] = df['marker'].map({'Attack': 1, 'Natural': 0})


In [47]:
# Preparing test dataset

# X_train, X_test, y_train, y_test = train_test_split(*prepare_data(file_paths[14]), test_size=0.2)
# test_dataset = CustomDataset(X_test, y_test)


all_features = []
all_labels = []

for dataset in datasets:
    all_features.extend(dataset.features.numpy())
    all_labels.extend(dataset.labels.numpy())

all_features = np.array(all_features)
all_labels = np.array(all_labels)

X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)

test_dataset = CustomDataset(X_test, y_test)

In [48]:
num_features = X_train.shape[1]
num_features

128

In [36]:
# Initializing models for each dataset
models = [MLP(dim_in=num_features, dim_hidden1=100, dim_hidden2=50, dim_out=2) for _ in datasets]

In [37]:
# Training the models in a federated way with aggregation
global_model = federated_training(models, datasets, epochs=200)



Training model for Dataset 1/15
Dataset 1 model training, Epoch 1, Loss: 1.3224870327271914
Dataset 1 model training, Epoch 2, Loss: 1.0595907180717117
Dataset 1 model training, Epoch 3, Loss: 0.9137440327751009
Dataset 1 model training, Epoch 4, Loss: 0.8310335505949823
Dataset 1 model training, Epoch 5, Loss: 0.7868308358286557
Dataset 1 model training, Epoch 6, Loss: 0.7576124695570845
Dataset 1 model training, Epoch 7, Loss: 0.7365399563783094
Dataset 1 model training, Epoch 8, Loss: 0.7189139908081607
Dataset 1 model training, Epoch 9, Loss: 0.7021303506273973
Dataset 1 model training, Epoch 10, Loss: 0.6827802148304487
Dataset 1 model training, Epoch 11, Loss: 0.667420935277876
Dataset 1 model training, Epoch 12, Loss: 0.6578408524786171
Dataset 1 model training, Epoch 13, Loss: 0.643944878523287
Dataset 1 model training, Epoch 14, Loss: 0.626386142482883
Dataset 1 model training, Epoch 15, Loss: 0.6173758951849059
Dataset 1 model training, Epoch 16, Loss: 0.6081043891608715
Data

In [49]:
# Evaluating the aggregated model
evaluate(global_model, test_dataset)

Test set: Accuracy: 70.90530697190427%
