In [1]:
!pip install -q flwr[simulation] torch torchvision

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.1/330.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-metadata 1.15.0 requires protobuf<4.21,>=3.20.3; python_version < "3.11", but you have protobuf 4.25.3 which is incompatible.[0m[31m
[0m

In [1]:
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10
import time
import flwr as fl

DEVICE = torch.device("cuda")  # Try "cuda" to train on GPU
print(
    f"Training on {DEVICE} using PyTorch {torch.__version__} and Flower {fl.__version__}"
)

Training on cuda using PyTorch 1.13.1 and Flower 1.4.0


In [2]:
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
   process = psutil.Process(os.getpid())
   print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
   print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Gen RAM Free: 125.8 GB  | Proc size: 303.7 MB
GPU RAM Free: 15926MB | Used: 242MB | Util   1% | Total 16376MB


In [3]:
NUM_CLIENTS = 10


def load_datasets(num_clients: int):
    # Download and transform CIFAR-10 (train and test)
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
    )
    trainset = CIFAR10("./dataset", train=True, download=True, transform=transform)
    testset = CIFAR10("./dataset", train=False, download=True, transform=transform)

    # Split training set into `num_clients` partitions to simulate different local datasets
    partition_size = len(trainset) // num_clients
    lengths = [partition_size] * num_clients
    datasets = random_split(trainset, lengths, torch.Generator().manual_seed(42))

    # Split each partition into train/val and create DataLoader
    trainloaders = []
    valloaders = []
    for ds in datasets:
        len_val = len(ds) // 10  # 10 % validation set
        len_train = len(ds) - len_val
        lengths = [len_train, len_val]
        ds_train, ds_val = random_split(ds, lengths, torch.Generator().manual_seed(42))
        trainloaders.append(DataLoader(ds_train, batch_size=32, shuffle=True))
        valloaders.append(DataLoader(ds_val, batch_size=32))
    testloader = DataLoader(testset, batch_size=32)
    return trainloaders, valloaders, testloader


trainloaders, valloaders, testloader = load_datasets(NUM_CLIENTS)

Files already downloaded and verified
Files already downloaded and verified


In [4]:
class Net(nn.Module):
    def __init__(self) -> None:
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


def get_parameters(net) -> List[np.ndarray]:
    return [val.cpu().numpy() for _, val in net.state_dict().items()]


def set_parameters(net, parameters: List[np.ndarray]):
    params_dict = zip(net.state_dict().keys(), parameters)
    state_dict = OrderedDict({k: torch.Tensor(v) for k, v in params_dict})
    net.load_state_dict(state_dict, strict=True)


def train(net, trainloader, epochs: int):
    """Train the network on the training set."""
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters())
    net.train()
    for epoch in range(epochs):  # Use the passed 'epochs' variable here
        correct, total, epoch_loss = 0, 0, 0.0
        for images, labels in trainloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = net(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Metrics
            epoch_loss += loss.item()  # Make sure to call .item() to get the scalar value
            total += labels.size(0)
            correct += (torch.max(outputs.data, 1)[1] == labels).sum().item()
        epoch_loss /= len(trainloader.dataset)
        epoch_acc = correct / total
        print(f"Epoch {epoch}: train loss {epoch_loss:.6f}, accuracy {epoch_acc:.6f}")



def test(net, testloader):
    """Evaluate the network on the entire test set."""
    criterion = torch.nn.CrossEntropyLoss()
    correct, total, loss = 0, 0, 0.0
    net.eval()
    with torch.no_grad():
        for images, labels in testloader:
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = net(images)
            loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    loss /= len(testloader.dataset)
    accuracy = correct / total
    return loss, accuracy

In [5]:
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, cid, net, trainloader, valloader):
        self.cid = cid
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self):
        print(f"[Client {self.cid}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        print(f"[Client {self.cid}] fit, config: {config}")
        set_parameters(self.net, parameters)
        epochs = config.get("epochs", 1)
        start_time = time.time()  # Start time measurement
        train(self.net, self.trainloader, epochs)
        training_time = time.time() - start_time  # Calculate duration
        print(f"Training time for Client {self.cid}: {training_time:.2f} seconds")
        return get_parameters(self.net), len(self.trainloader), {"training_time": training_time}



    def evaluate(self, parameters, config):
        print(f"[Client {self.cid}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}


def client_fn(cid) -> FlowerClient:
    net = Net().to(DEVICE)
    trainloader = trainloaders[int(cid)]
    valloader = valloaders[int(cid)]
    return FlowerClient(cid, net, trainloader, valloader)

In [6]:
from typing import Callable, Union

from flwr.common import (
    EvaluateIns,
    EvaluateRes,
    FitIns,
    FitRes,
    MetricsAggregationFn,
    NDArrays,
    Parameters,
    Scalar,
    ndarrays_to_parameters,
    parameters_to_ndarrays,
)
from flwr.server.client_manager import ClientManager
from flwr.server.client_proxy import ClientProxy
from flwr.server.strategy.aggregate import aggregate, weighted_loss_avg


class FedCustom(fl.server.strategy.Strategy):
    def __init__(
        self,
        fraction_fit: float = 1.0,
        fraction_evaluate: float = 1.0,
        min_fit_clients: int = 2,
        min_evaluate_clients: int = 2,
        min_available_clients: int = 2,
    ) -> None:
        super().__init__()
        self.fraction_fit = fraction_fit
        self.fraction_evaluate = fraction_evaluate
        self.min_fit_clients = min_fit_clients
        self.min_evaluate_clients = min_evaluate_clients
        self.min_available_clients = min_available_clients
        self.client_training_times = {}
    def __repr__(self) -> str:
        return "FedCustom"

    def initialize_parameters(
        self, client_manager: ClientManager
    ) -> Optional[Parameters]:
        """Initialize global model parameters."""
        net = Net()
        ndarrays = get_parameters(net)
        return fl.common.ndarrays_to_parameters(ndarrays)

    def configure_fit(self, server_round: int, parameters: Parameters, client_manager: ClientManager):
        sample_size, min_num_clients = self.num_fit_clients(client_manager.num_available())
        clients = client_manager.sample(num_clients=sample_size, min_num_clients=min_num_clients)
        epochs_sc = 5
        epochs_hl = 3

        standard_config = {"lr": 0.001, "epochs": epochs_sc}
        higher_lr_config = {"lr": 0.0001, "epochs": epochs_hl}
        fit_configurations = []

        for client in clients:
            # Choose config based on the previous training time
            last_time = self.client_training_times.get(client.cid, 0)  # Default to 0 if no time recorded
            print(f"This is the last time {last_time}")
            


            config_to_use = standard_config if last_time < 13.8 else higher_lr_config
            fit_configurations.append((client, FitIns(parameters, config_to_use)))

        return fit_configurations

    def aggregate_fit(
        self,
        server_round: int,
        results: List[Tuple[ClientProxy, FitRes]],
        failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]],
    ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]:
        """Aggregate fit results using weighted average."""
        for client, fit_res in results:
            # Update training times for each client
            self.client_training_times[client.cid] = fit_res.metrics.get("training_time", 0)
        weights_results = [
            (parameters_to_ndarrays(fit_res.parameters), fit_res.num_examples)
            for _, fit_res in results
        ]
        parameters_aggregated = ndarrays_to_parameters(aggregate(weights_results))
        metrics_aggregated = {}
        return parameters_aggregated, metrics_aggregated


    def configure_evaluate(
        self, server_round: int, parameters: Parameters, client_manager: ClientManager
    ) -> List[Tuple[ClientProxy, EvaluateIns]]:
        """Configure the next round of evaluation."""
        if self.fraction_evaluate == 0.0:
            return []
        config = {}
        evaluate_ins = EvaluateIns(parameters, config)

        # Sample clients
        sample_size, min_num_clients = self.num_evaluation_clients(
            client_manager.num_available()
        )
        clients = client_manager.sample(
            num_clients=sample_size, min_num_clients=min_num_clients
        )

        # Return client/config pairs
        return [(client, evaluate_ins) for client in clients]

    def aggregate_evaluate(
        self,
        server_round: int,
        results: List[Tuple[ClientProxy, EvaluateRes]],
        failures: List[Union[Tuple[ClientProxy, EvaluateRes], BaseException]],
    ) -> Tuple[Optional[float], Dict[str, Scalar]]:
        """Aggregate evaluation losses using weighted average."""

        if not results:
            return None, {}

        loss_aggregated = weighted_loss_avg(
            [
                (evaluate_res.num_examples, evaluate_res.loss)
                for _, evaluate_res in results
            ]
        )
        metrics_aggregated = {}
        return loss_aggregated, metrics_aggregated

    def evaluate(
        self, server_round: int, parameters: Parameters
    ) -> Optional[Tuple[float, Dict[str, Scalar]]]:
        """Evaluate global model parameters using an evalua
        tion function."""

        # Let's assume we won't perform the global model evaluation on the server side.
        return None

    def num_fit_clients(self, num_available_clients: int) -> Tuple[int, int]:
        """Return sample size and required number of clients."""
        num_clients = int(num_available_clients * self.fraction_fit)
        return max(num_clients, self.min_fit_clients), self.min_available_clients

    def num_evaluation_clients(self, num_available_clients: int) -> Tuple[int, int]:
        """Use a fraction of available clients for evaluation."""
        num_clients = int(num_available_clients * self.fraction_evaluate)
        return max(num_clients, self.min_evaluate_clients), self.min_available_clients

In [7]:
if DEVICE.type == "cuda":
    # Use a single client to train the global model
    client_resources = {"num_gpus": .25, "num_cpus": 2} 

In [8]:

fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=10,
    config=fl.server.ServerConfig(num_rounds=3),
    strategy=FedCustom(),  # <-- pass the new strategy here
    client_resources=client_resources,

)

INFO flwr 2024-06-26 19:48:33,238 | app.py:146 | Starting Flower simulation, config: ServerConfig(num_rounds=3, round_timeout=None)
2024-06-26 19:48:37,510	INFO worker.py:1752 -- Started a local Ray instance.
INFO flwr 2024-06-26 19:48:39,429 | app.py:180 | Flower VCE: Ray initialized with resources: {'node:127.0.0.1': 1.0, 'memory': 77768510055.0, 'object_store_memory': 37615075737.0, 'accelerator_type:RTX': 1.0, 'GPU': 1.0, 'CPU': 32.0, 'node:__internal_head__': 1.0}
INFO flwr 2024-06-26 19:48:39,431 | server.py:86 | Initializing global parameters
INFO flwr 2024-06-26 19:48:39,437 | server.py:269 | Using initial parameters provided by strategy
INFO flwr 2024-06-26 19:48:39,438 | server.py:88 | Evaluating initial parameters
INFO flwr 2024-06-26 19:48:39,439 | server.py:101 | FL starting
DEBUG flwr 2024-06-26 19:48:39,441 | server.py:218 | fit_round 1: strategy sampled 10 clients (out of 10)


This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
This is the last time 0
[36m(launch_and_fit pid=22360)[0m [Client 0] fit, config: {'lr': 0.001, 'epochs': 5}
[36m(launch_and_fit pid=8476)[0m Epoch 0: train loss 0.065231, accuracy 0.226667
[36m(launch_and_fit pid=33720)[0m [Client 5] fit, config: {'lr': 0.001, 'epochs': 5}[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(launch_and_fit pid=8476)[0m Epoch 2: train loss 0.051311, accuracy 0.399778[32m [repeated 8x across cluster][0m
[36m(launch_and_fit pid=8476)[0m Training time for Client 4: 21.68 seconds
[36m(launch_and_fit pid=33720)[0m Epoch 4: train loss 0.047180, accura

DEBUG flwr 2024-06-26 19:50:09,686 | server.py:232 | fit_round 1 received 10 results and 0 failures
DEBUG flwr 2024-06-26 19:50:09,719 | server.py:168 | evaluate_round 1: strategy sampled 10 clients (out of 10)


[36m(launch_and_fit pid=27452)[0m Epoch 4: train loss 0.047442, accuracy 0.442889[32m [repeated 4x across cluster][0m
[36m(launch_and_evaluate pid=17612)[0m [Client 8] evaluate, config: {}
[36m(launch_and_fit pid=27452)[0m Training time for Client 6: 20.49 seconds
[36m(launch_and_evaluate pid=31724)[0m [Client 1] evaluate, config: {}[32m [repeated 4x across cluster][0m
[36m(launch_and_evaluate pid=30540)[0m [Client 6] evaluate, config: {}
[36m(launch_and_evaluate pid=3460)[0m [Client 4] evaluate, config: {}
[36m(launch_and_evaluate pid=25824)[0m [Client 9] evaluate, config: {}[32m [repeated 2x across cluster][0m
[36m(launch_and_evaluate pid=31760)[0m [Client 3] evaluate, config: {}
[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1883, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1984, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\A

[36m(launch_and_evaluate pid=25824)[0m *** SIGABRT received at time=1719411777 ***
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA28B1DD61  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FF996415136  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA28B1D492  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FF6354C2297  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA285CDD31  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA2AF4AD6C  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA2AF33CC6  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA2AF48CDF  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA2AED5BEA  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824)[0m     @   00007FFA2AED2EF1  (unknown)  (unknown)
[36m(launch_and_evaluate pid=25824

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: c5b8c284d98d05d24f025d3d3fde4d76d59e7d8a01000000 Worker ID: 8208bc2c54578c5e7347132c1b03e29925363c540f03af4258d6b7e1 Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 64911 Worker PID: 31760 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[36m(launch_and_evaluate pid=4304)[0m [Client 3] evaluate, config: {}
[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1883, 

[36m(launch_and_evaluate pid=4304)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 459 in _conv_forward
[36m(launch_and_evaluate pid=4304)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 463 in forward
[36m(launch_and_evaluate pid=4304)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1194 in _call_impl
[36m(launch_and_evaluate pid=4304)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 13 in forward
[36m(launch_and_evaluate pid=4304)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1194 in _call_impl
[36m(launch_and_evaluate pid=4304)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 63 in test
[36m(launch_and_evaluate pid=4304)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380

[36m(launch_and_evaluate pid=30352)[0m [Client 9] evaluate, config: {}


[36m(launch_and_evaluate pid=10376)[0m     @   00007FF9964F1F64  (unknown)  (unknown)[32m [repeated 32x across cluster][0m
[36m(launch_and_evaluate pid=10376)[0m Stack (most recent call first):[32m [repeated 2x across cluster][0m
[36m(launch_and_evaluate pid=10376)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\worker.py", line 879 in main_loop[32m [repeated 2x across cluster][0m
[36m(launch_and_evaluate pid=10376)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\workers\default_worker.py", line 282 in <module>
[36m(launch_and_evaluate pid=30352)[0m fatal   : Memory allocation failure[32m [repeated 15x across cluster][0m
[36m(launch_and_evaluate pid=30352)[0m *** SIGABRT received at time=1719411799 ***
[36m(launch_and_evaluate pid=30352)[0m Fatal Python error: Aborted
[36m(launch_and_evaluate pid=30352)[0m 
[36m(launch_and_evaluate pid=30352)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpy

[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1883, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1984, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 160, in launch_and_evaluate
    return maybe_call_evaluate(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 205, in maybe_call_evaluate
    return client.evaluate(evaluate_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 321, in _evaluate
    results = self.numpy_client.evaluate(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 27, in evaluate
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 61, in test
  F

[36m(launch_and_evaluate pid=30352)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\workers\default_worker.py", line 282 in <module>


[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: d7bedbccd5c5391364a6653c81777704b926f6a101000000 Worker ID: be1973de6974082cd9b90c8dd73b32e10d66908844a33f27b44cd51b Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 64970 Worker PID: 30352 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[36m(launch_and_evaluate pid=22380)[0m [Client 9] evaluate, config: {}
[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1883,

[36m(launch_and_evaluate pid=20192)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 459 in _conv_forward
[36m(launch_and_evaluate pid=20192)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 463 in forward
[36m(launch_and_evaluate pid=20192)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1194 in _call_impl
[36m(launch_and_evaluate pid=20192)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 12 in forward
[36m(launch_and_evaluate pid=20192)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1194 in _call_impl
[36m(launch_and_evaluate pid=20192)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 63 in test
[36m(launch_and_evaluate pid=20192)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_53

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 719e9ddcb7e91b511e1767bff0d3af3d5c72d24801000000 Worker ID: 4556dccfd0eb2f3765643a4e83197e96592fee1cf82baf85db5f76f6 Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 64988 Worker PID: 20192 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[36m(launch_and_evaluate pid=20192)[0m [Client 3] evaluate, config: {}


ERROR flwr 2024-06-26 19:53:31,102 | ray_client_proxy.py:104 | The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
DEBUG flwr 2024-06-26 19:53:31,107 | server.py:182 | evaluate_round 1 received 8 results and 2 failures
DEBUG flwr 2024-06-26 19:53:31,110 | server.py:218 | fit_round 2: strategy sampled 10 clients (out of 10)


[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: bfacc84ab396c91e3623e76bb38414633d2244f801000000 Worker ID: 6d23067eb7587edfcce4c26c471636ee7c8fa1eb9490d310e44de4c2 Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 64995 Worker PID: 22380 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
This is the last time 23.490320444107056
This is the last time 21.794506311416626
This is the last time 21.924933195114136
This is the last time 20.490160942077637
This 

[36m(launch_and_fit pid=22008)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 41 in train
[36m(launch_and_fit pid=22008)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 17 in fit
[36m(launch_and_fit pid=22008)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297 in _fit
[36m(launch_and_fit pid=22008)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184 in maybe_call_fit
[36m(launch_and_fit pid=22008)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148 in launch_and_fit
[36m(launch_and_fit pid=22008)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 459 in _conv_forward[32m [repeated 2x across cluster][0m
[36m(launch_and_fit pid=22008)[0m   File "C:\Users\Admin\AppData\Local\Te

[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1836, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1870, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 968, in ray._raylet.raise_if_dependency_failed
ray.exceptions.RaySystemError: System error: 
traceback: Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 270, in _deserialize_object
    return self._deserialize_msgpack_data(data, metadata_fields)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 225, in _deserialize_msgpack_data
    python_objects = self._deserialize_pickle5_data(pickle5_data)
  File "c:\Users\Admin\anac

[36m(launch_and_fit pid=3264)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 41 in train[32m [repeated 2x across cluster][0m
[36m(launch_and_fit pid=3264)[0m   File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 17 in fit
[36m(launch_and_fit pid=3264)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297 in _fit
[36m(launch_and_fit pid=23656)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184 in maybe_call_fit[32m [repeated 2x across cluster][0m
[36m(launch_and_fit pid=23656)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148 in launch_and_fit[32m [repeated 2x across cluster][0m
[36m(launch_and_fit pid=23656)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\conv.py", line 459 in _conv_forward[3

[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1883, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1984, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148, in launch_and_fit
    return maybe_call_fit(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184, in maybe_call_fit
    return client.fit(fit_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297, in _fit
    results = self.numpy_client.fit(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 17, in fit
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 41, in train
  File "c:\Users\Admin\anaconda3\envs\flwr

ERROR flwr 2024-06-26 19:53:53,229 | ray_client_proxy.py:87 | [36mray::launch_and_fit()[39m (pid=28252, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1836, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1870, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 968, in ray._raylet.raise_if_dependency_failed
ray.exceptions.RaySystemError: System error: [WinError 1455] The paging file is too small for this operation to complete. Error loading "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\lib\cudnn_cnn_train64_8.dll" or one of its dependencies.
traceback: Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 270, in _deserialize_object
    return self.

[36m(launch_and_fit pid=33668)[0m [Client 4] fit, config: {'lr': 0.0001, 'epochs': 3}[32m [repeated 2x across cluster][0m


2024-06-26 19:53:55,682	ERROR serialization.py:406 -- Unable to allocate internal buffer.
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 292, in _deserialize_object
    obj = self._deserialize_msgpack_data(data, metadata_fields)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 222, in _deserialize_msgpack_data
    msgpack_data, pickle5_data = split_buffer(data)
  File "python\ray\includes/serialization.pxi", line 206, in ray._raylet.split_buffer
  File "msgpack\\_unpacker.pyx", line 372, in msgpack._cmsgpack.Unpacker.__init__
MemoryError: Unable to allocate internal buffer.
ERROR flwr 2024-06-26 19:53:55,692 | ray_client_proxy.py:87 | System error

[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1836, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1870, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 968, in ray._raylet.raise_if_dependency_failed
ray.exceptions.RaySystemError: System error: [WinError 1455] The paging file is too small for this operation to complete. Error loading "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\lib\cudnn_cnn_train64_8.dll" or one of its dependencies.
traceback: Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 270, in _deserialize_object
    return self._deserialize_msgpack_data(data, metadata_fields)
  File "c:\Users\A

[36m(launch_and_fit pid=26892)[0m Unable to allocate internal buffer.
[36m(launch_and_fit pid=26892)[0m     msgpack_data, pickle5_data = split_buffer(data)
[36m(launch_and_fit pid=26892)[0m   File "python\ray\includes/serialization.pxi", line 206, in ray._raylet.split_buffer
[36m(launch_and_fit pid=26892)[0m   File "msgpack\\_unpacker.pyx", line 372, in msgpack._cmsgpack.Unpacker.__init__
[36m(launch_and_fit pid=26892)[0m MemoryError: Unable to allocate internal buffer.
[36m(launch_and_fit pid=26892)[0m Unable to allocate internal buffer.
[36m(launch_and_fit pid=26892)[0m     msgpack_data, pickle5_data = split_buffer(data)
[36m(launch_and_fit pid=26892)[0m   File "python\ray\includes/serialization.pxi", line 206, in ray._raylet.split_buffer
[36m(launch_and_fit pid=26892)[0m   File "msgpack\\_unpacker.pyx", line 372, in msgpack._cmsgpack.Unpacker.__init__
[36m(launch_and_fit pid=26892)[0m MemoryError: Unable to allocate internal buffer.
[36m(launch_and_fit pid=12048

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 705c9fe83b2d3f75e1191b2516690cc3b7becfda01000000 Worker ID: e9981a923491408a58948138723269cb066f43f097498fad3994f987 Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 65316 Worker PID: 12048 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
[33m(raylet)[0m Traceback (most recent call last):
  File "python\ray\_raylet.pyx", line 1836, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1870, 

[36m(launch_and_fit pid=32128)[0m     @   00007FF9964F1F64  (unknown)  (unknown)[32m [repeated 62x across cluster][0m
[36m(launch_and_fit pid=32128)[0m [WinError 1455] The paging file is too small for this operation to complete. Error loading "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\lib\cudnn_cnn_train64_8.dll" or one of its dependencies.[32m [repeated 3x across cluster][0m
[36m(launch_and_fit pid=32128)[0m Traceback (most recent call last):[32m [repeated 9x across cluster][0m
[36m(launch_and_fit pid=32128)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects[32m [repeated 9x across cluster][0m
[36m(launch_and_fit pid=32128)[0m     obj = self._deserialize_object(data, metadata, object_ref)[32m [repeated 9x across cluster][0m
[36m(launch_and_fit pid=32128)[0m   File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", 

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: 8f0a191b4d7a62b824fe702c1704ec31365e4d9d01000000 Worker ID: 514145b0510caba696b22469ad40d4bd86fc2c2463dbc666f2341a0d Node ID: 545ae7ca41736b854d81c0bc1e3553d05d65ae68e3db45c76c5d2cfb Worker IP address: 127.0.0.1 Worker port: 65333 Worker PID: 32128 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 10054. An existing connection was forcibly closed by the remote host. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.


ERROR flwr 2024-06-26 19:56:25,428 | ray_client_proxy.py:104 | [36mray::launch_and_evaluate()[39m (pid=2348, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1836, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 1870, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 968, in ray._raylet.raise_if_dependency_failed
ray.exceptions.RaySystemError: System error: [WinError 1455] The paging file is too small for this operation to complete. Error loading "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\lib\cudnn_adv_train64_8.dll" or one of its dependencies.
traceback: Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 404, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\_private\serialization.py", line 270, in _deserialize_object
    return 

[36m(launch_and_evaluate pid=14820)[0m [Client 1] evaluate, config: {}


ERROR flwr 2024-06-26 19:56:32,123 | ray_client_proxy.py:104 | [36mray::launch_and_evaluate()[39m (pid=27080, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 160, in launch_and_evaluate
    return maybe_call_evaluate(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 205, in maybe_call_evaluate
    return client.evaluate(evaluate_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 321, in _evaluate
    results = self.numpy_client.evaluate(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 26, in evaluate
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packa

This is the last time 20.490160942077637
This is the last time 21.794506311416626
This is the last time 21.68092632293701
This is the last time 23.490320444107056
This is the last time 22.895992517471313
This is the last time 22.2886061668396
This is the last time 20.205161809921265
This is the last time 23.315999031066895
This is the last time 21.924933195114136
This is the last time 22.724000215530396
[36m(launch_and_evaluate pid=10324)[0m [Client 6] evaluate, config: {}[32m [repeated 6x across cluster][0m


ERROR flwr 2024-06-26 19:56:49,249 | ray_client_proxy.py:87 | [36mray::launch_and_fit()[39m (pid=8080, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148, in launch_and_fit
    return maybe_call_fit(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184, in maybe_call_fit
    return client.fit(fit_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297, in _fit
    results = self.numpy_client.fit(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 14, in fit
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1671, in 

[36m(launch_and_fit pid=8080)[0m [Client 6] fit, config: {'lr': 0.0001, 'epochs': 3}


ERROR flwr 2024-06-26 19:56:50,256 | ray_client_proxy.py:87 | [36mray::launch_and_fit()[39m (pid=1564, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148, in launch_and_fit
    return maybe_call_fit(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184, in maybe_call_fit
    return client.fit(fit_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297, in _fit
    results = self.numpy_client.fit(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 14, in fit
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1671, in 

[36m(launch_and_fit pid=32588)[0m [Client 0] fit, config: {'lr': 0.0001, 'epochs': 3}[32m [repeated 7x across cluster][0m


ERROR flwr 2024-06-26 19:56:56,464 | ray_client_proxy.py:87 | [36mray::launch_and_fit()[39m (pid=31076, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 148, in launch_and_fit
    return maybe_call_fit(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 184, in maybe_call_fit
    return client.fit(fit_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 297, in _fit
    results = self.numpy_client.fit(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 14, in fit
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\torch\nn\modules\module.py", line 1671, in

[36m(launch_and_evaluate pid=29104)[0m [Client 7] evaluate, config: {}
[36m(launch_and_fit pid=12732)[0m [Client 1] fit, config: {'lr': 0.0001, 'epochs': 3}[32m [repeated 2x across cluster][0m


ERROR flwr 2024-06-26 19:57:11,038 | ray_client_proxy.py:104 | [36mray::launch_and_evaluate()[39m (pid=28580, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 160, in launch_and_evaluate
    return maybe_call_evaluate(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 205, in maybe_call_evaluate
    return client.evaluate(evaluate_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 321, in _evaluate
    results = self.numpy_client.evaluate(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 26, in evaluate
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packa

[36m(launch_and_evaluate pid=27748)[0m [Client 1] evaluate, config: {}[32m [repeated 8x across cluster][0m


ERROR flwr 2024-06-26 19:57:18,747 | ray_client_proxy.py:104 | [36mray::launch_and_evaluate()[39m (pid=2924, ip=127.0.0.1)
  File "python\ray\_raylet.pyx", line 1889, in ray._raylet.execute_task
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\simulation\ray_transport\ray_client_proxy.py", line 160, in launch_and_evaluate
    return maybe_call_evaluate(
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\client.py", line 205, in maybe_call_evaluate
    return client.evaluate(evaluate_ins)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\flwr\client\app.py", line 321, in _evaluate
    results = self.numpy_client.evaluate(parameters, ins.config)  # type: ignore
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\1380938485.py", line 26, in evaluate
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_5312\3083245685.py", line 28, in set_parameters
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packag

History (loss, distributed):
	round 1: 0.05854799574613571

[36m(launch_and_evaluate pid=2924)[0m [Client 6] evaluate, config: {}


In [12]:
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, cid, net, trainloader, valloader):
        self.cid = cid
        self.net = net
        self.trainloader = trainloader
        self.valloader = valloader

    def get_parameters(self, config):
        print(f"[Client {self.cid}] get_parameters")
        return get_parameters(self.net)

    def fit(self, parameters, config):
        print(f"[Client {self.cid}] fit, config: {config}")
        set_parameters(self.net, parameters)
        train(self.net, self.trainloader, epochs=5)
        return get_parameters(self.net), len(self.trainloader), {}

    def evaluate(self, parameters, config):
        print(f"[Client {self.cid}] evaluate, config: {config}")
        set_parameters(self.net, parameters)
        loss, accuracy = test(self.net, self.valloader)
        print(f"Client {self.cid} loss {loss}")
        print(f"Client {self.cid} accuracy {accuracy}")
        
        return float(loss), len(self.valloader), {"accuracy": float(accuracy)}


def client_fn(cid) -> FlowerClient:
    net = Net().to(DEVICE) #Load Model from here
    trainloader = trainloaders[int(cid)]
    valloader = valloaders[int(cid)]
    return FlowerClient(cid, net, trainloader, valloader)

In [None]:
fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=10,
    config=fl.server.ServerConfig(num_rounds=3),
    client_resources=client_resources,
)

INFO flwr 2024-05-29 11:36:33,564 | app.py:146 | Starting Flower simulation, config: ServerConfig(num_rounds=3, round_timeout=None)


INFO flwr 2024-05-29 11:36:40,084 | app.py:180 | Flower VCE: Ray initialized with resources: {'GPU': 1.0, 'CPU': 32.0, 'memory': 61953181082.0, 'object_store_memory': 30837077606.0, 'node:127.0.0.1': 1.0}
INFO flwr 2024-05-29 11:36:40,087 | server.py:86 | Initializing global parameters
INFO flwr 2024-05-29 11:36:40,089 | server.py:273 | Requesting initial parameters from one random client
INFO flwr 2024-05-29 11:36:44,026 | server.py:277 | Received initial parameters from one random client
INFO flwr 2024-05-29 11:36:44,027 | server.py:88 | Evaluating initial parameters
INFO flwr 2024-05-29 11:36:44,029 | server.py:101 | FL starting
DEBUG flwr 2024-05-29 11:36:44,031 | server.py:218 | fit_round 1: strategy sampled 10 clients (out of 10)


 pid=27924)[0m [Client 3] get_parameters
 pid=27924)[0m [Client 8] fit, config: {}
 pid=27000)[0m [Client 9] fit, config: {}
 pid=26956)[0m [Client 5] fit, config: {}
 pid=19244)[0m [Client 3] fit, config: {}
 pid=27924)[0m Epoch 0: train loss 0.064283, accuracy 0.244000
 pid=27924)[0m Epoch 1: train loss 0.056345, accuracy 0.335556
 pid=27924)[0m Epoch 2: train loss 0.052596, accuracy 0.383111
 pid=27000)[0m Epoch 0: train loss 0.065390, accuracy 0.219333
 pid=26956)[0m Epoch 0: train loss 0.065386, accuracy 0.222667
 pid=27924)[0m Epoch 3: train loss 0.049705, accuracy 0.431111
 pid=19244)[0m Epoch 0: train loss 0.064883, accuracy 0.228667
 pid=27000)[0m Epoch 1: train loss 0.057237, accuracy 0.326667
 pid=26956)[0m Epoch 1: train loss 0.057874, accuracy 0.322444
 pid=27924)[0m Epoch 4: train loss 0.047198, accuracy 0.454667
 pid=27924)[0m [Client 1] fit, config: {}
 pid=19244)[0m Epoch 1: train loss 0.056371, accuracy 0.348444
 pid=27000)[0m Epoch 2: train loss 0.0

DEBUG flwr 2024-05-29 11:37:45,189 | server.py:232 | fit_round 1 received 10 results and 0 failures
DEBUG flwr 2024-05-29 11:37:45,235 | server.py:168 | evaluate_round 1: strategy sampled 10 clients (out of 10)


 pid=27000)[0m Epoch 4: train loss 0.048110, accuracy 0.436444
 pid=27000)[0m [Client 4] evaluate, config: {}
 pid=27000)[0m Client 4 loss 0.057416656255722044
 pid=27000)[0m Client 4 accuracy 0.348
 pid=27924)[0m [Client 8] evaluate, config: {}
 pid=27924)[0m Client 8 loss 0.05749241328239441
 pid=27924)[0m Client 8 accuracy 0.394
 pid=27924)[0m [Client 6] evaluate, config: {}
 pid=27000)[0m [Client 2] evaluate, config: {}
 pid=27924)[0m Client 6 loss 0.05749799847602844
 pid=27924)[0m Client 6 accuracy 0.348
 pid=27000)[0m Client 2 loss 0.058668370962142946
 pid=27000)[0m Client 2 accuracy 0.342
 pid=27000)[0m [Client 9] evaluate, config: {}
 pid=27000)[0m Client 9 loss 0.05861459255218506
 pid=27000)[0m Client 9 accuracy 0.36
 pid=27000)[0m [Client 1] evaluate, config: {}
 pid=27000)[0m Client 1 loss 0.057283300876617434
 pid=27000)[0m Client 1 accuracy 0.376


DEBUG flwr 2024-05-29 11:38:00,139 | server.py:182 | evaluate_round 1 received 10 results and 0 failures


 pid=27924)[0m [Client 5] evaluate, config: {}
 pid=27000)[0m [Client 0] evaluate, config: {}
 pid=26956)[0m [Client 7] evaluate, config: {}
 pid=19244)[0m [Client 3] evaluate, config: {}
 pid=27924)[0m Client 5 loss 0.05579921221733093
 pid=27924)[0m Client 5 accuracy 0.41
 pid=27000)[0m Client 0 loss 0.05796573948860168
 pid=27000)[0m Client 0 accuracy 0.36


DEBUG flwr 2024-05-29 11:38:00,144 | server.py:218 | fit_round 2: strategy sampled 10 clients (out of 10)


 pid=26956)[0m Client 7 loss 0.057370660543441775
 pid=26956)[0m Client 7 accuracy 0.362
 pid=19244)[0m Client 3 loss 0.057644266366958616
 pid=19244)[0m Client 3 accuracy 0.364
 pid=19244)[0m [Client 8] fit, config: {}
 pid=26956)[0m [Client 9] fit, config: {}
 pid=19244)[0m Epoch 0: train loss 0.050530, accuracy 0.418222
 pid=26956)[0m Epoch 0: train loss 0.051527, accuracy 0.400000
 pid=19244)[0m Epoch 1: train loss 0.046498, accuracy 0.465778
 pid=27924)[0m [Client 7] fit, config: {}
 pid=26956)[0m Epoch 1: train loss 0.047991, accuracy 0.431556
 pid=27000)[0m [Client 2] fit, config: {}
 pid=19244)[0m Epoch 2: train loss 0.044626, accuracy 0.484889
 pid=27924)[0m Epoch 0: train loss 0.050340, accuracy 0.412667
 pid=26956)[0m Epoch 2: train loss 0.045316, accuracy 0.465556
 pid=27000)[0m Epoch 0: train loss 0.051500, accuracy 0.403111
 pid=19244)[0m Epoch 3: train loss 0.042625, accuracy 0.503778
 pid=27924)[0m Epoch 1: train loss 0.047050, accuracy 0.448222
 pid=2

DEBUG flwr 2024-05-29 11:38:43,789 | server.py:232 | fit_round 2 received 10 results and 0 failures
DEBUG flwr 2024-05-29 11:38:43,823 | server.py:168 | evaluate_round 2: strategy sampled 10 clients (out of 10)


 pid=26956)[0m Epoch 4: train loss 0.041652, accuracy 0.512444
 pid=26956)[0m [Client 0] evaluate, config: {}
 pid=26956)[0m Client 0 loss 0.046705979347229004
 pid=26956)[0m Client 0 accuracy 0.474
 pid=26956)[0m [Client 7] evaluate, config: {}
 pid=26956)[0m Client 7 loss 0.04484793257713318
 pid=26956)[0m Client 7 accuracy 0.48
 pid=26956)[0m [Client 1] evaluate, config: {}
 pid=19244)[0m [Client 2] evaluate, config: {}
 pid=27924)[0m [Client 3] evaluate, config: {}
 pid=27000)[0m [Client 5] evaluate, config: {}
 pid=26956)[0m Client 1 loss 0.0451300802230835
 pid=26956)[0m Client 1 accuracy 0.474
 pid=27000)[0m Client 5 loss 0.043583230257034304
 pid=27000)[0m Client 5 accuracy 0.508
 pid=26956)[0m [Client 6] evaluate, config: {}
 pid=19244)[0m Client 2 loss 0.048428908586502076
 pid=19244)[0m Client 2 accuracy 0.438
 pid=27924)[0m Client 3 loss 0.04525577282905579
 pid=27924)[0m Client 3 accuracy 0.48
 pid=27000)[0m [Client 8] evaluate, config: {}
 pid=19244)

DEBUG flwr 2024-05-29 11:38:58,868 | server.py:182 | evaluate_round 2 received 10 results and 0 failures
DEBUG flwr 2024-05-29 11:38:58,870 | server.py:218 | fit_round 3: strategy sampled 10 clients (out of 10)


 pid=27924)[0m Client 9 loss 0.04738649296760559
 pid=27924)[0m Client 9 accuracy 0.486
 pid=27924)[0m [Client 7] fit, config: {}
 pid=27924)[0m Epoch 0: train loss 0.044175, accuracy 0.496000
 pid=27000)[0m [Client 9] fit, config: {}
 pid=19244)[0m [Client 4] fit, config: {}
 pid=27924)[0m Epoch 1: train loss 0.041426, accuracy 0.526000
 pid=27000)[0m Epoch 0: train loss 0.045275, accuracy 0.478444
 pid=19244)[0m Epoch 0: train loss 0.044497, accuracy 0.490667
 pid=27924)[0m Epoch 2: train loss 0.038939, accuracy 0.555778
 pid=27000)[0m Epoch 1: train loss 0.042404, accuracy 0.507333
 pid=26956)[0m [Client 5] fit, config: {}
 pid=19244)[0m Epoch 1: train loss 0.041429, accuracy 0.532222
 pid=27924)[0m Epoch 3: train loss 0.036637, accuracy 0.575778
 pid=27000)[0m Epoch 2: train loss 0.039452, accuracy 0.541556
 pid=26956)[0m Epoch 0: train loss 0.045420, accuracy 0.471778
 pid=19244)[0m Epoch 2: train loss 0.039223, accuracy 0.550667
 pid=27924)[0m Epoch 4: train los

DEBUG flwr 2024-05-29 11:39:46,728 | server.py:232 | fit_round 3 received 10 results and 0 failures
DEBUG flwr 2024-05-29 11:39:46,764 | server.py:168 | evaluate_round 3: strategy sampled 10 clients (out of 10)


 pid=27000)[0m Epoch 4: train loss 0.033772, accuracy 0.621778
 pid=27000)[0m [Client 7] evaluate, config: {}
 pid=27000)[0m Client 7 loss 0.04311247169971466
 pid=27000)[0m Client 7 accuracy 0.518
 pid=27000)[0m [Client 2] evaluate, config: {}
 pid=27924)[0m [Client 5] evaluate, config: {}
 pid=27000)[0m Client 2 loss 0.04556352210044861
 pid=27000)[0m Client 2 accuracy 0.5
 pid=26956)[0m [Client 3] evaluate, config: {}
 pid=27924)[0m Client 5 loss 0.0403660671710968
 pid=27924)[0m Client 5 accuracy 0.516
 pid=26956)[0m Client 3 loss 0.041824417114257814
 pid=26956)[0m Client 3 accuracy 0.554
 pid=26956)[0m [Client 6] evaluate, config: {}
 pid=26956)[0m Client 6 loss 0.04151818835735321
 pid=26956)[0m Client 6 accuracy 0.526
 pid=26956)[0m [Client 0] evaluate, config: {}
 pid=26956)[0m Client 0 loss 0.04397455382347107
 pid=26956)[0m Client 0 accuracy 0.51
 pid=27924)[0m [Client 9] evaluate, config: {}
 pid=27000)[0m [Client 1] evaluate, config: {}
 pid=26956)[0m

DEBUG flwr 2024-05-29 11:40:01,820 | server.py:182 | evaluate_round 3 received 10 results and 0 failures
INFO flwr 2024-05-29 11:40:01,822 | server.py:147 | FL finished in 197.79143010000007
INFO flwr 2024-05-29 11:40:01,824 | app.py:218 | app_fit: losses_distributed [(1, 0.05757532110214233), (2, 0.045601885247230536), (3, 0.042524653851985936)]
INFO flwr 2024-05-29 11:40:01,825 | app.py:219 | app_fit: metrics_distributed_fit {}
INFO flwr 2024-05-29 11:40:01,826 | app.py:220 | app_fit: metrics_distributed {}
INFO flwr 2024-05-29 11:40:01,827 | app.py:221 | app_fit: losses_centralized []
INFO flwr 2024-05-29 11:40:01,827 | app.py:222 | app_fit: metrics_centralized {}


History (loss, distributed):
	round 1: 0.05757532110214233
	round 2: 0.045601885247230536
	round 3: 0.042524653851985936

 pid=27924)[0m Client 9 loss 0.04389840006828308
 pid=27924)[0m Client 9 accuracy 0.516
 pid=27000)[0m Client 1 loss 0.04200864565372467
 pid=27000)[0m Client 1 accuracy 0.526
 pid=26956)[0m Client 4 loss 0.042090866446495054
 pid=26956)[0m Client 4 accuracy 0.498
 pid=19244)[0m Client 8 loss 0.040889406085014346
 pid=19244)[0m Client 8 accuracy 0.53


Exception in thread 2024-05-29 18:33:30,550	ERROR import_thread.py:85 -- ImportThread: <_MultiThreadedRendezvous of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "Stream removed"
	debug_error_string = "{"created":"@1716987810.545000000","description":"Error received from peer ipv4:127.0.0.1:62225","file":"src/core/lib/surface/call.cc","file_line":1075,"grpc_message":"Stream removed","grpc_status":2}"
>
ray_listen_error_messages:
Traceback (most recent call last):
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Admin\anaconda3\envs\flwrpytorch\lib\site-packages\ray\worker.py", line 1311, in listen_error_messag