In [1]:
!pip install torch transformers peft datasets accelerate bitsandbytes scipy matplotlib


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [2]:
!pip install -q triton

In [3]:
from data_manager import DataManager
# Reload to ensure we use the latest fixed classes
import core_blockwise
import importlib
importlib.reload(core_blockwise)
from core_blockwise import FederatedClientBlockWise, ServerBlockWise
import torch
import gc

# --- Experiment configuration ------------------------------------------------
# Declared once so the client count cannot drift between the server, the
# client list and the per-client state list.
NUM_CLIENTS = 10
NUM_SMALL_CLIENTS = 5   # clients [0, NUM_SMALL_CLIENTS) use "small", the rest "large"
ROUNDS = 5
SEED = 42               # use the same value for every run you intend to compare
BANNER = "=============================================="

# 1. Setup Data
print("--- Setting up Data & Server ---")
dm = DataManager()
datasets = dm.setup_real_world_benchmark()

# 2. Setup Server
server = ServerBlockWise(num_clients=NUM_CLIENTS)

# 3. Setup clients (heterogeneous: a "small" group followed by a "large" group)
# Seed torch's global RNG (CPU and CUDA) before any client exists so repeated
# runs start from the same point. This only covers randomness drawn from
# torch; whether the client/data code also uses `random` or numpy is not
# visible from this notebook -- TODO confirm and seed those too if needed.
torch.manual_seed(SEED)
clients_baseline = [
    FederatedClientBlockWise(
        client_id,
        "small" if client_id < NUM_SMALL_CLIENTS else "large",
        datasets[client_id],
    )
    for client_id in range(NUM_CLIENTS)
]

# 4. Run Baseline Simulation
print(f"\n{BANNER}")
print("STARTING BASELINE: GRADIENT MODE (SimGrad)")
print("Architecture: Block-Wise Deep Adapters")
print(BANNER)

# One entry per client; None in the first round means "no personalized global
# received from the server yet".
personalized_globals = [None] * NUM_CLIENTS

for r in range(ROUNDS):
    print(f"\n>>> Round {r+1} (grad) <<<")
    updates = []

    for i, client in enumerate(clients_baseline):
        # Release Python garbage and cached CUDA blocks before each client trains.
        gc.collect()
        torch.cuda.empty_cache()

        # KEY: mode="grad" runs the Baseline logic.
        # train_and_rela logs its own progress, e.g.
        #   "> [C0] Training on 4 examples..." / "- Final Loss: 10.3846"
        update = client.train_and_rela(personalized_globals[i], mode="grad")
        updates.append(update)

    # Server aggregation: the result is fed back to the clients, indexed by
    # client position, at the start of the next round.
    personalized_globals = server.aggregate(updates)

    # Drop this round's updates before the next round allocates new ones.
    del updates
    gc.collect()

print("\nBaseline Simulation Complete.")

--- Setting up Data & Server ---
--- Initializing 'Scaled-Up' Benchmark (Mimicking DRAKE) ---
Loading MNLI (Relation Proxy)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

mnli/train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

mnli/validation_matched-00000-of-00001.p(…):   0%|          | 0.00/1.21M [00:00<?, ?B/s]

mnli/validation_mismatched-00000-of-0000(…):   0%|          | 0.00/1.25M [00:00<?, ?B/s]

mnli/test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

mnli/test_mismatched-00000-of-00001.parq(…):   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Loading CommonsenseQA (Reasoning Proxy)...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/151k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9741 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1140 [00:00<?, ? examples/s]

Loading SQuAD (VQA Proxy)...


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Loading GSM8K (Math Expert)...


README.md: 0.00B [00:00, ?B/s]

main/train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

main/test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

✅ Created 10 Clients covering 4 Distinct Cognitive Domains.

STARTING BASELINE: GRADIENT MODE (SimGrad)
Architecture: Block-Wise Deep Adapters

>>> Round 1 (grad) <<<
  > [C0] Training on 4 examples...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


    - Final Loss: 10.3846
  > [C1] Training on 4 examples...
    - Final Loss: 6.2812
  > [C2] Training on 4 examples...
    - Final Loss: 9.7976
  > [C3] Training on 4 examples...
    - Final Loss: 4.6803
  > [C4] Training on 4 examples...
    - Final Loss: 4.7952
  > [C5] Training on 4 examples...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

    - Final Loss: 3.6559
  > [C6] Training on 4 examples...
    - Final Loss: 2.3980
  > [C7] Training on 4 examples...
    - Final Loss: 3.1330
  > [C8] Training on 4 examples...
    - Final Loss: 2.7711
  > [C9] Training on 4 examples...
    - Final Loss: 1.2688
Server (BlockWise): Aggregating...

>>> Round 2 (grad) <<<
  > [C0] Training on 4 examples...
    - Final Loss: 10.5013
  > [C1] Training on 4 examples...
    - Final Loss: 6.3232
  > [C2] Training on 4 examples...
    - Final Loss: 7.5036
  > [C3] Training on 4 examples...
    - Final Loss: 4.7332
  > [C4] Training on 4 examples...
    - Final Loss: 4.6621
  > [C5] Training on 4 examples...
    - Final Loss: 3.7718
  > [C6] Training on 4 examples...
    - Final Loss: 2.5150
  > [C7] Training on 4 examples...
    - Final Loss: 1.9792
  > [C8] Training on 4 examples...
    - Final Loss: 3.7995
  > [C9] Training on 4 examples...
    - Final Loss: 1.3012
Server (BlockWise): Aggregating...

>>> Round 3 (grad) <<<
  > [C0] Trainin

In [None]:
# --- Experiment configuration ------------------------------------------------
NUM_CLIENTS = 10
NUM_SMALL_CLIENTS = 5   # clients [0, NUM_SMALL_CLIENTS) use "small", the rest "large"
ROUNDS = 5
SEED = 42               # use the same value for every run you intend to compare
BANNER = "=============================================="

# 1. Setup a fresh server and fresh clients for RepSim.
# The previous `server` instance has already aggregated another run's updates.
# Whether ServerBlockWise keeps state across aggregate() calls is not visible
# from this notebook, so rebuild it to guarantee this run starts clean.
server = ServerBlockWise(num_clients=NUM_CLIENTS)

# Actually reset torch's RNG (CPU and CUDA) so the clients start from a known
# seed point; merely re-creating the clients does not rewind the RNG. This
# only covers randomness drawn from torch -- TODO confirm whether the client
# code also uses `random` or numpy.
torch.manual_seed(SEED)
clients_repsim = [
    FederatedClientBlockWise(
        client_id,
        "small" if client_id < NUM_SMALL_CLIENTS else "large",
        datasets[client_id],
    )
    for client_id in range(NUM_CLIENTS)
]

# 2. Run RepSim Simulation
print(f"\n{BANNER}")
print("STARTING NOVEL METHOD: REPSIM MODE")
print("Architecture: Block-Wise Deep Adapters")
print(BANNER)

# One entry per client; None in the first round means "no personalized global
# received from the server yet".
personalized_globals = [None] * NUM_CLIENTS

for r in range(ROUNDS):
    print(f"\n>>> Round {r+1} (repsim) <<<")
    updates = []

    for i, client in enumerate(clients_repsim):
        # Release Python garbage and cached CUDA blocks before each client trains.
        gc.collect()
        torch.cuda.empty_cache()

        # KEY: mode="repsim" runs the Novel logic
        update = client.train_and_rela(personalized_globals[i], mode="repsim")
        updates.append(update)

    # Server aggregation: the result is fed back to the clients, indexed by
    # client position, at the start of the next round.
    personalized_globals = server.aggregate(updates)

    # Drop this round's updates before the next round allocates new ones.
    del updates
    gc.collect()

print("\nRepSim Simulation Complete.")


STARTING NOVEL METHOD: REPSIM MODE
Architecture: Block-Wise Deep Adapters

>>> Round 1 (repsim) <<<
  > [C0] Training on 4 examples...
    - Final Loss: 12.2837
  > [C1] Training on 4 examples...
    - Final Loss: 5.9409
  > [C2] Training on 4 examples...
    - Final Loss: 4.8361
  > [C3] Training on 4 examples...
    - Final Loss: 4.8162
  > [C4] Training on 4 examples...
    - Final Loss: 5.1167
