In [1]:
import os
# Expose only GPU index 3 to CUDA. This is set here, before torch is imported
# in the next cell, so the restriction is in place when CUDA is first used.
os.environ.update({"CUDA_VISIBLE_DEVICES": "3"})

In [2]:
import torch
from IPython.display import clear_output

class FedTrainConfig:
    """Notebook-level hyperparameters, kept as plain class attributes.

    Values are read elsewhere in the notebook through the module-level
    ``config`` instance created right after this class.
    """

    # --- Local training ---
    max_steps = 10
    num_rounds = 100
    batch_size = 8
    gradient_accumulation_steps = 1
    seq_length = 512
    learning_rate = 5e-5

    # --- Federated learning ---
    num_clients = 20
    sample_clients = 2
    fed_alg = "fedavg"

    # --- Model / LoRA ---
    model_name = "meta-llama/Llama-2-7b-hf"
    lora_r = 16
    lora_alpha = 2 * lora_r  # usually set to twice lora_r (evaluates to 32)

    # --- Dataset ---
    dataset_name = "vicgalle/alpaca-gpt4"
    dataset_sample = 20000
    template = "alpaca"

    # --- Device ---
    # Evaluated once, when the class body runs.
    device = "cpu" if not torch.cuda.is_available() else "cuda"
    load_in_8bit = True

    # --- Output ---
    output_dir = "./output"


config = FedTrainConfig()

In [None]:
from IPython.display import display
import ipywidgets as widgets

# Interactive controls for the hyperparameters that are tuned by hand.
lr_slider = widgets.FloatLogSlider(
    value=5e-5,
    min=-6, max=-3,  # FloatLogSlider takes min/max as exponents of its base
    step=0.5,
    description='learning rate'
)

batch_selector = widgets.Dropdown(
    options=[4, 8, 16, 32],
    value=8,
    description='batch size'
)

round_slider = widgets.IntSlider(
    value=100,
    min=10, max=500,
    step=10,
    description='num rounds'
)


def _sync_config_from_widgets(_change=None):
    """Copy the current widget values into the shared ``config`` object.

    Used both as a one-off initial sync and as the ``observe`` callback,
    which is why the change payload is accepted and ignored.
    """
    config.learning_rate = lr_slider.value
    config.batch_size = batch_selector.value
    config.num_rounds = round_slider.value


# Reading `.value` only once, right after creating the widgets, would store
# just the defaults: the cell finishes before anyone can touch a slider.
# Observing the `value` trait keeps `config` up to date on every interaction.
for _widget in (lr_slider, batch_selector, round_slider):
    _widget.observe(_sync_config_from_widgets, names='value')

display(lr_slider, batch_selector, round_slider)

# Initial sync so `config` matches the widgets even if they are never touched.
# NOTE: cells that already consumed `config` must be re-run after a change.
_sync_config_from_widgets()

In [None]:
from config import get_config, save_config

# Build the project's argument objects, then overwrite selected script
# arguments with the notebook-level values held in `config`.
script_args, fed_args, peft_config = get_config()

# (attribute on script_args, attribute on config)
_SCRIPT_ARG_OVERRIDES = (
    ("dataset_name", "dataset_name"),
    ("dataset_sample", "dataset_sample"),
    ("model_name_or_path", "model_name"),
    ("output_dir", "output_dir"),
)
for _target_attr, _source_attr in _SCRIPT_ARG_OVERRIDES:
    setattr(script_args, _target_attr, getattr(config, _source_attr))

# NOTE(review): `fed_args` is left exactly as returned by get_config(); the
# federated settings on `config` (num_clients, sample_clients, fed_alg) are
# not copied here - confirm that is intended.

### Load model

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, prepare_model_for_kbit_training, LoraConfig

# Load the base causal LM named in `config`; layer placement is delegated to
# `device_map="auto"`.
print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    load_in_8bit=config.load_in_8bit,
    device_map="auto"
)

# Prepare the quantized model for training (only when 8-bit loading is on).
if config.load_in_8bit:
    model = prepare_model_for_kbit_training(model)

# LoRA adapter settings. Rank and alpha come from `config` so that editing
# FedTrainConfig actually changes the adapter; they used to be hard-coded
# here as 16 / 32, which silently ignored the config values.
# NOTE: this rebinds `peft_config`, replacing any object of the same name
# created by an earlier cell.
peft_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Reports the device recorded in `config`, not a per-layer placement.
print(f"Model is loaded to {config.device}")

[2025-03-17 00:33:07,913] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Loading model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.12433454005023165
Model is loaded to cuda


In [33]:
import torchinfo
from torchinfo import summary

# Tokenize a short prompt to use as concrete input for the summary pass.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
sample_text = "Hello, how are you?"
inputs = tokenizer(sample_text, return_tensors="pt").to(config.device)

# Keyword arguments for torchinfo; raise `depth` to see more nested layers.
# Optional extras that were tried and left off: col_names, device, verbose.
_summary_kwargs = {
    "input_data": inputs["input_ids"],
    "depth": 6,
    "dtypes": [torch.bfloat16],
}

# Last expression of the cell, so the summary is rendered as the cell output.
summary(model, **_summary_kwargs)




Layer (type:depth-idx)                                            Output Shape              Param #
PeftModelForCausalLM                                              [1, 32, 7, 128]           --
├─LoraModel: 1-1                                                  [1, 32, 7, 128]           --
│    └─LlamaForCausalLM: 2-1                                      --                        --
│    │    └─LlamaModel: 3-1                                       [1, 32, 7, 128]           --
│    │    │    └─Embedding: 4-1                                   [1, 7, 4096]              (131,072,000)
│    │    │    └─ModuleList: 4-2                                  --                        --
│    │    │    │    └─LlamaDecoderLayer: 5-1                      [1, 7, 4096]              --
│    │    │    │    │    └─LlamaRMSNorm: 6-1                      [1, 7, 4096]              (4,096)
│    │    │    │    │    └─LlamaAttention: 6-2                    [1, 7, 4096]              67,371,008
│    │    │    │    │

In [12]:
# Convert the footprint reported by the model from bytes to GiB (1024**3 bytes).
_BYTES_PER_GIB = 1024 ** 3
gb_footprint = model.get_memory_footprint() / _BYTES_PER_GIB
print(f"Model Memory Footprint: {gb_footprint:.2f} GB")

Model Memory Footprint: 7.10 GB


### Data preparation

In [None]:
from utils import get_dataset, process_sft_dataset
from federated_learning import split_dataset

# Fetch the dataset named in script_args and run the SFT preprocessing on it,
# passing the sample size from `config`.
print("Loading dataset...")
dataset = process_sft_dataset(
    script_args.dataset_name,
    get_dataset(script_args.dataset_name),
    config.dataset_sample,
)

# Partition the processed dataset across clients and record each client's size.
print("Spliting client data...")
local_datasets = split_dataset(fed_args, script_args, dataset)
sample_num_list = []
for _client_idx in range(fed_args.num_clients):
    sample_num_list.append(len(local_datasets[_client_idx]))

# Wipe the progress messages above and leave a single status line.
clear_output()
print("Dataset loaded and split successfully!")