In [6]:
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Define your model.
class YourModel(nn.Module):
    """Template model: fill in the layers and the forward computation.

    As written the class registers no sub-modules and ``forward`` returns
    ``None``; both method bodies are placeholders.
    """

    def __init__(self):
        super().__init__()
        # Layer definitions go here.
        ...

    def forward(self, x):
        """Forward-pass placeholder for input ``x``; currently returns ``None``."""
        # Forward-pass logic goes here.
        ...

# Instantiate the model.
model = YourModel()

# Resolve the compute device once so the cell also runs on a CPU-only host
# instead of crashing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# When more than one GPU is visible, wrap the model in DataParallel.
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

# Move the model to the selected device.
model.to(device)

# Build the data loader.
# NOTE(review): `your_dataset` and `your_batch_size` are placeholders that must
# be defined before this cell runs.
data_loader = DataLoader(your_dataset, batch_size=your_batch_size, shuffle=True)

# Training loop.
for data in data_loader:
    inputs, labels = data
    # Move the batch to the same device as the model.
    inputs, labels = inputs.to(device), labels.to(device)
    outputs = model(inputs)  # forward pass
    # NOTE(review): `loss_function` is a placeholder that must be defined first.
    loss = loss_function(outputs, labels)  # compute the loss
    # Backward pass and optimizer step go here.
    ...


In [3]:
!ls

'How to use Datasets.ipynb'   如何使用分布式训练模型.ipynb
 aa.py			      如何使用单机多卡训练模型.ipynb
 bb.py			      如何微调一个模型.ipynb
 data			      如何训练一个Lora模型.ipynb
 train_man.py		      如何训练一个模型.ipynb
 大模型的使用.ipynb


In [4]:
!ls

'How to use Datasets.ipynb'   如何使用分布式训练模型.ipynb
 aa.py			      如何使用单机多卡训练模型.ipynb
 bb.py			      如何微调一个模型.ipynb
 data			      如何训练一个Lora模型.ipynb
 train_man.py		      如何训练一个模型.ipynb
 大模型的使用.ipynb


In [7]:

# Initialise the distributed environment.
def init_distributed(rank, world_size):
    """Create the default process group for this process.

    Args:
        rank: Rank of the calling process, forwarded to ``init_process_group``.
        world_size: Total number of processes, forwarded to ``init_process_group``.
    """
    group_options = {
        'backend': 'nccl',         # recommended backend when training on GPUs
        'init_method': 'env://',   # initialise from environment variables
        'world_size': world_size,
        'rank': rank,
    }
    dist.init_process_group(**group_options)

In [8]:
# Build a DDP-wrapped model.
def create_ddp_model(model, rank):
    """Move ``model`` to device ``rank`` and wrap it in DistributedDataParallel.

    Args:
        model: Module to wrap; it is moved with ``model.to(rank)``.
        rank: Used both as the target device and as the single entry of
            ``device_ids``.

    Returns:
        The DDP wrapper around the moved model.
    """
    on_device = model.to(rank)
    return DDP(on_device, device_ids=[rank])


In [None]:
import os

# Placeholders: replace with a real model and data loader before running.
model = ...  # your model definition
train_loader = ...  # your data loader

# Set up the distributed environment.
# Read the process identity from the launcher's environment (torchrun exports
# RANK / WORLD_SIZE / LOCAL_RANK) so each process gets its own rank instead of
# every process claiming rank 0. The fallbacks keep the original hard-coded
# values when no launcher is involved.
rank = int(os.environ.get("RANK", 0))  # rank of the current process
world_size = int(os.environ.get("WORLD_SIZE", 4))  # total number of processes
# Device index on this machine; differs from the global rank on multi-node jobs.
local_rank = int(os.environ.get("LOCAL_RANK", rank))
init_distributed(rank, world_size)

# Build the DDP model on this process's local device.
ddp_model = create_ddp_model(model, local_rank)

# Training loop.
# NOTE(review): `num_epochs` is not defined in this cell; it must exist before
# the loop runs.
for epoch in range(num_epochs):
    for data, target in train_loader:
        # Forward and backward passes go here.
        ...


In [None]:

# 多机多卡训练
分布式训练
torchrun

In [None]:
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler

# 初始化分布式环境
def setup(rank, world_size):
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

# A minimal synthetic dataset.
class SimpleDataset(Dataset):
    """Toy dataset of 100 samples where input and target both equal the index.

    Each item is a pair of shape-``(1,)`` float32 tensors. Floating-point
    values are required because the samples are fed to ``torch.nn.Linear``,
    whose parameters are float32; integer tensors would fail with a dtype
    mismatch in the forward pass.
    """

    def __getitem__(self, index):
        """Return ``(input, target)`` for ``index`` as float32 tensors."""
        value = torch.tensor([index], dtype=torch.float32)
        # Separate tensors so callers can modify one without affecting the other.
        return value, value.clone()

    def __len__(self):
        """Return the fixed dataset size."""
        return 100

# The model.
class SimpleModel(torch.nn.Module):
    """Single ``Linear(1, 1)`` layer exposed as ``self.linear``."""

    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        projected = self.linear(x)
        return projected

def main(rank, world_size):
    """Run the toy DDP training job for one process.

    Args:
        rank: Rank of this process; also used as its CUDA device index, so
            this entry point assumes one process per GPU on a single machine.
        world_size: Total number of participating processes.
    """
    setup(rank, world_size)
    # Bind this process to its own GPU so CUDA/NCCL work issued without an
    # explicit device does not land on device 0.
    torch.cuda.set_device(rank)

    try:
        # Data loader: the sampler gives each rank its own slice of the dataset.
        dataset = SimpleDataset()
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
        dataloader = DataLoader(dataset, batch_size=10, sampler=sampler)

        # Model, wrapped in DDP.
        model = SimpleModel().to(rank)
        ddp_model = DDP(model, device_ids=[rank])

        # Build the optimizer once; recreating it for every batch is wasted
        # work and would discard optimizer state for any stateful optimizer.
        optimizer = torch.optim.SGD(ddp_model.parameters(), lr=0.01)

        # Training loop.
        for epoch in range(2):
            # Re-seed the sampler's shuffle; without this every epoch iterates
            # in the same order.
            sampler.set_epoch(epoch)
            for data, target in dataloader:
                # Cast to float32 so the batch matches the Linear layer's
                # parameter dtype regardless of what the dataset yields.
                inputs = data.to(rank, dtype=torch.float32)
                targets = target.to(rank, dtype=torch.float32)

                optimizer.zero_grad()
                outputs = ddp_model(inputs)
                loss = torch.nn.functional.mse_loss(outputs, targets)
                loss.backward()
                optimizer.step()
    finally:
        # Release the process group even if training raised.
        dist.destroy_process_group()

if __name__ == "__main__":
    import os

    import torch.multiprocessing as mp

    world_size = 2  # assumes two GPUs are available

    # The default env:// rendezvous needs these; keep any values already set.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")

    # Start one process per rank. Calling main() for each rank sequentially in
    # a single process blocks forever: rank 0 waits inside init_process_group
    # for a rank 1 that has not been started yet.
    # mp.spawn invokes main(i, world_size) for i in range(world_size).
    # NOTE(review): spawned workers must be able to import `main`; run this as
    # a script rather than from a notebook cell — confirm in your environment.
    mp.spawn(main, args=(world_size,), nprocs=world_size, join=True)


In [None]:
# Distributed training: single-node, 8-process LoRA fine-tuning via torchrun.

# NCCL settings. NOTE(review): interface name and InfiniBand values look
# site-specific — confirm for your cluster.
export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=eth1
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_NET_GDR_READ=1

# Rendezvous endpoint: CHIEF_IP if set, otherwise localhost; port defaults to 29500.
export MASTER_ADDR="${CHIEF_IP:=localhost}"
export MASTER_PORT="${MASTER_PORT:=29500}"

# Must be filled in: as written this assigns an empty string.
path= #path to the project
train_path=$path/train/run_clm_lora.py

model_path=$path/model/llama2-7B-HF
model_save=$path/checkpoint/chinese-llama2-7b-4096-enzh/

# --node_rank falls back to 0 when INDEX is unset (the only valid rank with
# --nnodes 1); an empty unquoted $INDEX would make --node_rank swallow the
# next flag. Expansions are quoted so paths containing spaces stay intact.
torchrun --nnodes 1 --node_rank "${INDEX:-0}" --nproc_per_node 8 \
  --master_addr "$MASTER_ADDR" --master_port "$MASTER_PORT" \
  "${train_path}" \
  --deepspeed "$path/train/deepspeed_config_bf16.json" \
  --model_name_or_path "${model_path}" \
  --train_file "$path/data/instruction/all_instruction_hf.json" \
  --validation_file "$path/data/instruction/all_instruction_hf_dev.json" \
  --preprocessing_num_workers 32 \
  --dataloader_num_workers 16 \
  --dataloader_pin_memory True \
  --per_device_train_batch_size 2 \
  --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 8 \
  --num_train_epochs 3 \
  --save_strategy "steps" \
  --save_steps 500 \
  --save_total_limit 1 \
  --learning_rate 2e-5 \
  --weight_decay 0. \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 10 \
  --block_size 4096 \
  --use_lora True \
  --lora_config "$path/train/lora_config.json" \
  --do_train \
  --bf16 True \
  --bf16_full_eval True \
  --evaluation_strategy "no" \
  --validation_split_percentage 0 \
  --streaming \
  --ddp_timeout 72000 \
  --seed 1 \
  --overwrite_output_dir \
  --gradient_checkpointing True \
  --output_dir "${model_save}"



pip install flash-attn==1.0.4

export NCCL_DEBUG=INFO
export NCCL_SOCKET_IFNAME=eth1
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_NET_GDR_READ=1

export MASTER_ADDR="${CHIEF_IP:=localhost}"
export MASTER_PORT="${MASTER_PORT:=29500}"

export HF_HOME=
export TRANSFORMERS_CACHE=
path= # path to llama2-chinese
train_path=$path/train/run_clm_llms_mem.py
model_path=$path/model/llama2-7B-HF # place original model here
model_save=$path/checkpoint/llama2-7b-llama2_coig_dt_ca-all/

# MASTER_ADDR set to localhost
HOST_NUM=2
torchrun --nnodes $HOST_NUM --node_rank $INDEX --nproc_per_node 8 \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT  \
    ${train_path} \
    --deepspeed $path/train/deepspeed_config_bf16.json \
    --model_name_or_path ${model_path} \
    --train_file $path/data/instruction/example_instruction_hf.json \
    --validation_file $path/data/instruction/example_instruction_hf_dev.json \
    --preprocessing_num_workers 32 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --gradient_accumulation_steps 2 \
    --num_train_epochs 3 \
    --save_strategy "steps" \
    --save_steps 500 \
    --save_total_limit 2 \
    --learning_rate 2e-5 \
    --weight_decay 0. \
    --warmup_ratio 0.03 \
    --lr_scheduler_type "cosine" \
    --logging_steps 10 \
    --block_size 4096 \
    --do_train \
    --bf16 True \
    --bf16_full_eval True \
    --evaluation_strategy "no" \
    --validation_split_percentage 0 \
    --streaming \
    --ddp_timeout 72000 \
    --seed 1 \
    --overwrite_output_dir\
    --gradient_checkpointing True \
    --output_dir ${model_save}\


In [None]:
# Runtime settings exported before the launch.
# NOTE(review): the HCCL_/ATB_/PYTORCH_NPU_ prefixes suggest an Ascend NPU
# stack — confirm each value against the vendor documentation.
export HCCL_OP_BASE_FFTS_MODE_ENABLE=1
export ATB_OPERATION_EXECUTE_ASYNC=1
export TASK_QUEUE_ENABLE=1
export HCCL_BUFFSIZE=110
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=0
export ATB_CONTEXT_WORKSPACE_RING=1
export PYTORCH_NPU_ALLOC_CONF="max_split_size_mb:2048"

# Launch two processes on this machine.
# The bracketed list arguments are quoted: an unquoted [..] is a shell glob
# pattern and would be replaced by a matching filename in the current
# directory (e.g. a file named "1" would rewrite [1]).
torchrun --nproc_per_node 2 --master_port 25641 run_llama_parallel_performance.py \
    --load_path "./llama2-7b-hf_parallel" \
    --device 2 \
    --batch 1 \
    --seqlen_in 128 \
    --seqlen_out 128 \
    --multi_case 0 \
    --model_name "LLAMA2-7B" \
    --multi_batch_size "[1]" \
    --set_case_pair 0 \
    --seqlen_in_range "[5,11]" \
    --seqlen_out_range "[5,11]" \
    --seqlen_in_pair "[256,256,512,1024]" \
    --seqlen_out_pair "[64,256,512,1024]"



In [2]:
!nvidia-smi

Sat Dec 16 02:11:06 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A30          Off  | 00000000:3B:00.0 Off |                    0 |
| N/A   22C    P0    27W / 165W |   3159MiB / 24576MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A30          Off  | 00000000:86:00.0 Off |                    0 |
| N/A   22C    P0    27W / 165W |   7173MiB / 24576MiB |      0%      Default |
|       

In [None]:
torch.nn.DataParallel  
torch.nn.parallel.DistributedDataParallel

In [None]:
1. DistributedDataParallel 更优：效率更高、适应性更好；每个进程拥有独立的 Python 解释器，真正实现分布式训练，适用于单机和多机情况。

In [2]:
!pip310 install torchvision

Looking in indexes: http://192.168.8.125:8100/simple/
Collecting torchvision
  Downloading http://192.168.8.125:8100/simple/torchvision/torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m136.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: torchvision
Successfully installed torchvision-0.16.2
[0m开始同步

同步完毕



In [2]:
!pwd

/home/upgrade/zxfMLtools_tutl
