# Lab 10.2 Accelerating Supervised Fine Tuning with Deepspeed

## (Part II Play with the device_map)

Here we first load the original Llama-2 model with an automatically configured device map.

In [1]:
################################################################################
# Shared parameters between inference and SFT training
################################################################################
import gc

import torch

# The base model: filesystem path of the Llama-2 7B chat checkpoint that every
# from_pretrained() call in this notebook loads.
# NOTE(review): the "-hf" suffix suggests a Hugging Face-format checkpoint, and
# the path is specific to the lab host — adjust it for your own environment.
model_name = "/share/model/llama-2-7b-chat-hf"


In [2]:
################################################################################
# bitsandbytes parameters
################################################################################
from transformers import BitsAndBytesConfig

# Quantization settings handed to from_pretrained() through `quantization_config`.
bnb_config = BitsAndBytesConfig(
    # Load the base model in 4-bit precision.
    load_in_4bit=True,
    # Quantization type: "fp4" or "nf4".
    bnb_4bit_quant_type="nf4",
    # Compute dtype for the 4-bit base model ("float16" or torch.bfloat16).
    bnb_4bit_compute_dtype=torch.bfloat16,
    # Nested (double) quantization is left switched off.
    bnb_4bit_use_double_quant=False,
)

We can print the device map of the model after loading it.

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)

First, let us load the model with `device_map` set to `"auto"`.

In [4]:
# "auto" asks the loader to work out the placement of the model by itself.
device_map = "auto"

# Load base model with bnb config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Show which device every sub-module ended up on.
print(model.hf_device_map)

# Release GPU memory.
# empty_cache() can only hand back cached blocks that no live tensor occupies,
# so the model reference has to be dropped (and collected) first; otherwise the
# weights stay resident while the next cell loads another copy.
del model
gc.collect()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()


{'model.embed_tokens': 0, 'model.layers.0': 0, 'model.layers.1': 0, 'model.layers.2': 0, 'model.layers.3': 0, 'model.layers.4': 0, 'model.layers.5': 0, 'model.layers.6': 0, 'model.layers.7': 0, 'model.layers.8': 0, 'model.layers.9': 0, 'model.layers.10': 0, 'model.layers.11': 0, 'model.layers.12': 0, 'model.layers.13': 1, 'model.layers.14': 1, 'model.layers.15': 1, 'model.layers.16': 1, 'model.layers.17': 1, 'model.layers.18': 1, 'model.layers.19': 1, 'model.layers.20': 1, 'model.layers.21': 1, 'model.layers.22': 1, 'model.layers.23': 1, 'model.layers.24': 1, 'model.layers.25': 1, 'model.layers.26': 1, 'model.layers.27': 1, 'model.layers.28': 1, 'model.layers.29': 1, 'model.layers.30': 1, 'model.layers.31': 1, 'model.norm': 1, 'lm_head': 1}


We can see that the model is split into two pieces, one on each GPU.

Note that with `device_map = "auto"` we train the model in a pipelined (but not parallel) manner within a single process. This means the two GPUs on our host execute their parts of the computation one after the other, so training is slow.

Instead, we now force the entire model onto a single GPU.

In [5]:
# The empty-string key maps the whole model at once, here to GPU 1.
device_map = {'': 1}

# Load base model with bnb config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Show which device every sub-module ended up on.
print(model.hf_device_map)

# Release GPU memory.
# empty_cache() can only hand back cached blocks that no live tensor occupies,
# so the model reference has to be dropped (and collected) first; otherwise the
# weights stay resident while the next cell loads another copy.
del model
gc.collect()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

{'': 1}


We can also configure the device map according to rules instead of writing the dictionary by hand, which is cumbersome. For example, to place the model only on even-numbered GPUs, we can restrict the memory available on each GPU via `max_memory`:

In [6]:
GPU_COUNT = torch.cuda.device_count()
device_map = "auto"

# Per-GPU memory budget: even-indexed GPUs may use up to 24GB, odd-indexed GPUs
# get a budget of 0 so that nothing is placed on them.
max_memory = {
    gpu_index: "24GB" if gpu_index % 2 == 0 else 0
    for gpu_index in range(GPU_COUNT)
}

# Load base model with bnb config
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    max_memory=max_memory,
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Show which device every sub-module ended up on.
print(model.hf_device_map)

# Release GPU memory.
# empty_cache() can only hand back cached blocks that no live tensor occupies,
# so the model reference has to be dropped (and collected) first; otherwise the
# weights stay resident on the GPU.
del model
gc.collect()
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

{'': 0}


Finally, we should learn to configure the device map for distributed parallel training. 

In distributed training, each process drives a single GPU and obtains its unique `rank` and `local rank` from the OS environment. Therefore, to place the model on the correct device in each process, we should configure the device map as follows:

In [7]:
import os

# Local rank of this process as exported in the LOCAL_RANK environment variable;
# stays at -1 when the variable is absent (e.g. when run outside a distributed
# launcher).
local_rank = int(os.environ.get("LOCAL_RANK", -1))

# Put the whole model on the GPU matching the local rank. A negative index is
# not a valid GPU, so fall back to GPU 0 when LOCAL_RANK is not set.
device_map = {'': local_rank if local_rank >= 0 else 0}