In [1]:
import sys

In [2]:
#!{sys.executable} -m pip install --upgrade pip

In [3]:
#!{sys.executable} -m pip install --user --upgrade kfp==2.0.0b13

In [4]:
# install the ipywidgets and restart kernel the javascript widget for Huggingface download widget will show.
#!{sys.executable} -m pip install --user --upgrade ipywidgets==8.1.0

In [5]:
#!cat ./requirements.txt

In [6]:
#!{sys.executable} -m pip install --user --upgrade -r ./requirements.txt

## (optional) restart kernel

### (optional) Set huggingface cli in terminal

```shell
PATH=${PATH}:/home/jupyter/.local/bin
```

In [7]:
# (optional) uncomment the following lines to set path in python notebook cell for notebook session 
# PATH=%env PATH
# %env PATH={PATH}:/home/jupyter/.local/bin

## Introduction

Multi GPU inference: https://github.com/tloen/alpaca-lora/issues/445

Show accelerator device IDs:

```shell
nvidia-smi -L
```

Nvidia usage
```shell
nvidia-smi -q -g 0 -d UTILIZATION -l
```

python lib: gpustat
```python
gpustat -cp
```

* https://stackoverflow.com/questions/8223811/a-top-like-utility-for-monitoring-cuda-activity-on-a-gpu

### Extract the GPU Accelerator MIG UUIDs

* Extract with re.search and group: https://note.nkmk.me/en/python-str-extract/
* Extract with pattern before and after: https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers

In [8]:
list=!nvidia-smi -L
for i in range(len(list)):
    print(list[i])

GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-51f84540-9ebb-1d44-7bb7-3c62ae55c20e)
  MIG 2g.20gb     Device  0: (UUID: MIG-0efc9f06-6dca-5886-98af-0273ca7fde51)


In [9]:
import re

def get_device_uuid(input: str) -> str:
    try:
        # r'' before the search pattern indicates it is a raw string, 
        # otherwise "" instead of single quote
        uuid = re.search(r'UUID\:\s(.+?)\)', input).group(1)
    except AttributeError:
        # "UUID\:\s" and "\)" not found
        uuid = ""
    return uuid    

# skip the first GPU ID, only get the MIG IDs, using python list slice over index access
uuid_list = [get_device_uuid(e) for e in list[1:]]
# print(uuid_list)
UUIDs = ",".join(uuid_list)
print(UUIDs)

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51


### PyTorch distributed with device UUID
* https://discuss.pytorch.org/t/world-size-and-rank-torch-distributed-init-process-group/57438

In [10]:
import os, time, sys
from platform import python_version
os.environ["WORLD_SIZE"] = "1" 
os.environ["CUDA_VISIBLE_DEVICES"] = UUIDs # "0,1,2"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512" #512


print(os.environ["CUDA_VISIBLE_DEVICES"])
print(python_version())

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51
3.8.10


In [11]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.mem_get_info())
for e in torch.cuda.mem_get_info():
    print(e/1024**3)

True
(20748107776, 20937965568)
19.32318115234375
19.5


In [12]:
 def gpu_usage():
    num_of_gpus = torch.cuda.device_count();
    # this shows only the gpu device, not the MIG
    print(f"num_of_gpus: {num_of_gpus}")
    # available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
    available_gpus = [torch.cuda.get_device_properties(i).name for i in range(torch.cuda.device_count())]
    print(f"device_mig_info: {available_gpus}")
    # Why is there two cuda mem info ? "avaialbe and total" ?
    max_memory=[f'{int(torch.cuda.mem_get_info()[i]/1024**3)-2}GB' for i in range(len(torch.cuda.mem_get_info()))]
    print(f"max_memory: {max_memory}")

gpu_usage()

num_of_gpus: 1
device_mig_info: ['NVIDIA A100 80GB PCIe MIG 2g.20gb']
max_memory: ['17GB', '17GB']


In [13]:
# set the model download cache directory
# DATA_ROOT="/data"
DATA_ROOT="/home/jovyan/llm-models"
os.environ['XDG_CACHE_HOME']=f"{DATA_ROOT}/core-kind/yinwang/models"

model_map = {
   "7B": "meta-llama/Llama-2-7b-chat-hf",
   "13B" : "meta-llama/Llama-2-13b-chat-hf",
   "70B" : "meta-llama/Llama-2-70b-hf"
}

import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

In [14]:
"""
Load the token
"""
token_file_path = f"{DATA_ROOT}/core-kind/yinwang/.cache/huggingface/token"
file = open(token_file_path, "r")
# file read add a new line to the token, remove it.
token = file.read().replace('\n', '')

# print the raw string to see if there is new line in the token
# print(r'{}'.format(token))

In [15]:
model_type = "13B"
model_name = model_map.get(model_type, "7B")

print(model_name)

meta-llama/Llama-2-13b-chat-hf


In [16]:
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

In [17]:
# type(tokenizer)

In [18]:
# in Transformer 4.32.1 need to use "token" parameter
# in Transformer 4.30.x need to use "use_auth_token" parameter
generator = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=token,
    # use_auth_token=token,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [19]:
# type(generator)

In [20]:
# check the available GPU memory after loading the LLM
gpu_usage()

num_of_gpus: 1
device_mig_info: ['NVIDIA A100 80GB PCIe MIG 2g.20gb']
max_memory: ['0GB', '17GB']


In [21]:
def chat_gen(generator: transformers.pipelines.text_generation.TextGenerationPipeline, tokenizer: transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast):    
    def local(input: str) -> None:
        start = time.time()
        sequences = generator(
            input,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=200,
        )
        for seq in sequences:
            print(f"Result: {seq['generated_text']}")

        end = time.time()
        duration = end - start
        print("-"*20)
        print(f"walltime: {duration} in secs.")
        gpu_usage()
        
    return local
    
chat = chat_gen(generator, tokenizer)

In [22]:
input='Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n'
print(input)

Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?



In [23]:
chat(input)

Result: Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: They started with 23 apples. 20 apples were used for lunch. That leaves 23 - 20 = 3 apples left. Then they bought 6 more apples. 3 + 6 = 9. The answer is 9.
Q: If a bookshelf has 12 shelves, and
--------------------
walltime: 59.21881556510925 in secs.
num_of_gpus: 1
device_mig_info: ['NVIDIA A100 80GB PCIe MIG 2g.20gb']
max_memory: ['0GB', '17GB']
