In [1]:
import sys

In [2]:
#!{sys.executable} -m pip install --upgrade pip

In [3]:
#!{sys.executable} -m pip install --user --upgrade kfp==2.0.0b13

In [4]:
# Install ipywidgets and restart the kernel so that the JavaScript widget for the Hugging Face download progress is shown.
#!{sys.executable} -m pip install --user --upgrade ipywidgets==8.1.0

In [5]:
!cat ./requirements.txt

huggingface_hub==0.16.4
transformers==4.32.1
torch==2.0.1
accelerate==0.22.0
# huggingface_hub use_auth_token need this.
urllib3==2.0.4 
jsonschema==4.19.0
fastai==2.7.12
torchaudio==2.0.2
torchvision==0.15.2
# for showing download widget in jupyter notebook
ipywidgets==8.1.0
# for python script input arg generation
click==8.1.7
# argparse==1.4.0
#
# monitor nvidia gpu usage
# have no permission to access
# gpustat==1.1.1
# nvidia-smi --query-compute-apps=pid,process_name,used_memory --format=csv
# from a kubeflow notebook
#
# kfp==1.8.22
# 
# python method overload
multipledispatch==1.0.0

In [6]:
!{sys.executable} -m pip install --user --upgrade -r ./requirements.txt



## (optional) restart kernel

### (optional) Add the Hugging Face CLI to PATH in a terminal

```shell
PATH=${PATH}:/home/jupyter/.local/bin
```

In [7]:
# (optional) uncomment the following lines to set path in python notebook cell for notebook session 
# PATH=%env PATH
# %env PATH={PATH}:/home/jupyter/.local/bin

## Introduction

Multi GPU inference: https://github.com/tloen/alpaca-lora/issues/445

Show accelerator device IDs:

```shell
nvidia-smi -L
```

Nvidia usage
```shell
nvidia-smi -q -g 0 -d UTILIZATION -l
```

python lib: gpustat
```shell
gpustat -cp
```

* https://stackoverflow.com/questions/8223811/a-top-like-utility-for-monitoring-cuda-activity-on-a-gpu

Check GPU info in PyTorch
* https://stackoverflow.com/questions/48152674/how-do-i-check-if-pytorch-is-using-the-gpu
* CUDA memory management https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management

### Extract the GPU Accelerator MIG UUIDs

* Extract with re.search and group: https://note.nkmk.me/en/python-str-extract/
* Extract with pattern before and after: https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers

In [8]:
list=!nvidia-smi -L
for i in range(len(list)):
    print(list[i])

GPU 0: NVIDIA A100 80GB PCIe (UUID: GPU-51f84540-9ebb-1d44-7bb7-3c62ae55c20e)
  MIG 2g.20gb     Device  0: (UUID: MIG-0efc9f06-6dca-5886-98af-0273ca7fde51)


In [9]:
import re

def get_device_uuid(input: str) -> str:
    """Extract the device UUID from one line of device-listing text.

    The line is searched for the first occurrence of ``UUID: <value>)`` and
    the text between ``"UUID: "`` and the next closing parenthesis is returned.

    Args:
        input: A single text line, e.g. ``"... (UUID: MIG-1234)"``.

    Returns:
        The captured UUID text, or an empty string when the line contains no
        ``UUID: ...)`` marker.
    """
    # Raw string so the backslashes reach the regex engine untouched; the
    # non-greedy group stops at the first ")" after the UUID.
    match = re.search(r'UUID\:\s(.+?)\)', input)
    if match is None:
        # Marker not present in this line.
        return ""
    return match.group(1)

# Skip the first line (the physical GPU) and keep only MIG device UUIDs.
# Lines without a UUID yield "" from get_device_uuid, and on hosts with more
# than one GPU the remaining lines also contain "GPU-..." entries; both are
# filtered out so the comma-separated result holds MIG UUIDs only.
uuid_list = [
    uuid
    for uuid in (get_device_uuid(line) for line in list[1:])
    if uuid.startswith("MIG-")
]
# Comma-separated form, used later as the CUDA_VISIBLE_DEVICES value.
UUIDs = ",".join(uuid_list)
print(UUIDs)

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51


### PyTorch distributed with device UUID
* https://discuss.pytorch.org/t/world-size-and-rank-torch-distributed-init-process-group/57438

In [10]:
import os, time, sys
from platform import python_version
# Configure the process environment in one step:
#   WORLD_SIZE               - set to "1"
#   CUDA_VISIBLE_DEVICES     - the device UUIDs collected above (an index list
#                              such as "0,1,2" is the alternative form)
#   PYTORCH_CUDA_ALLOC_CONF  - allocator option max_split_size_mb set to 512
os.environ.update(
    {
        "WORLD_SIZE": "1",
        "CUDA_VISIBLE_DEVICES": UUIDs,
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
    }
)

# Echo the effective device selection and the interpreter version.
print(os.environ["CUDA_VISIBLE_DEVICES"])
print(python_version())

MIG-0efc9f06-6dca-5886-98af-0273ca7fde51
3.8.10


In [11]:
import torch
print(torch.cuda.is_available())
# Query the memory info once so the raw tuple and the GiB values printed
# below come from the same snapshot.
mem_info = torch.cuda.mem_get_info()
print(mem_info)
for num_bytes in mem_info:
    # bytes -> GiB
    print(num_bytes/1024**3)

True
(20748107776, 20937965568)
19.32318115234375
19.5


In [12]:
# https://stackoverflow.com/questions/58216000/get-total-amount-of-free-gpu-memory-and-available-using-pytorch
# torch.cuda.device_count()
# t = torch.cuda.get_device_properties(0).total_memory
# r = torch.cuda.memory_reserved(0)
# a = torch.cuda.memory_allocated(0)
# f = r-a  # free inside reserved
# print(t/1024**3)
# print(r/1024**3)
# print(a/1024**3)
# print(f/1024**3)

In [13]:
# Reference: https://stackoverflow.com/questions/58216000/get-total-amount-of-free-gpu-memory-and-available-using-pytorch
# from typing import Tuple

def byte_gb_info(byte_mem) -> str:
    """Render a byte count as a human-readable ``"<value> GB"`` string.

    The value is ``byte_mem`` divided by 1024**3. It is formatted with the
    ``4f`` spec, i.e. a minimum field width of 4 and the default six decimal
    places (for example ``"19.500000 GB"``).

    Args:
        byte_mem: Size in bytes.

    Returns:
        The formatted size followed by the ``" GB"`` suffix.
    """
    size_in_gb = byte_mem / 1024 ** 3
    return f"{size_in_gb:4f} GB"


def accelerator_mem_info(device_idx: int):
    """Print a four-line memory summary for one CUDA device.

    Reported values (each formatted by ``byte_gb_info``):
      * Physical  - ``total_memory`` from the device properties
      * Reserved  - ``torch.cuda.memory_reserved`` for the device
      * Allocated - ``torch.cuda.memory_allocated`` for the device
      * Free      - reserved minus allocated, i.e. the unallocated part of
                    the reserved memory (not the free memory of the device)

    Args:
        device_idx: Index of the CUDA device to inspect.
    """
    total_bytes = torch.cuda.get_device_properties(device_idx).total_memory
    reserved_bytes = torch.cuda.memory_reserved(device_idx)
    allocated_bytes = torch.cuda.memory_allocated(device_idx)
    free_in_reserved = reserved_bytes - allocated_bytes

    report = (
        f"Physical  memory : {byte_gb_info(total_bytes)}\n"
        f"Reserved  memory : {byte_gb_info(reserved_bytes)}\n"
        f"Allocated memory : {byte_gb_info(allocated_bytes)}\n"
        f"Free      memory : {byte_gb_info(free_in_reserved)}"
    )
    print(report)

    
def accelerator_compute_info(device_idx: int):
    """Print the name and multiprocessor count of one CUDA device.

    Args:
        device_idx: Index of the CUDA device to inspect.
    """
    properties = torch.cuda.get_device_properties(device_idx)
    device_name = properties.name
    processor_count = properties.multi_processor_count
    summary = (
        f"Device_name      : {device_name} \n"
        f"Multi_processor  : {processor_count}"
    )
    print(summary)

    
def gpu_usage():
    """Print compute and memory summaries for every CUDA device torch reports.

    Prints the device count first, then for each device index a block
    delimited by dashed lines containing the output of
    ``accelerator_compute_info`` and ``accelerator_mem_info``.
    """
    num_of_gpus = torch.cuda.device_count()
    # Original author's note: this shows only the gpu device, not the MIG.
    print(f"num_of_gpus: {num_of_gpus}")

    divider = "-" * 20
    for device_idx in range(num_of_gpus):
        print(divider)
        accelerator_compute_info(device_idx)
        accelerator_mem_info(device_idx)
        print(divider)
    # torch.cuda.mem_get_info() returns a (free, total) pair in bytes; it could
    # be used here to derive a per-device "max_memory" budget if one is needed.

    
gpu_usage()

num_of_gpus: 1
--------------------
Device_name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Multi_processor  : 28
Physical  memory : 19.500000 GB
Reserved  memory : 0.000000 GB
Allocated memory : 0.000000 GB
Free      memory : 0.000000 GB
--------------------


In [14]:
# Root directory for model data; "/data" is the alternative location.
DATA_ROOT = "/home/jovyan/llm-models"

# Point the cache home at a directory under DATA_ROOT so model downloads
# are cached there.
model_cache_dir = f"{DATA_ROOT}/core-kind/yinwang/models"
os.environ["XDG_CACHE_HOME"] = model_cache_dir

# Model size label -> Hugging Face repository id.
# NOTE(review): the 70B entry is the only one without the "-chat" suffix;
# confirm that this is intended.
model_map = dict(
    [
        ("7B", "meta-llama/Llama-2-7b-chat-hf"),
        ("13B", "meta-llama/Llama-2-13b-chat-hf"),
        ("70B", "meta-llama/Llama-2-70b-hf"),
    ]
)

import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

In [15]:
"""
Load the token
"""
# Location of the Hugging Face access token file under DATA_ROOT.
token_file_path = f"{DATA_ROOT}/core-kind/yinwang/.cache/huggingface/token"
# Read the token inside a context manager so the file handle is always
# closed, even if reading fails.
with open(token_file_path, "r") as file:
    # The file content may carry a newline; strip every "\n" so the token
    # can be passed on as-is.
    token = file.read().replace('\n', '')

# print the raw string to see if there is new line in the token
# print(r'{}'.format(token))

In [16]:
# model_type = "13B"
model_type = "7B"
# Resolve the size label to a repository id. The fallback must be a model
# name, not the key "7B" itself, so an unknown label maps to the 7B model.
model_name = model_map.get(model_type, model_map["7B"])

print(model_name)

meta-llama/Llama-2-7b-chat-hf


In [17]:
# Load the tokenizer for the selected model, authenticating with the token
# read from the token file. The commented line is the older call form that
# passes `use_auth_token` instead of `token`.
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

In [18]:
# type(tokenizer)

In [19]:
%time
# in Transformer 4.32.1 need to use "token" parameter
# in Transformer 4.30.x need to use "use_auth_token" parameter
generator = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    token=token,
    # use_auth_token=token,
)

CPU times: user 3 µs, sys: 4 µs, total: 7 µs
Wall time: 13.1 µs


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [20]:
# type(generator)

In [21]:
# check the available GPU memory after loading the LLM
gpu_usage()

num_of_gpus: 1
--------------------
Device_name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Multi_processor  : 28
Physical  memory : 19.500000 GB
Reserved  memory : 12.615234 GB
Allocated memory : 12.613792 GB
Free      memory : 0.001442 GB
--------------------


In [22]:
def chat_gen(generator: transformers.pipelines.text_generation.TextGenerationPipeline, tokenizer: transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast):
    """Build a prompt-runner bound to a text-generation pipeline and tokenizer.

    Args:
        generator: Pipeline called with the prompt and the sampling options.
        tokenizer: Supplies ``eos_token_id`` for each generation call.

    Returns:
        A function taking one prompt string. It prints each generated text,
        the elapsed wall time in seconds, and the ``gpu_usage()`` report, and
        returns ``None``.
    """
    def local(input: str) -> None:
        """Generate text for ``input`` and print the result with timing and GPU usage."""
        started_at = time.time()
        # Sampling options: top-k sampling over 10 candidates, one returned
        # sequence, and max_length capped at 200.
        sampling_options = dict(
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            max_length=200,
        )
        for result in generator(input, **sampling_options):
            print(f"Result: \n{result['generated_text']}")

        # The measured span covers generation and printing of the results.
        elapsed = time.time() - started_at
        print("-"*20)
        print(f"walltime: {elapsed} in secs.")
        gpu_usage()

    return local
    
chat = chat_gen(generator, tokenizer)

In [23]:
# chain of thoughts prompting
input='Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n'
print(input)

Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?



In [24]:
chat(input)

Result: 
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: The cafeteria started with 23 apples. They used 20 to make lunch, leaving 3 apples. They bought 6 more, so now they have 3 + 6 = 9 apples. The answer is 9.
Q: A pizza parlor has 12 large pizzas to deliver. If they
--------------------
walltime: 3.8247992992401123 in secs.
num_of_gpus: 1
--------------------
Device_name      : NVIDIA A100 80GB PCIe MIG 2g.20gb 
Multi_processor  : 28
Physical  memory : 19.500000 GB
Reserved  memory : 12.974609 GB
Allocated memory : 12.621727 GB
Free      memory : 0.352882 GB
--------------------


In [25]:
from pdf_text_loader import PDFHelper

loader = PDFHelper(data_folder = "./data/medreports", file_pattern="KK-SCIVIAS-*.pdf")

In [34]:
context = loader.read_pdf(1)
input = f"Context: Patient: Fried\nFrage: Welcher name hat der Patient?\nAntwort: Name ist Fried\nContext: {context}\nFrage: Welcher name hat die Patientin?\nAntwort: die Patientin hat name "

#print(input)

In [36]:
#chat(input)