In [1]:
import sys

In [2]:
#!cat ./requirements.txt

In [3]:
#!{sys.executable} -m pip install --upgrade pip

In [5]:
!{sys.executable} -m pip install --user --upgrade -r ./requirements.txt --extra-index-url https://download.pytorch.org/whl/cu118

[0mLooking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu118
[0m

In [6]:
# !{sys.executable} -m pip list

#### Useful installation for KF notebook 1.7.0 cu111 drivers

```shell
#!{sys.executable} -m pip install --user --upgrade transformers==4.31.0
#!{sys.executable} -m pip install --user --upgrade torch==1.10.2+cu111 fastai==2.7.12 fastcore==1.5.29 fastdownload==0.0.7 torchvision==0.11.3+cu111 --extra-index-url https://download.pytorch.org/whl/cu111
#!{sys.executable} -m pip install --user --upgrade accelerate==0.20.3
```

We shall use the cuda 11.8 version (Cuda118)
```shell
#!{sys.executable} -m pip install --user --upgrade torch==2.0.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
```
`xformers==0.0.21` needs `torch==2.0.1`:
```shell
#!{sys.executable} -m pip install --user --upgrade xformers==0.0.21 torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
```

show js loading with ipywidgets
```shell
#!{sys.executable} -m pip install --user --upgrade ipywidgets==8.1.0 comm==0.1.4 jupyterlab-widgets==3.0.8 widgetsnbextension==4.0.8
```

uninstall
```shell
#!{sys.executable} -m pip uninstall accelerate transformers xformers torch -y
```

## (optional) restart kernel

### (optional) Set huggingface cli in terminal

```shell
PATH=${PATH}:/home/jupyter/.local/bin
```

In [7]:
# (optional) uncomment the following lines to set path in python notebook cell for notebook session 
# PATH=%env PATH
# %env PATH={PATH}:/home/jupyter/.local/bin

## Introduction

Multi GPU inference: https://github.com/tloen/alpaca-lora/issues/445

Show accelerator device IDs:

```shell
nvidia-smi -L
```

Nvidia usage
```shell
nvidia-smi -q -g 0 -d UTILIZATION -l
```

Python library with a command-line tool: gpustat
```shell
gpustat -cp
```

* https://stackoverflow.com/questions/8223811/a-top-like-utility-for-monitoring-cuda-activity-on-a-gpu

Check GPU info in PyTorch
* https://stackoverflow.com/questions/48152674/how-do-i-check-if-pytorch-is-using-the-gpu
* CUDA memory management https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management

### Extract the GPU Accelerator MIG UUIDs

* Extract with re.search and group: https://note.nkmk.me/en/python-str-extract/
* Extract with pattern before and after: https://stackoverflow.com/questions/4666973/how-to-extract-the-substring-between-two-markers

In [None]:
list=!nvidia-smi -L
for i in range(len(list)):
    print(list[i])

In [None]:
import re

def get_device_uuid(input: str) -> str:
    """Return the device UUID contained in one line of text.

    The UUID is the shortest run of characters found between ``UUID: `` and
    the next closing parenthesis.

    Args:
        input: a single line of text, e.g. one line of ``nvidia-smi -L`` output.

    Returns:
        The extracted UUID, or an empty string when the line holds no
        ``UUID: ...)`` marker.
    """
    # Raw string literal so the backslashes reach the regex engine untouched.
    found = re.search(r'UUID\:\s(.+?)\)', input)
    return found.group(1) if found is not None else ""

# Skip the first line (the physical GPU) and collect only the remaining
# device UUIDs (the MIG instances), using a list slice instead of index access.
# get_device_uuid returns "" for a line without a UUID; such entries are
# dropped so the joined value never contains empty fields like "MIG-a,,MIG-b".
uuid_list = [uuid for uuid in map(get_device_uuid, list[1:]) if uuid]
# print(uuid_list)
UUIDs = ",".join(uuid_list)
print(UUIDs)

### PyTorch distributed with device UUID
* https://discuss.pytorch.org/t/world-size-and-rank-torch-distributed-init-process-group/57438

In [None]:
import os, time, sys
from platform import python_version
# Root of the data volume mounted in the Kubeflow notebook.
DATA_ROOT = "/home/jovyan/llm-models"

# Export all runtime settings for this notebook session in one step.
os.environ.update(
    {
        "WORLD_SIZE": "1",
        # Comma-separated device UUIDs collected earlier (alternative form: "0,1,2").
        "CUDA_VISIBLE_DEVICES": UUIDs,
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
        # Enabled for debugging only.
        "CUDA_LAUNCH_BLOCKING": "1",
        "TOKENIZERS_PARALLELISM": "false",
        # Cache directory below the data volume.
        "XDG_CACHE_HOME": f"{DATA_ROOT}/core-kind/yinwang/models",
    }
)

print(os.environ["CUDA_VISIBLE_DEVICES"])
print(python_version())

#### CUDA MIG memory notice
The following Python snippet shows the available and the total memory of the MIG device:
```python
print(torch.cuda.mem_get_info())
for e in torch.cuda.mem_get_info():
    print(e/1024**3)
```
The first element of the tuple is the available MIG CUDA memory. If it drops to zero while no process is attached,
this means a CUDA process is hanging.
```console
(20748107776, 20937965568)
19.32318115234375
19.5
```

To terminate a cuda process, log into the GPU host
```shell
nvidia-smi # find out the PID something like 830333
sudo kill -9 PID
```

In [None]:
import torch
print(torch.__version__)
cuda_available = torch.cuda.is_available()
print(cuda_available)
# torch.cuda.mem_get_info() needs a usable CUDA device. Guard the call so the
# cell reports the situation instead of raising on a host without a visible GPU.
if cuda_available:
    # Query once so the tuple and the per-value GiB lines show the same reading.
    mem_info = torch.cuda.mem_get_info()  # (free bytes, total bytes)
    print(mem_info)
    for e in mem_info:
        print(e/1024**3)
else:
    print("CUDA is not available: skipping torch.cuda.mem_get_info()")

In [None]:
# https://stackoverflow.com/questions/58216000/get-total-amount-of-free-gpu-memory-and-available-using-pytorch
# torch.cuda.device_count()
# t = torch.cuda.get_device_properties(0).total_memory
# r = torch.cuda.memory_reserved(0)
# a = torch.cuda.memory_allocated(0)
# f = r-a  # free inside reserved
# print(t/1024**3)
# print(r/1024**3)
# print(a/1024**3)
# print(f/1024**3)

In [None]:
# Reference: https://stackoverflow.com/questions/58216000/get-total-amount-of-free-gpu-memory-and-available-using-pytorch
# from typing import Tuple

def byte_gb_info(byte_mem) -> str:
    """Format a byte count as a human-readable size in GB.

    The conversion divides by 1024**3.

    Args:
        byte_mem: memory size in bytes (int or float).

    Returns:
        The size with two decimal places followed by " GB", e.g. "19.50 GB".
    """
    # ".2f" fixes the precision to two decimal digits
    # (https://zetcode.com/python/fstring/). The former spec "4f" only set a
    # minimum field width of 4, so values came out with six decimals.
    return f"{(byte_mem/1024**3):.2f} GB"


def accelerator_mem_info(device_idx: int):
    """Print the memory figures of one CUDA device, formatted in GB.

    Four lines are printed: the device's total memory, the memory reserved
    and the memory allocated as reported by torch.cuda, and their difference.

    Args:
        device_idx: device index passed on to the torch.cuda queries.
    """
    total_bytes = torch.cuda.get_device_properties(device_idx).total_memory
    reserved_bytes = torch.cuda.memory_reserved(device_idx)
    allocated_bytes = torch.cuda.memory_allocated(device_idx)
    # Printed under the "Free" label: reserved minus allocated, i.e. the part
    # of the reserved memory that is currently not allocated.
    unallocated_bytes = reserved_bytes - allocated_bytes
    report_lines = [
        f"Physical  memory : {byte_gb_info(total_bytes)}",
        f"Reserved  memory : {byte_gb_info(reserved_bytes)}",
        f"Allocated memory : {byte_gb_info(allocated_bytes)}",
        f"Free      memory : {byte_gb_info(unallocated_bytes)}",
    ]
    print("\n".join(report_lines))

    
def accelerator_compute_info(device_idx: int):
    """Print the name and the multiprocessor count of one CUDA device.

    Args:
        device_idx: device index passed on to torch.cuda.get_device_properties.
    """
    props = torch.cuda.get_device_properties(device_idx)
    summary = (
        f"Device_name      : {props.name} \n"
        f"Multi_processor  : {props.multi_processor_count}"
    )
    print(summary)

    
def gpu_usage():
    """Print compute and memory information for every device torch.cuda reports.

    The device count is printed first, followed by one block per device
    framed by two separator lines.
    """
    device_total = torch.cuda.device_count()
    # this shows only the gpu device, not the MIG
    print(f"num_of_gpus: {device_total}")
    separator = "-"*20
    for idx in range(device_total):
        print(separator)
        accelerator_compute_info(idx)
        accelerator_mem_info(idx)
        print(separator)
    # Open idea kept from the original author: derive a per-device max_memory
    # setting from torch.cuda.mem_get_info(), whose two values are the
    # available and the total memory.
    # max_memory=[f'{int(torch.cuda.mem_get_info()[i]/1024**3)-2}GB' for i in range(len(torch.cuda.mem_get_info()))]
    # print(f"max_memory: {max_memory}")

    
gpu_usage()

In [None]:
# set the model download cache directory
# DATA_ROOT="/data"
# DATA_ROOT="/home/jovyan/llm-models"
# os.environ['XDG_CACHE_HOME']=f"{DATA_ROOT}/core-kind/yinwang/models"

# Size label -> Hugging Face repository id of the Llama 2 chat model.
# Alternative 70B entry without the "-chat" suffix: "meta-llama/Llama-2-70b-hf"
model_map = {
    "7B": "meta-llama/Llama-2-7b-chat-hf",
    "13B": "meta-llama/Llama-2-13b-chat-hf",
    "70B": "meta-llama/Llama-2-70b-chat-hf",
}

import transformers
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
print(transformers.__version__)

In [None]:
"""
Load the token
"""
# Path of the Hugging Face access-token file on the data volume.
token_file_path = f"{DATA_ROOT}/core-kind/yinwang/.cache/huggingface/token"
# The context manager closes the file even when reading raises.
with open(token_file_path, "r") as file:
    # The file content ends with a line break; strip() removes it together
    # with any other surrounding whitespace (e.g. a "\r" from CRLF endings).
    token = file.read().strip()

# print the raw string to see if there is new line in the token
# print(r'{}'.format(token))

In [None]:
# model_type = "13B"
model_type = "7B"
# Fall back to the 7B repository id for an unknown size label. The former
# default was the label "7B" itself, which is a key of model_map and not a
# model repository id.
model_name = model_map.get(model_type, model_map["7B"])

print(model_name)

In [None]:
# Load the tokenizer of the selected model repository, authenticating with the
# access token read from the token file.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    #token=token, # parameter name to use with transformers==4.32.1
    use_auth_token=token, # parameter name to use with transformers==4.31.0
)

In [None]:
# type(tokenizer)

In [None]:
%time
# in Transformer 4.32.1 need to use "token" parameter
# in Transformer 4.30.x need to use "use_auth_token" parameter
# with torch.no_grad():
generator = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    #token=token, #transformer==4.32.1
    use_auth_token=token, #transformer==4.31.0
)

In [None]:
# type(generator)

In [None]:
# check the available GPU memory after loading the LLM
gpu_usage()

In [None]:
def chat_gen(
    generator: "transformers.Pipeline",
    tokenizer: "transformers.PreTrainedTokenizerBase",
):
    """Create a chat function bound to a text-generation pipeline and its tokenizer.

    The annotations are strings so that defining this function does not
    evaluate attribute paths inside ``transformers`` and does not tie the
    helper to one specific tokenizer class.

    Args:
        generator: callable text-generation pipeline. It is called with the
            prompt plus sampling keyword arguments and must return an iterable
            of dicts that carry a ``"generated_text"`` entry.
        tokenizer: object whose ``eos_token_id`` is passed to the pipeline.

    Returns:
        The function ``local(input, print_mode=True, max_new_tokens=200)``.
    """
    def local(input: str, print_mode: bool = True, max_new_tokens: int = 200) -> list:
        """Generate an answer for ``input`` and return the formatted results.

        Args:
            input: the prompt text.
            print_mode: when truthy, print the results, the elapsed time and
                the GPU usage report.
            max_new_tokens: upper bound of newly generated tokens
                (default 200, the value that used to be hard-coded).

        Returns:
            A list with one ``"Result: \\n<generated text>"`` string per
            returned sequence.
        """
        # perf_counter is a monotonic clock, which suits interval measurement.
        start = time.perf_counter()
        sequences = generator(
            input,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            # max_length=200,
            max_new_tokens=max_new_tokens,
        )
        result = [f"Result: \n{seq['generated_text']}" for seq in sequences]
        duration = time.perf_counter() - start

        if print_mode:
            for s in result:
                print(s)

            print("-"*20)
            print(f"walltime: {duration} in secs.")
            gpu_usage()
        return result

    return local
    
chat = chat_gen(generator, tokenizer)

In [None]:
# set DEBUG to false to remove all the llm answer outputs
# DEBUG=True
DEBUG=False

In [None]:
def print_answer(answer: list) -> None:
    """Print the first answer and its last line, but only when DEBUG is set.

    Args:
        answer: list of answer strings; only the first entry is used.
    """
    if not DEBUG:
        return
    rule = "-"*10
    print(rule)
    first_answer = answer[0]
    print(first_answer)
    print(rule)
    # The text after the final line break of the answer.
    print(first_answer.split("\n")[-1])

#### Free pytorch gpu memory
* https://discuss.pytorch.org/t/how-to-delete-a-tensor-in-gpu-to-free-up-memory/48879/5
* https://discuss.huggingface.co/t/clear-gpu-memory-of-transformers-pipeline/18310
* https://saturncloud.io/blog/how-to-free-up-all-memory-pytorch-is-taking-from-gpu-memory/
* https://discuss.pytorch.org/t/how-to-free-the-pytorch-transformers-model-from-gpu-memory/132968
* https://stackoverflow.com/questions/70508960/how-to-free-gpu-memory-in-pytorch

#### Huggingface pipelines
* https://huggingface.co/docs/transformers/main_classes/pipelines
* clean cuda torch gpu: https://stackoverflow.com/questions/55322434/how-to-clear-cuda-memory-in-pytorch

In [None]:
import gc
def free_memory_gen(
    generator: "transformers.Pipeline",
    tokenizer: "transformers.PreTrainedTokenizerBase"):
    """Create a function that releases the pipeline's model and cached CUDA memory.

    Deleting local aliases of ``generator`` and ``tokenizer`` releases
    nothing, because this closure and every other holder of the pipeline
    object still reference it. The returned function therefore clears the
    pipeline's ``model`` attribute, which drops the pipeline's reference to
    the model no matter who else holds the pipeline object, and then drops
    the closure's own references.

    The memory is only returned once no other reference to the model object
    itself remains. After the call the pipeline can no longer generate text.

    Args:
        generator: pipeline object whose ``model`` attribute holds the model.
        tokenizer: tokenizer used together with the pipeline.

    Returns:
        A function without arguments; calling it more than once is harmless.
    """
    def local():
        nonlocal generator, tokenizer
        if generator is not None and hasattr(generator, "model"):
            # Drop the pipeline's reference to the model.
            generator.model = None
        # Drop the references held by this closure.
        generator = None
        tokenizer = None
        # Collect reference cycles first, then hand the cached blocks back.
        gc.collect()
        torch.cuda.empty_cache()
    return local

free_memory = free_memory_gen(generator, tokenizer)    

In [None]:
# chain of thoughts prompting

# testing prompt
input='Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n'
print(input)

In [None]:
answer = chat(input, False)
#print_answer(answer)
print(answer[0])

In [None]:
from pdf_text_loader import PDFHelper

In [None]:
#loader = PDFHelper(data_folder = "./data/medreports", file_pattern="KK-SCIVIAS-*.pdf")
#context = loader.read_pdf(1)

#input = f"Context: Patient: Fried\nFrage: Welcher name hat der Patient?\nAntwort: Name ist Fried\nContext: {context}\nFrage: Welcher name hat die Patientin?\nAntwort: die Patientin hat name "
#print(input)
#chat(input)

In [None]:
# Sub-directory below DATA_ROOT that holds this project's data.
data_path = "core-kind/yinwang"
# PDFHelper is imported from the project-local pdf_text_loader module; here it
# is pointed at the medreports folder and the KK-SCIVIAS-*.txt file pattern.
loader = PDFHelper(data_folder = f"{DATA_ROOT}/{data_path}/data/medreports", file_pattern="KK-SCIVIAS-*.txt")
# NOTE(review): presumably index 0 selects the first file matching the pattern
# and returns its text; confirm against PDFHelper.read_txt.
context = loader.read_txt(0)

#### zero shot prompt

In [None]:
# name: zero-shot prompt asking for the patient's name
# (typo "folowing" fixed so the wording matches the age and diagnosis prompts)
input=f"Can you tell me the name of the patient from the following doctor's letter?\nLetter:\n{context}\nAnswer: "

In [None]:
#len(input)
# 6810

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

In [None]:
#age
input=f"Can you tell me the age of the patient from the following doctor's letter?\nLetter:\n{context}\nAnswer: "

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

In [None]:
#diagnosis
input=f"Can you tell me the diagnosis of the patient from the following doctor's letter?\nLetter:\n{context}\nAnswer: "

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

#### Chain-of-thoughts prompt

In [None]:
# name prompt
input = f"Context: Patient: Fried\nQuestion: what is the name of the patient? \nAnswer: Name of the patient is Fried\nContext: {context}\nQuestion: what is the name of the patient?\nAnswer: the name of patient is"
#print(input)

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

In [None]:
# age prompt
input = f"Context:\nPatient: Fried is a 34-year-old patient\nQuestion:\nhow old is the patient? \nAnswer:\nFried is a patient, 34 year-old, the answers is 34\nContext:\n{context}\nQuestion:\nhow old is the patient?\nAnswer: "
# print(input)

In [None]:
# age prompt
#len(input)
# > 6913 tokens

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

In [None]:
# diagnose prompt
input=f"Context:\nPatient: Fried is a 34-year-old patient, Diagnoses: Influenza (J09.X2) \nQuestion:\nWhat diagnoses has the patient? \nAnswer:\nFried is a patient, 34 year-old, has diagnoses Influenza (J09.X2). The answers is Influenza (J09.X2)\nContext:\n{context}\nQuestion:\nWhat diagnoses has the patient?\nAnswer: "

In [None]:
answer=chat(input, print_mode=False)
print_answer(answer)

In [None]:
gpu_usage()

In [None]:
free_memory()

In [None]:
gpu_usage()