# Introduction

This notebook demonstrates how to run an LLM on the MPS backend (Apple silicon GPU) using torch 2.x.

Reference:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [1]:
import sys, os
import torch

In [2]:
# Verify that the MPS (Metal Performance Shaders) backend is available.
if torch.backends.mps.is_available():
    print("MPS is available")
    # Device handle for the MPS backend; only bound when MPS is available.
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
from dataclasses import dataclass
@dataclass
class DirectorySetting:
    """Directory layout used for the model download.

    The two relative paths are resolved against ``home_dir`` by the getters.
    """
    # root directory that the two relative paths below are appended to
    home_dir: str = "/home/jovyan/llm-models"
    # cache directory, relative to home_dir
    transformers_cache_home: str = "core-kind/yinwang/models"
    # Hugging Face token file, relative to home_dir
    huggingface_token_file: str = "core-kind/yinwang/.cache/huggingface/token"

    def get_cache_home(self):
        """Return ``home_dir`` and the cache directory joined by a ``/``."""
        return "{}/{}".format(self.home_dir, self.transformers_cache_home)

    def get_token_file(self):
        """Return ``home_dir`` and the token file path joined by a ``/``."""
        return "{}/{}".format(self.home_dir, self.huggingface_token_file)
    
# Directory presets, keyed by the environment the notebook runs in.
dir_mode_map = {
    # uses the DirectorySetting defaults
    "kf_notebook": DirectorySetting(),
    # local macOS paths
    "mac_local": DirectorySetting(
        home_dir="/Users/yingding",
        transformers_cache_home="MODELS",
        huggingface_token_file="MODELS/.huggingface_token",
    ),
}

# Short model aliases mapped to their Hugging Face Hub repository ids.
model_map = {
    "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat" :   "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat" :   "meta-llama/Llama-2-70b-chat-hf",
    # "70B" : "meta-llama/Llama-2-70b-hf"
    "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    # The Mixtral repository id has no "Mistral-" prefix in its model name.
    "mistral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
}

# Default model alias and directory preset for this notebook.
default_model_type = "mistral7B-01"
default_dir_mode = "mac_local"

# Resolve the active directory preset.
dir_setting = dir_mode_map[default_dir_mode]

# Export both environment variables in one call.
os.environ.update(
    {
        "WORLD_SIZE": "1",
        "XDG_CACHE_HOME": dir_setting.get_cache_home(),
    }
)

In [4]:
# Display the cache root that was exported to the environment above.
os.environ['XDG_CACHE_HOME']

'/Users/yingding/MODELS'

In [5]:
import transformers
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
# Record the transformers and torch versions this notebook runs with.
print(transformers.__version__)
print(torch.__version__)

4.36.2
2.1.2


## Choose LLM model

In [6]:
# Pick the model alias to load; switch by (un)commenting one of the lines below.
# model_type = default_model_type
# model_type = "mistral7B-inst02"
model_type = "llama13B-chat"

# Resolve the alias to a Hub repository id. An unknown alias falls back to the
# repository id of the default alias; the alias string itself is not a valid
# repository id and must not be used as the fallback value.
model_name = model_map.get(model_type, model_map[default_model_type])
print(model_name)

meta-llama/Llama-2-13b-chat-hf


### Fast tokenizer

* https://github.com/huggingface/transformers/issues/23889#issuecomment-1584090357

In [7]:
# MAX_POSITION_EMBEDDINGS = 3072
# MAX_LENGTH = 4096

def need_token(model_type: str, model_name_prefix: str="llama"):
    """Tell whether a model alias requires a Hugging Face access token.

    Args:
        model_type: model alias to test.
        model_name_prefix: alias prefix that marks token-requiring models.

    Returns:
        True when ``model_type`` begins with ``model_name_prefix``.
    """
    has_prefix = model_type.startswith(model_name_prefix)
    return has_prefix

def get_token(dir_setting: DirectorySetting):
    """Read the Hugging Face access token from the configured token file.

    Args:
        dir_setting: directory preset whose ``get_token_file()`` returns the
            path of the token file.

    Returns:
        The file content with every whitespace character removed.

    Raises:
        OSError: if the token file cannot be opened.
    """
    token_file_path = dir_setting.get_token_file()
    with open(token_file_path, "r", encoding="utf-8") as file:
        # Remove all whitespace, not only "\n": a token file saved with CRLF
        # line endings or trailing blanks would otherwise produce a token
        # that still contains "\r" or spaces.
        token = "".join(file.read().split())
    return token

# Keyword arguments carrying the access token; stays empty when the selected
# model alias does not need one.
token_kwargs = {}
if need_token(model_type):
    token_kwargs["token"] = get_token(dir_setting)
    print("huggingface token loaded")
else:
    print("huggingface token is NOT needed")

# Load the tokenizer of the selected model, forwarding the access token (if any).
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # device='mps',
    #max_position_embeddings=MAX_LENGTH,
    #max_length=MAX_LENGTH,
    # device_map="auto", # put to GPU
    # NOTE(review): presumably has no effect for a tokenizer — TODO confirm and drop if so
    device="cpu", # put to CPU
    # use_auth_token=token, #transformer==4.31.0
    **token_kwargs
)

huggingface token loaded


In [8]:
# Display the loaded tokenizer and its settings.
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-13b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
# Show the concrete tokenizer class that AutoTokenizer returned.
print(type(tokenizer))

<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


### Testing token
* https://huggingface.co/docs/tokenizers/pipeline

In [10]:
# Few-shot arithmetic prompt used to exercise the tokenizer.
# The literal keeps its source indentation, and every "\n" escape adds a
# newline on top of the physical line break that follows it.
inputs=["""
        Q: Roger has 3 tennis balls. 
        He buys 2 more cans of tennis balls. 
        Each can has 4 tennis balls. How many tennis balls does he have now?\n
        A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls.
        3 + 8 = 11. The answer is 11.\n
        Q: The cafeteria had 23 apples. 
        If they used 20 to make lunch and bought 6 more, how many apples do they have?\n
        """]

In [11]:
# Encode the test prompt, then print the token count followed by the token ids.
input_test_encoded = tokenizer.encode(inputs[0])
print(len(input_test_encoded))
print(input_test_encoded)

141
[1, 29871, 13, 4706, 660, 29901, 14159, 756, 29871, 29941, 22556, 26563, 29889, 29871, 13, 4706, 940, 1321, 952, 29871, 29906, 901, 508, 29879, 310, 22556, 26563, 29889, 29871, 13, 4706, 7806, 508, 756, 29871, 29946, 22556, 26563, 29889, 1128, 1784, 22556, 26563, 947, 540, 505, 1286, 29973, 13, 13, 4706, 319, 29901, 14159, 4687, 411, 29871, 29941, 26563, 29889, 29871, 29906, 508, 29879, 310, 29871, 29946, 22556, 26563, 1269, 338, 29871, 29947, 22556, 26563, 29889, 13, 308, 29941, 718, 29871, 29947, 353, 29871, 29896, 29896, 29889, 450, 1234, 338, 29871, 29896, 29896, 29889, 13, 13, 4706, 660, 29901, 450, 274, 2142, 1308, 423, 750, 29871, 29906, 29941, 623, 793, 29889, 29871, 13, 4706, 960, 896, 1304, 29871, 29906, 29900, 304, 1207, 301, 3322, 322, 18093, 29871, 29953, 901, 29892, 920, 1784, 623, 793, 437, 896, 505, 29973, 13, 13, 308]


In [12]:
# Decode the token ids back to text to inspect the round trip.
response_test_decoded = tokenizer.decode(input_test_encoded)
print(response_test_decoded)

<s> 
        Q: Roger has 3 tennis balls. 
        He buys 2 more cans of tennis balls. 
        Each can has 4 tennis balls. How many tennis balls does he have now?

        A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls.
        3 + 8 = 11. The answer is 11.

        Q: The cafeteria had 23 apples. 
        If they used 20 to make lunch and bought 6 more, how many apples do they have?

        


### load llm

In [13]:
# bitsandbytes quantization does not work with MPS, so it stays switched off
# (set quantization_enabled = True to try it on another backend).
quantization_enabled = False

# Quantization options; note they are not forwarded to the pipeline while the
# unpacking at the end of the call below stays commented out.
compression_kwargs = {"load_in_8bit": True} if quantization_enabled else {}

# Build the text-generation pipeline for the selected model.
generator = transformers.pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,  # optional
    torch_dtype=torch.float16,  # bfloat16 is not supported on MPS backend
    # torch_dtype=torch.float32,
    device_map="auto",
    # max_length=MAX_LENGTH,
    max_length=None,  # no limit on the total length of the generated response
    max_new_tokens=100,  # number of newly generated tokens
    **token_kwargs,
    # **compression_kwargs,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

##### Install the autopep8 or black extension in VSCode
Press `shift + opt + F` to auto-format Python code.

In [14]:
from util.accelerator_utils import AcceleratorStatus

# Create the accelerator status helper and report the current memory usage.
gpu_status = AcceleratorStatus.create_accelerator_status()
gpu_status.gpu_usage()

--------------------
Allocated memory : 26.086761 GB
--------------------


In [15]:
import pydantic, time
# Display the installed pydantic version.
pydantic.__version__

'1.10.13'

In [16]:
def chat_gen(
    generator: transformers.pipelines.text_generation.TextGenerationPipeline,
    tokenizer: transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast,
    gpu_status: AcceleratorStatus
):
    """Build a chat function bound to a pipeline, a tokenizer and a status helper.

    Args:
        generator: text-generation pipeline that produces the responses.
        tokenizer: tokenizer whose ``eos_token_id`` is passed to the pipeline.
        gpu_status: helper whose ``gpu_usage()`` is called after a verbose run.

    Returns:
        The ``local`` function defined below.
    """
    def local(input_prompts: "list | str | None" = None, temperature: float=0.1, max_new_tokens: int=200, verbose: bool=True) -> list:
        """
        Generate responses for a batch of prompts.

        do_sample, top_k, num_return_sequences, eos_token_id are the settings
        of the TextGenerationPipeline.

        Args:
            input_prompts: list of prompt strings; a single string is treated
                as a one-element list, ``None`` as an empty list.
            temperature: sampling temperature passed to the pipeline.
            max_new_tokens: max number of tokens to generate in the output.
            verbose: when truthy, print every result, the wall time and the
                memory usage.

        Returns:
            One list per prompt, each holding the ``"Result: \\n..."`` strings
            of that prompt's generated sequences.

        Reference:
        https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation
        """
        # Normalise the prompts to a list. This avoids a shared mutable default
        # and lets a single prompt string be passed directly: for a bare string
        # the pipeline may return a flat list of dicts, and the nested loops
        # below would then iterate over dict keys instead of sequences.
        if input_prompts is None:
            input_prompts = []
        elif isinstance(input_prompts, str):
            input_prompts = [input_prompts]

        start = time.time()
        sequences = generator(
            input_prompts,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            # pad_token_id=tokenizer.eos_token_id, # for mistral
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            repetition_penalty=1.1  # without this output begins repeating
        )

        # One inner list per prompt, one entry per returned sequence.
        batch_result = [
            [f"Result: \n{seq['generated_text']}" for seq in prompt_result]
            for prompt_result in sequences
        ]

        duration = time.time() - start

        if verbose:
            for prompt_result in batch_result:
                for result in prompt_result:
                    print("promt-response")
                    print(result)
            print("-"*20)
            print(f"walltime: {duration} in secs.")
            gpu_status.gpu_usage()

        return batch_result
    return local
    
# Bind the pipeline, tokenizer and status helper into a reusable chat function.
chat = chat_gen(generator, tokenizer, gpu_status)

In [17]:
# System prompt block, opened with [INST]<<SYS>> and closed with <</SYS>>.
# NOTE(review): no closing [/INST] is appended in the cells shown here —
# confirm against the Llama-2 chat prompt format before using it for generation.
system_message="""[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Your answers should only answer the question once and not have any text after the answer is done.\n\n
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n
"""

# testing prompt
# NOTE: this rebinds `inputs`, replacing the indented multi-line prompt that
# was defined earlier for the tokenizer test.
inputs=['Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n']

def get_inputs(idx):
    """Return the test prompt at ``idx`` prefixed with the system message.

    Reads the notebook-level ``system_message`` and ``inputs``.
    """
    return "{}{}".format(system_message, inputs[idx])

# def get_inputs(idx):   
#     return f"{inputs[idx]}" 

# Preview the first prompt with the system message prepended.
print(get_inputs(0))

[INST]<<SYS>>
You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Your answers should only answer the question once and not have any text after the answer is done.


If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
<</SYS>>


Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?



In [18]:
# Run generation on the raw few-shot prompt list (`inputs`); the
# system-message variant built by get_inputs is not used in this call.
verbose = True
batch_answers = chat(inputs, temperature=0.001, max_new_tokens = 80, verbose=verbose)
# batch_answers = chat(inputs, temperature=0.1, max_new_tokens = 80, verbose=verbose)
# chat prints the results itself when verbose; otherwise print the first one here.
if not verbose:
    prompt_0_results = batch_answers[0]
    print(prompt_0_results[0])

promt-response
Result: 
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: They started with 23 apples. 20 were used for lunch, leaving 3. Then they bought 6 more, so they have 3 + 6 = 9 apples.
--------------------
walltime: 8.053721189498901 in secs.
--------------------
Allocated memory : 30.546768 GB
--------------------


In [19]:
import gc
def clear_mps_memory(tokenizer, generator):
    """Try to release the memory held by the tokenizer and the pipeline model.

    Args:
        tokenizer: tokenizer object to drop, or None to skip.
        generator: pipeline whose ``model`` is moved to the CPU before it is
            dropped, or None to skip.

    Prints the memory usage through the notebook-level ``gpu_status`` at the end.

    NOTE(review): ``del`` below only unbinds this function's local names. The
    caller's own variables (and the ``chat`` closure built from them) still
    reference the same objects, so they presumably have to be deleted as well
    before the memory can actually be reclaimed — TODO confirm.
    """
    if tokenizer is not None:
        del tokenizer
    if generator is not None:
        # need to move the model to cpu before delete.
        generator.model.cpu()
        del generator
    # collect unreachable objects first, then empty the MPS cache
    gc.collect()
    torch.mps.empty_cache()
    # report the GPU usage
    gpu_status.gpu_usage()


In [20]:
# Toggle: set CLEAR_MEMORY to True to call clear_mps_memory when this cell runs.
CLEAR_MEMORY = False
# CLEAR_MEMORY = True

if CLEAR_MEMORY:
    clear_mps_memory(tokenizer=tokenizer, generator=generator)

In [21]:
# Report the accelerator memory usage once more.
gpu_status.gpu_usage()

--------------------
Allocated memory : 30.546768 GB
--------------------
