# Introduction

This notebook demonstrates running an LLM on the MPS backend (Apple silicon GPU) with torch 2.x.

Reference:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [1]:
import os
import torch

In [2]:
# Check whether the MPS (Metal Performance Shaders) backend is available;
# the device handle is only created when it is.
if torch.backends.mps.is_available():
    print("MPS is available")
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
from dataclasses import dataclass
@dataclass
class DirectorySetting:
    """Locations of the model cache and of the Hugging Face token file.

    ``transformers_cache_home`` and ``huggingface_token_file`` are relative
    to ``home_dir``; the getters join them with a single "/".
    """
    home_dir: str = "/home/jovyan/llm-models"
    transformers_cache_home: str = "core-kind/yinwang/models"
    huggingface_token_file: str = "core-kind/yinwang/.cache/huggingface/token"

    def get_cache_home(self):
        """Return ``<home_dir>/<transformers_cache_home>``."""
        return "{}/{}".format(self.home_dir, self.transformers_cache_home)

    def get_token_file(self):
        """Return ``<home_dir>/<huggingface_token_file>``."""
        return "{}/{}".format(self.home_dir, self.huggingface_token_file)
    
# Directory presets, keyed by the environment the notebook is run in.
dir_mode_map = {
    "kf_notebook": DirectorySetting(),
    "mac_local": DirectorySetting(
        home_dir="/Users/yingding",
        transformers_cache_home="MODELS",
        huggingface_token_file="MODELS/.huggingface_token",
    ),
}

# Short model keys mapped to their Hugging Face repository ids.
model_map = {
    "llama7B-chat": "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat": "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat": "meta-llama/Llama-2-70b-chat-hf",
    # "70B" : "meta-llama/Llama-2-70b-hf"
    "mistral7B-01": "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral8x7B-01": "mistralai/Mixtral-8x7B-v0.1",
    "mixtral8x7B-inst01": "mistralai/Mixtral-8x7B-Instruct-v0.1",
}

default_model_type = "mistral7B-01"
default_dir_mode = "mac_local"

# The preset that the rest of the notebook reads its paths from.
dir_setting = dir_mode_map[default_dir_mode]

# Set before `transformers` is imported in a later cell; XDG_CACHE_HOME is
# pointed at the cache directory of the active preset.
os.environ["WORLD_SIZE"] = "1"
os.environ["XDG_CACHE_HOME"] = dir_setting.get_cache_home()

In [4]:
# show the cache directory that was exported through XDG_CACHE_HOME
os.environ['XDG_CACHE_HOME']

'/Users/yingding/MODELS'

In [5]:
import transformers
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
# record the library versions this run was made with
print(transformers.__version__)
print(torch.__version__)

4.37.1
2.1.2


## Choose LLM model

In [6]:
# Pick the model by its short key; uncomment one of the alternatives to switch.
# model_type = default_model_type
model_type = "mistral7B-inst02"
# model_type = "llama13B-chat"

# Resolve the short key to the Hugging Face repository id. An unknown key
# falls back to the repository id of `default_model_type`; the previous
# fallback returned the short key itself, which is not a valid repo id.
model_name = model_map.get(model_type, model_map[default_model_type])
print(model_name)

mistralai/Mistral-7B-Instruct-v0.2


### Fast tokenizer

* https://github.com/huggingface/transformers/issues/23889#issuecomment-1584090357

In [7]:
# MAX_POSITION_EMBEDDINGS = 3072
# MAX_LENGTH = 4096

def need_token(model_type: str, model_name_prefix: str="llama"):
    """Tell whether an access token is required for ``model_type``.

    A token is considered necessary exactly when the model key begins with
    ``model_name_prefix`` (``"llama"`` by default).
    """
    requires_token = model_type.startswith(model_name_prefix)
    return requires_token

def get_token(dir_setting: "DirectorySetting") -> str:
    """Read the Hugging Face access token from the configured token file.

    Args:
        dir_setting: object whose ``get_token_file()`` returns the path of
            the token file.

    Returns:
        The file content with all leading and trailing whitespace removed.

    Raises:
        OSError: if the token file cannot be opened.
    """
    token_file_path = dir_setting.get_token_file()
    with open(token_file_path, "r", encoding="utf-8") as file:
        # Token files usually end with a newline (possibly "\r\n") and may
        # carry stray spaces; strip everything around the token so none of
        # it ends up in the value that is sent as credential.
        return file.read().strip()

# Optional auth kwargs forwarded to the `from_pretrained` / `pipeline` calls:
# a {"token": ...} dict when the selected model needs one, otherwise empty.
token_required = need_token(model_type)
token_kwargs = {"token": get_token(dir_setting)} if token_required else {}
print("huggingface token loaded" if token_required else "huggingface token is NOT needed")

huggingface token is NOT needed


### Load LLM Model and then Tokenizer

In [8]:
from torch import bfloat16

# Shared pipeline/generation settings.
# bfloat16 is not supported on the MPS backend, so float16 is used instead
# (float32 would be the alternative: torch_dtype=torch.float32).
pipeline_kwargs = dict(
    torch_dtype=torch.float16,
    max_length=None,     # no limit on the total length of the generated response
    max_new_tokens=80,   # limit on newly generated tokens
)



In [9]:
from transformers import AutoModelForCausalLM

# bnb_config = transformers.BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=bfloat16
# )

# Load the causal-LM checkpoint named by `model_name`, passing
# device_map='auto' and the optional auth kwargs from `token_kwargs`.
# NOTE(review): no torch_dtype is passed here (the line below is commented
# out), so the dtype is whatever the library picks by default — confirm this
# is intended, given that float16 is configured for the pipeline.
# pretrained_model_name_or_path
# TODO: use model config to set the max_length
model = AutoModelForCausalLM.from_pretrained(
  pretrained_model_name_or_path=model_name,
  device_map='auto',
  # max_length= None, # remove the total length of the generated response
  # max_new_tokens=80,
  # quantization_config=bnb_config,
  # max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB',
  # torch_dtype=torch.float16
  **token_kwargs,  
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Load the tokenizer that belongs to `model_name`, with the same optional
# auth kwargs as the model.
# NOTE(review): `device="cpu"` is forwarded as an extra keyword argument —
# confirm whether the tokenizer makes any use of it.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # device='mps',
    #max_position_embeddings=MAX_LENGTH,
    #max_length=MAX_LENGTH,
    # device_map="auto", # put to GPU
    device="cpu", # put to CPU
    # use_auth_token=token, #transformer==4.31.0
    **token_kwargs
)

In [11]:
# display the tokenizer's repr (vocabulary size, special tokens, padding side, ...)
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
# show which concrete tokenizer class AutoTokenizer resolved to
print(type(tokenizer))

<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>


### Testing the tokenizer
* https://huggingface.co/docs/tokenizers/pipeline

In [13]:
# Few-shot arithmetic prompt used by the tokenizer encode/decode cells below.
# The indentation and line breaks inside the literal are part of the text
# that gets tokenized. `inputs` is rebound to a different prompt later on.
inputs=["""
        Q: Roger has 3 tennis balls. 
        He buys 2 more cans of tennis balls. 
        Each can has 4 tennis balls. How many tennis balls does he have now?\n
        A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls.
        3 + 8 = 11. The answer is 11.\n
        Q: The cafeteria had 23 apples. 
        If they used 20 to make lunch and bought 6 more, how many apples do they have?\n
        """]

In [14]:
# Encode the sample prompt, then show the number of token ids and the ids.
input_test_encoded = tokenizer.encode(inputs[0])
print(len(input_test_encoded))
print(input_test_encoded)

139
[1, 28705, 13, 5390, 1186, 28747, 14115, 659, 28705, 28770, 19552, 16852, 28723, 28705, 13, 5390, 650, 957, 846, 28705, 28750, 680, 277, 509, 302, 19552, 16852, 28723, 28705, 13, 5390, 7066, 541, 659, 28705, 28781, 19552, 16852, 28723, 1602, 1287, 19552, 16852, 1235, 400, 506, 1055, 28804, 13, 13, 5390, 330, 28747, 14115, 2774, 395, 28705, 28770, 16852, 28723, 28705, 28750, 277, 509, 302, 28705, 28781, 19552, 16852, 1430, 349, 28705, 28783, 19552, 16852, 28723, 13, 273, 28770, 648, 28705, 28783, 327, 28705, 28740, 28740, 28723, 415, 4372, 349, 28705, 28740, 28740, 28723, 13, 13, 5390, 1186, 28747, 415, 18302, 1623, 515, 553, 28705, 28750, 28770, 979, 2815, 28723, 28705, 13, 5390, 1047, 590, 1307, 28705, 28750, 28734, 298, 1038, 9957, 304, 7620, 28705, 28784, 680, 28725, 910, 1287, 979, 2815, 511, 590, 506, 28804, 13, 13, 273]


In [15]:
# decode the ids back to text to check the round trip; the recorded output
# starts with "<s>", i.e. encode() prepended that token
response_test_decoded = tokenizer.decode(input_test_encoded)
print(response_test_decoded)

<s> 
        Q: Roger has 3 tennis balls. 
        He buys 2 more cans of tennis balls. 
        Each can has 4 tennis balls. How many tennis balls does he have now?

        A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls.
        3 + 8 = 11. The answer is 11.

        Q: The cafeteria had 23 apples. 
        If they used 20 to make lunch and bought 6 more, how many apples do they have?

        


### load llm

In [16]:
# quantization_enabled = True
# bitsandbytes quantization does not work with MPS 

# quantization_enabled = False

# if quantization_enabled:
#     compression_kwargs = {
#         "load_in_8bit": True,
#         # "load_in_4bit": True,
#     }
# else:
#     compression_kwargs = {}

# Build the text-generation pipeline around the already loaded model and
# tokenizer. The dtype and length limits are taken from `pipeline_kwargs`
# (torch_dtype=float16, max_length=None, max_new_tokens=80), which was
# defined for exactly this purpose but previously left unused while the same
# values were repeated inline here.
# NOTE(review): `model` is passed as an instantiated object — confirm whether
# `torch_dtype` / `device_map` still have an effect in that case.
generator = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,  # optional
    device_map="auto",
    **pipeline_kwargs,
    **token_kwargs,
    # **compression_kwargs,
)

##### Install autopep8 or black extension in VSCode
`shift + opt + F` to auto format python code

In [17]:
from util.accelerator_utils import AcceleratorStatus

# AcceleratorStatus comes from the project-local `util.accelerator_utils` module.
gpu_status = AcceleratorStatus.create_accelerator_status()
# report the current memory usage (the recorded output shows "Allocated memory")
gpu_status.gpu_usage()

--------------------
Allocated memory : 28.008636 GB
--------------------


In [18]:
import pydantic, time
pydantic.__version__

'1.10.13'

In [19]:
def chat_gen(
    generator: "transformers.pipelines.text_generation.TextGenerationPipeline",
    tokenizer: "transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast",
    gpu_status: "AcceleratorStatus",
):
    """Build a chat function bound to a pipeline, a tokenizer and a status reporter.

    The annotations are quoted so that defining this factory does not require
    the annotated classes to be resolvable at definition time.

    Args:
        generator: callable text-generation pipeline; it is invoked with a
            list of prompts plus the sampling keyword arguments below.
        tokenizer: only its ``eos_token_id`` attribute is read.
        gpu_status: object whose ``gpu_usage()`` is called after a verbose run.

    Returns:
        The ``local`` closure documented below.
    """
    def local(input_prompts=None, temperature: float = 0.1, max_new_tokens: int = 200, verbose: bool = True) -> list:
        """Generate one response per prompt and return them as nested lists.

        do_sample, top_k, num_return_sequences, eos_token_id are the settings
        passed to the TextGenerationPipeline.

        Reference:
        https://huggingface.co/docs/transformers/generation_strategies#customize-text-generation

        Args:
            input_prompts: list of prompt strings. ``None`` means "no
                prompts"; a single string is treated as a one-element list.
            temperature: sampling temperature forwarded to the generator.
            max_new_tokens: max number of tokens to generate in the output.
            verbose: when true, print every result, the wall time and the
                accelerator usage.

        Returns:
            A list with one entry per prompt; each entry is a list of
            ``"Result: \\n<generated text>"`` strings.
        """
        # Normalise the prompts: avoid a shared mutable default, and wrap a
        # bare string so the generator always returns one list per prompt
        # (the result loop below relies on that nested shape).
        if input_prompts is None:
            prompts = []
        elif isinstance(input_prompts, str):
            prompts = [input_prompts]
        else:
            prompts = input_prompts

        start = time.perf_counter()
        sequences = generator(
            prompts,
            do_sample=True,
            top_k=10,
            num_return_sequences=1,
            # pad_token_id=tokenizer.eos_token_id, # for mistral
            eos_token_id=tokenizer.eos_token_id,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            repetition_penalty=1.1,  # without this output begins repeating
        )

        # One inner list per prompt, one formatted string per returned sequence.
        batch_result = [
            [f"Result: \n{seq['generated_text']}" for seq in prompt_result]
            for prompt_result in sequences
        ]
        duration = time.perf_counter() - start

        if verbose:
            for prompt_result in batch_result:
                for result in prompt_result:
                    print("promt-response")
                    print(result)
            print("-"*20)
            print(f"walltime: {duration} in secs.")
            gpu_status.gpu_usage()

        return batch_result
    return local
    
# bind the chat function to the pipeline, tokenizer and status reporter created above
chat = chat_gen(generator, tokenizer, gpu_status)

In [20]:
def mistral_instruct_message(user_msg: str) -> str:
    """Wrap ``user_msg`` in the ``<s>[INST] ... [/INST]`` prompt used for Mistral.

    The fixed instruction text comes first, then the user message on its own
    indented line, then the closing ``[/INST]`` tag.

    NOTE(review): the literal ``<s>`` is part of the returned text, while the
    tokenizer round-trip earlier in this notebook shows ``encode()`` already
    prepending ``<s>`` — confirm that a doubled start token is intended.
    """
    # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
    instructions = (
        "<s>[INST] You are a helpful, respectful and honest assistant.\n"
        "    Always answer as helpfully as possible using the context text provided.\n"
        "    Your answers should only answer the question once and not have any text after the answer is done.\n\n"
        "    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n"
        "    If you don't know the answer to a question, please don't share false information. Just return \"</s>\"\n"
        "    \n\n"
    )
    return f"{instructions}    {user_msg}\n\n    [/INST]"
    
def llama_instruct_message(user_msg: str) -> str:
    """Wrap ``user_msg`` in the ``[INST]<<SYS>> ... <</SYS>> ... [/INST]`` prompt used for Llama.

    The system text sits between the ``<<SYS>>`` tags, followed by the user
    message on its own indented line and the closing ``[/INST]`` tag.
    """
    # https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF
    system_part = (
        "[INST]<<SYS>>You are a helpful, respectful and honest assistant.\n"
        "    Always answer as helpfully as possible using the context text provided.\n"
        "    Your answers should only answer the question once and not have any text after the answer is done.\n\n"
        "    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n"
        "    If you don't know the answer to a question, please don't share false information.<</SYS>>\n"
        "    \n\n"
    )
    return f"{system_part}    {user_msg}\n\n    [/INST]"


In [21]:


# Testing prompt: one few-shot arithmetic question as a single-element list.
# This rebinds `inputs`, replacing the indented multi-line prompt that was
# used for the tokenizer test.
inputs=['Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n']

# global variable model_name
# print(model_name)

def get_inputs(idx, model_name):
    """Return ``inputs[idx]`` wrapped in the instruct template for ``model_name``.

    The model family is detected case-insensitively from the name, so both
    the short keys (e.g. "llama7B-chat") and full repository ids (e.g.
    "meta-llama/Llama-2-7b-chat-hf") are recognised:

    * names containing "mistral" or "mixtral" use the Mistral template,
    * names containing "llama" use the Llama template,
    * anything else gets the raw prompt.

    Args:
        idx: index into the module-level ``inputs`` list.
        model_name: model key or repository id; it is also printed.

    Returns:
        The prompt string to send to the model.
    """
    print(model_name)
    family = model_name.lower()
    # The former `startedwith("ll")` call raised AttributeError, and a "ll"
    # prefix could never match the "meta-llama/..." repository ids anyway.
    if "mistral" in family or "mixtral" in family:
        return mistral_instruct_message(inputs[idx])
    if "llama" in family:
        return llama_instruct_message(inputs[idx])
    return inputs[idx]

# def get_inputs(idx):   
#     return f"{inputs[idx]}" 

# preview the formatted prompt for the first test input
print(get_inputs(0, model_name))

mistralai/Mistral-7B-Instruct-v0.2
<s>[INST] You are a helpful, respectful and honest assistant.
    Always answer as helpfully as possible using the context text provided.
    Your answers should only answer the question once and not have any text after the answer is done.

    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
    If you don't know the answer to a question, please don't share false information. Just return "</s>"
    

    Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?


    [/INST]


In [22]:
verbose = True
# NOTE(review): the raw `inputs` list is passed here; the instruct-formatted
# prompt built by get_inputs() is only printed in the previous cell and is
# not used for generation — confirm this is intended.
batch_answers = chat(inputs, temperature=0.001, max_new_tokens = 80, verbose=verbose)
# batch_answers = chat(inputs, temperature=0.1, max_new_tokens = 80, verbose=verbose)
# chat() prints the results itself when verbose is true; otherwise show the
# first result of the first prompt here
if not verbose:
    prompt_0_results = batch_answers[0]
    print(prompt_0_results[0])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


promt-response
Result: 
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: The cafeteria started with 23 apples. They used 20 for lunch, so that leaves 3 apples remaining. Then they bought 6 more apples, making the total number of apples 9. However, this answer is incorrect because it doesn't match the given information. The correct answer should be 29 (23 original apples
--------------------
walltime: 13.717299222946167 in secs.
--------------------
Allocated memory : 32.028671 GB
--------------------


In [23]:
import gc
def clear_mps_memory(tokenizer, generator):
    """Move the pipeline's model to the CPU, release cached MPS memory and report usage.

    Args:
        tokenizer: tokenizer object or ``None``.
        generator: pipeline-like object exposing ``.model`` or ``None``; its
            model is moved to the CPU.

    Note:
        ``del`` only removes this function's local names. References held by
        the caller (e.g. notebook globals) keep the objects alive, so the
        caller has to drop those itself if the objects should be freed.
    """
    if tokenizer is not None:
        del tokenizer
    if generator is not None:
        # need to move the model to cpu before delete.
        generator.model.cpu()
        del generator
    gc.collect()
    # Only touch the MPS allocator when an MPS device is present, so the
    # helper can also be called on machines without the MPS backend.
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    # report the GPU usage
    gpu_status.gpu_usage()


In [24]:
# Switch: set to True to run clear_mps_memory() on the tokenizer and pipeline
# at the end of the notebook.
CLEAR_MEMORY = False
# CLEAR_MEMORY = True

if CLEAR_MEMORY:
    clear_mps_memory(tokenizer=tokenizer, generator=generator)

In [25]:
# report the memory usage once more at the end of the run
gpu_status.gpu_usage()

--------------------
Allocated memory : 32.028671 GB
--------------------
