# Introduction

This notebook demonstrates running an LLM on the MPS backend (Apple silicon GPU) with torch 2.x.

Reference:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [1]:
import os
import torch
import applyllm as apl

print(apl.__version__)


0.0.6


In [2]:
# Verify that the MPS (Metal Performance Shaders) backend is available.
if torch.backends.mps.is_available():
    print("MPS is available")
    # Device handle for the MPS backend; only defined when MPS is usable.
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
from applyllm.accelerators import (
    DirectorySetting,
    TokenHelper as th,
)
    
# Directory layouts per runtime environment.
# "mac_local" resolves the home directory of the current user at runtime
# instead of hard-coding one account's absolute path, so the notebook also
# runs under other local accounts.
dir_mode_map = {
    "kf_notebook": DirectorySetting(),
    "mac_local": DirectorySetting(
        home_dir=os.path.expanduser("~"),
        transformers_cache_home="MODELS",
        huggingface_token_file="MODELS/.huggingface_token",
    ),
}

# Short model keys mapped to their Hugging Face repository ids.
model_map = {
    "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat" :   "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat" :   "meta-llama/Llama-2-70b-chat-hf",
    "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
    "mixtral8x7B-inst01":   "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "gemma7b-it": "google/gemma-7b-it",
    "gemma7b" : "google/gemma-7b",
    "gemma7b-it-1.1": "google/gemma-1.1-7b-it",
    "gemma2b-it": "google/gemma-2b-it",
    "gemma2b" : "google/gemma-2b",
    "gemma2b-it-1.1": "google/gemma-1.1-2b-it",
}

# Defaults: keys into model_map and dir_mode_map respectively.
default_model_type = "mistral7B-01"
default_dir_mode = "mac_local"

dir_setting = dir_mode_map[default_dir_mode]

os.environ["WORLD_SIZE"] = "1"
# Point XDG_CACHE_HOME at the cache directory of the selected DirectorySetting.
os.environ['XDG_CACHE_HOME'] = dir_setting.get_cache_home()

print(os.environ['XDG_CACHE_HOME'])

/Users/yingding/MODELS


In [4]:
import transformers
import torch

print(transformers.__version__)
print(torch.__version__)

4.39.3
2.2.2


## Choose LLM model

In [5]:
# Select the model by its short key; keep exactly one assignment uncommented.
# model_type = default_model_type
# model_type = "gemma7b-it"
model_type = "gemma7b-it-1.1"
# model_type = "gemma2b-it"
# model_type = "gemma2b-it-1.1"
# model_type = "mistral7B-inst02"
# model_type = "llama7B-chat"
# model_type = "llama13B-chat"

# Resolve the key to a Hugging Face repository id. An unknown key falls back
# to the repository id of the default model; the previous fallback returned
# the default *key* ("mistral7B-01"), which is not a valid repository id.
model_name = model_map.get(model_type, model_map[default_model_type])
print(model_name)

google/gemma-1.1-7b-it


### Fast tokenizer

* https://github.com/huggingface/transformers/issues/23889#issuecomment-1584090357

### Load LLM Model and then Tokenizer

In [6]:
from applyllm.pipelines import (
    ModelCatalog,
    KwargsBuilder
)
# Token-related keyword arguments that TokenHelper derives for this model.
token_kwargs = th.gen_token_kwargs(dir_setting=dir_setting, model_type=model_type)
print(f"token_kwargs: {token_kwargs}")

# float16 is used here; torch.bfloat16 is the alternative that was tried.
data_type = torch.float16
# "auto" is the alternative device map. Auto cast was found not to work for
# MPS with transformers 4.38.2:
# https://github.com/huggingface/transformers/issues/29431
device_map = "mps"

# Arguments for loading the model. Mixtral models have no max_new_tokens
# limit, so none is set at this level.
model_kwargs = dict(
    # bfloat16 is not supported on the MPS backend; float16 only on a GPU accelerator.
    torch_dtype=data_type,
    device_map=device_map,
    # None removes the cap on the total length of the generated response.
    max_length=None,
)
print(f"model_kwargs: {model_kwargs}")

# Arguments for transformers.pipeline.
# torch_dtype has to be set for the model and for the pipeline because of a
# transformers issue; otherwise the pipeline uses more memory than necessary:
# https://github.com/huggingface/transformers/issues/28817
# https://github.com/mlflow/mlflow/pull/10979
#
# return_full_text=False makes the pipeline return only the generated text:
# https://github.com/huggingface/transformers/issues/17117#issuecomment-1120809167
pipeline_kwargs = dict(
    task="text-generation",
    max_new_tokens=200,
    do_sample=True,  # sampling must be enabled for temperature to apply
    temperature=0.001,
    device_map=device_map,  # use the MPS device if available
    top_k=3,
    top_p=0.85,  # 0.95
    framework="pt",  # use PyTorch as the framework
    return_full_text=False,  # generated text only, without the input text
)

# Extra pipeline arguments applied only to Google-family (Gemma) models.
gemma_pipeline_kwargs = dict(add_special_tokens=True, torch_dtype=data_type)

# pipeline_kwargs override model_kwargs during the merge.
pipeline_kwargs = KwargsBuilder([model_kwargs]).override(pipeline_kwargs).build()

if model_name.startswith(ModelCatalog.GOOGLE_FAMILY):
    pipeline_kwargs = (
        KwargsBuilder([pipeline_kwargs]).override(gemma_pipeline_kwargs).build()
    )

print(f"pipeline_kwargs: {pipeline_kwargs}")


huggingface token is NOT needed
token_kwargs: {}
model_kwargs: {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None}
pipeline_kwargs: {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.001, 'top_k': 3, 'top_p': 0.85, 'framework': 'pt', 'return_full_text': False, 'add_special_tokens': True}


### Max memory to offload parts of LLM model to the CPU memory
* https://huggingface.co/docs/accelerate/concept_guides/big_model_inference#designing-a-device-map

Note:
* Max-memory offloading to the CPU is implemented for CUDA only



In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from applyllm.utils import time_func
from applyllm.pipelines import ModelConfig, LocalCausalLMConfig


# Base loader arguments: which checkpoint to load and where to place it.
base_lm_config = ModelConfig(
    model_config=dict(
        pretrained_model_name_or_path=model_name,
        device_map=device_map,
    )
)

# There is no bitsandbytes quantization support for the MPS backend yet, so
# "quantized" is set to False. The 4-bit settings are still passed along;
# presumably LocalCausalLMConfig ignores them when quantized is False - confirm.
kwargs = dict(
    quantized=False,
    model_config=base_lm_config.get_config(),
    quantization_config=dict(
        quantization_config=transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        ),
    ),
    trust_remote_code=True,
)

lm_config = LocalCausalLMConfig(**kwargs)

@time_func
def load_model():
    """Load the causal language model described by ``lm_config``.

    Reads the module-level ``lm_config`` and ``token_kwargs``.

    Returns:
        The result of ``AutoModelForCausalLM.from_pretrained`` called with the
        ``lm_config`` settings plus ``token_kwargs``.
    """
    loader_kwargs = lm_config.get_config()
    return AutoModelForCausalLM.from_pretrained(**loader_kwargs, **token_kwargs)

model = load_model()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

executed: load_model() python function
walltime: 19.388441801071167 in secs.


In [8]:
# Tokenizer settings: same checkpoint as the model, with "device" set to "cpu".
tokenizer_kwargs = dict(
    model_config=dict(
        pretrained_model_name_or_path=model_name,
        device="cpu",
    ),
)
tokenizer_config = ModelConfig(**tokenizer_kwargs)

# Load the tokenizer with the configured settings plus the token arguments.
tokenizer = AutoTokenizer.from_pretrained(**tokenizer_config.get_config(), **token_kwargs)

In [9]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-1.1-7b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, sing

### Testing token
* https://huggingface.co/docs/tokenizers/pipeline

In [10]:
print(model_name)

google/gemma-1.1-7b-it


In [11]:
from applyllm.pipelines import (
    ModelCatalog,
    PromptHelper
)

# Look up the catalog entry for the selected model and create the helper that
# formats prompts for it.
model_info = ModelCatalog.get_model_info(model_name)
prompt_helper = PromptHelper(model_info)

# Google-family models get a few-shot query that is passed through
# prompt_helper.gen_prompt; every other family receives the raw few-shot text.
# In both branches `inputs` is a one-element list holding the test prompt.
if model_info.model_family == ModelCatalog.GOOGLE_FAMILY:
    query = """BEGIN EXAMPLE
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
END EXAMPLE

Your turn:            
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? 
"""
    inputs=[prompt_helper.gen_prompt(query)]
else: 
    inputs=["""
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
"""]

In [12]:
# Encode the test prompt, then show the token count followed by the token ids.
input_test_encoded = tokenizer.encode(inputs[0])
print(len(input_test_encoded))
print(input_test_encoded)

253
[2, 106, 1645, 108, 2045, 708, 476, 10055, 235269, 62275, 578, 10406, 20409, 235265, 108, 24367, 3448, 685, 1707, 3948, 685, 3077, 2177, 573, 4807, 2793, 4646, 235265, 108, 6922, 10523, 1412, 1297, 3448, 573, 2872, 3631, 235269, 66004, 578, 749, 780, 791, 1089, 2793, 1452, 573, 3448, 603, 3015, 235265, 108, 6922, 10523, 1412, 1297, 614, 2793, 578, 780, 3707, 1089, 19319, 235269, 26353, 3782, 235269, 689, 1156, 93846, 235265, 109, 2495, 476, 2872, 1721, 780, 1501, 5229, 689, 603, 780, 2251, 38303, 63269, 235269, 10200, 3165, 5918, 576, 39534, 2775, 780, 5112, 235265, 1927, 692, 1453, 235303, 235251, 1230, 573, 3448, 577, 476, 2872, 235269, 3743, 1453, 235303, 235251, 4638, 1566, 2113, 235265, 6372, 1931, 590, 1453, 235303, 235251, 1230, 235265, 109, 26093, 90412, 108, 235368, 235292, 23627, 919, 235248, 235304, 22560, 20980, 235265, 1315, 58015, 235248, 235284, 978, 34252, 576, 22560, 20980, 235265, 9573, 798, 919, 235248, 235310, 22560, 20980, 235265, 2250, 1767, 22560, 20980, 1721

In [13]:
response_test_decoded = tokenizer.decode(input_test_encoded)
print(response_test_decoded)

<bos><start_of_turn>user
You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Your answers should only answer the question once, concise and do not have any text after the answer is done.
Your answers should only be text and not include any HTML, bullet points, or other markup.

If a question does not make sense or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. Just say I don't know.

BEGIN EXAMPLE
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
END EXAMPLE

Your turn:            
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? 

<end_of_turn>
<

### Load LLM

In [14]:
# bitsandbytes quantization does not work with the MPS backend.
print(pipeline_kwargs)

# Objects handed to transformers.pipeline: the loaded model and its tokenizer.
tp_kwargs = dict(model=model, tokenizer=tokenizer)
tp_config = ModelConfig(model_config=tp_kwargs)

# Build the text-generation pipeline from the three keyword-argument sets.
generator = transformers.pipeline(
    **tp_config.get_config(), **pipeline_kwargs, **token_kwargs
)

{'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.001, 'top_k': 3, 'top_p': 0.85, 'framework': 'pt', 'return_full_text': False, 'add_special_tokens': True}


##### Install autopep8 or black extension in VSCode
`shift + opt + F` to auto format python code

In [15]:
from applyllm.accelerators import AcceleratorStatus

gpu_status = AcceleratorStatus.create_accelerator_status()
gpu_status.gpu_usage()

--------------------
Allocated memory : 32.937866 GB
--------------------


In [16]:
import pydantic
pydantic.__version__

'2.6.4'

In [17]:
from pprint import pprint
from langchain import PromptTemplate, LLMChain
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline


# Wrap the transformers pipeline so LangChain can use it as an LLM.
llm = HuggingFacePipeline(pipeline=generator)

# Prompt template with a single "{input}" placeholder, formatted by prompt_helper.
template = prompt_helper.gen_prompt("{input}")
prompt = PromptTemplate(input_variables=["input"], template=template)


@time_func
def chat(input) -> str:
    """Send one input text through an LLMChain and return the generated text.

    A new ``LLMChain`` is built from the module-level ``llm`` and ``prompt``
    on every call.

    Args:
        input: str - the input text to chat with the model, e.g. inputs[0]

    Returns:
        The value stored under "text" in the chain response, or "" when that
        key is missing.
    """
    chain = LLMChain(llm=llm, prompt=prompt)
    chain_output = chain.invoke(input={"input": input})
    return chain_output.get("text", "")

# pprint(response, indent=0, width=100)

# response = chat(input=inputs[0])
# print(response)

In [18]:
# Number of times the same test prompt is sent to the model.
repeat = 1
# NOTE(review): for Google-family models inputs[0] was already passed through
# prompt_helper.gen_prompt, and chat() inserts its input into a template that
# was also built with prompt_helper.gen_prompt - it looks like the prompt
# formatting is applied twice here; confirm this is intended.
for i in range(repeat):
    response = chat(input=inputs[0])
    print(response)

executed: chat() python function
walltime: 9.593132019042969 in secs.



In [19]:
import gc
def clear_mps_memory(tokenizer, generator):
    """Move the pipeline model to the CPU and release cached MPS memory.

    Args:
        tokenizer: the tokenizer object, or None to skip it.
        generator: an object exposing a ``.model`` attribute with a ``.cpu()``
            method (e.g. the text-generation pipeline), or None to skip it.

    Note:
        ``del`` only removes the local names inside this function; references
        held by the caller (such as notebook globals) are not deleted.
        Reads the module-level ``gpu_status`` to print the memory usage.
    """
    if tokenizer is not None:
        # Drops only this function's local reference to the tokenizer.
        del tokenizer
    if generator is not None:
        # need to move the model to cpu before delete.
        generator.model.cpu()
        # Drops only this function's local reference to the pipeline.
        del generator
    # Run a garbage collection pass before emptying the MPS cache.
    gc.collect()
    torch.mps.empty_cache()
    # report the GPU usage
    gpu_status.gpu_usage()


In [20]:
gpu_status.gpu_usage()

--------------------
Allocated memory : 34.070023 GB
--------------------


In [21]:
# inputs2 = ["Which animal is the largest mammal?"]
# inputs2 = ["Can you tell me something about chron's disease?"]

# hallucination https://www.findacode.com/snomed/34000006--crohns-disease.html

# real answer is 34000006, probably need a RAG 
# inputs2 = ["Which snomed ct code has chron's disease?"]

# inputs2 = ["Can you tell me more about the company nordcloud?"]
inputs2 = ["Can you tell me more about the company nordcloud in munich?"]

In [22]:
print(chat(input=inputs2[0]))

executed: chat() python function
walltime: 5.8323140144348145 in secs.
I am unable to provide information regarding specific companies or their details. Please provide the relevant company or query for more information.


## Define agent
* https://python.langchain.com/docs/modules/agents/
* https://python.langchain.com/docs/modules/agents/quick_start/
* https://python.langchain.com/docs/modules/agents/how_to/custom_agent/

## Gemma with custom langchain tool 
* https://github.com/Ashufet/LangChain_ReAct-Agent-with-Function-Calling_Ollama-Gemma-LLM_LangSmith
* ReAct Agent: https://www.youtube.com/watch?v=exYUJcz4uZs

## Ollama local host endpoint with LangChain
* https://medium.com/the-constellar-digital-technology-blog/geek-out-time-play-with-langchain-2-locally-with-gemma-96c6ca370649

## ReAct Agent
* https://python.langchain.com/docs/modules/agents/agent_types/react/

## Retrieval Agent examples
* https://github.com/pinecone-io/examples/blob/master/learn/generation/langchain/handbook/08-langchain-retrieval-agent.ipynb

In [23]:
embed_model_map = {
    "sentence-transformers": "sentence-transformers/all-MiniLM-L12-v2", # 384
    "baai" : "BAAI/bge-base-en-v1.5" # 768 embedding dims
}

embed_model_vendor = "sentence-transformers"
# embed_model_vendor = "baai"

In [24]:
embed_model_name = embed_model_map[embed_model_vendor]

In [25]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Settings for the embedding model: "device" is "cpu" and normalization is on.
model_config = dict(
    model_name=embed_model_name,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)
embed_config = ModelConfig(model_config=model_config)

# According to the original note, the model is downloaded to the
# "{MODEL_CACHE_DIR}/models/torch/sentence_transformer" folder.
embed_model = HuggingFaceEmbeddings(**embed_config.get_config())

# Display the resulting configuration as the cell output.
embed_config

ModelConfig(model_config={'model_name': 'sentence-transformers/all-MiniLM-L12-v2', 'model_kwargs': {'device': 'cpu'}, 'encode_kwargs': {'normalize_embeddings': True}})

In [26]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from applyllm.utils import token_size
from applyllm.pipelines import ModelConfig

MAX_POSITION_EMBEDDINGS = 1000  # 4096
# Round down to a whole multiple of 1000.
CHUNK_SIZE = MAX_POSITION_EMBEDDINGS - MAX_POSITION_EMBEDDINGS % 1000

# Splitter settings. The chunk size is kept really small, just to show.
model_config = dict(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=200,
    length_function=token_size,  # alternative: len
    is_separator_regex=False,
)
splitter_config = ModelConfig(model_config=model_config)

# Load the company page and split it into chunks.
loader = WebBaseLoader("https://nordcloud.com/company/")
docs = loader.load()
documents = RecursiveCharacterTextSplitter(**splitter_config.get_config()).split_documents(docs)

# Index the chunks in a FAISS vector store and expose it as a retriever.
vector = FAISS.from_documents(documents, embed_model)
retriever = vector.as_retriever()

In [27]:
retriever.get_relevant_documents(inputs2[0])[0]

Document(page_content="Proud to be cloud native.\n\n\n\n\n\n\n\n\nWhy NordcloudWe help you use the cloud to become stronger, fitter and faster.\nLearn more \n\nOur ApproachWe empower your business to drive value, velocity and growth with the public cloud.\nLearn more \n\n\n\n\n\nCompany Timeline. \n\n\n\n2006A cloud-native infrastructure and web application development company is born in Finland!\n2011Nordcloud was established ‚Äì with a focus on helping customers leverage public cloud infrastructure and DevOps. Growth skyrockets.\nEsa Kunninen appointed as the first Nordcloud CEO.\n2013One of our original founders, Pyry Lehdonvirta, becomes a published author with HTML5 as an Application, a go-to resource for application designers and developers ‚Äì cementing our position as a pioneer in emerging technologies.\nNordcloud expands into its second country: Sweden.\n2014Growth gets serious as Nordcloud joins the 2-year EIT Digital Accelerator to kickstart European expansion.\nNordcloud be

In [28]:
from langchain.chains import RetrievalQA

# Chain type handed to RetrievalQA; "map_reduce" and "stuff" are the other
# values that were tried.
chain_type = "refine"

# Question-answering chain over the retriever; source documents are returned
# together with the result.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type=chain_type,
    verbose=True,
    return_source_documents=True,
)

In [29]:
import langchain

query = inputs2[0]
# langchain.debug = True
qa.invoke({"query": query})
# langchain.debug = False



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Can you tell me more about the company nordcloud in munich?',
 'result': '',
 'source_documents': [Document(page_content="Proud to be cloud native.\n\n\n\n\n\n\n\n\nWhy NordcloudWe help you use the cloud to become stronger, fitter and faster.\nLearn more \n\nOur ApproachWe empower your business to drive value, velocity and growth with the public cloud.\nLearn more \n\n\n\n\n\nCompany Timeline. \n\n\n\n2006A cloud-native infrastructure and web application development company is born in Finland!\n2011Nordcloud was established ‚Äì with a focus on helping customers leverage public cloud infrastructure and DevOps. Growth skyrockets.\nEsa Kunninen appointed as the first Nordcloud CEO.\n2013One of our original founders, Pyry Lehdonvirta, becomes a published author with HTML5 as an Application, a go-to resource for application designers and developers ‚Äì cementing our position as a pioneer in emerging technologies.\nNordcloud expands into its second country: Sweden.\n2014Growth get

# Tools
* https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1864918697
* https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1876906769

In [30]:
from langchain.agents import Tool

# tools = [
#     Tool(
#         name='Knowledge Base',
#         func=qa.invoke,
#         # func=qa.invoke,
#         description=(
#             'use this tool when answering general knowledge queries to get '
#             'more information about the topic'
#         )
#     )
# ]

# Single tool exposing the retrieval QA chain to the agent.
# The company indexed above and asked about in inputs2 is Nordcloud, so the
# tool name and description use that spelling (previously "NorthCloud"),
# which is what the LLM sees when deciding whether to call the tool.
tools = [
    Tool(
        name='nordcloud_search',
        func=qa.invoke,
        description=(
            'Search for information about Nordcloud. '
            'For any questions about Nordcloud, you must use this tool!'
        )
    )
]

In [31]:
from langchain.agents import initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain.agents import AgentType 

# Conversational ReAct agent over the tools above, limited to three iterations
# and keeping the chat history in a ConversationBufferMemory.
# NOTE(review): the recorded run of agent.invoke still ends in an
# OutputParserException after three "Could not parse LLM output" iterations,
# even though handle_parsing_errors=True is set. Presumably the final pass
# triggered by early_stopping_method='generate' is not covered by that
# handling - confirm against the LangChain AgentExecutor documentation.
agent = initialize_agent(
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    output_key = "result",
    handle_parsing_errors=True,
    early_stopping_method='generate',
    memory = ConversationBufferMemory(memory_key = 'chat_history')   
)

  warn_deprecated(


In [32]:
agent.invoke({"input": inputs2[0]})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: ``[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: ``[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: ``[0m
Observation: Invalid or incomplete response
Thought:

OutputParserException: Could not parse LLM output: ``

In [None]:
# from langchain.agents import AgentExecutor
# https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1864918697
# agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, return_intermediate_steps=False, handle_parsing_errors=True)

In [None]:
# agent_executor.invoke({"input": inputs2[0]})
# agent_executor.invoke({"input": "hi"})

In [None]:
CLEAR_MEMORY = False
# CLEAR_MEMORY = True

if CLEAR_MEMORY:
    clear_mps_memory(tokenizer=tokenizer, generator=generator)