# Introduction

this notebook demos example of using llm in a MPS backend (apple silicon GPU) using torch 2.x

Referece:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [30]:
import os
import torch
import applyllm as apl

print(apl.__version__)


0.0.6


In [31]:
# check that MPS is availabe (Metal Performance Shaders)
if not torch.backends.mps.is_available():
    print("MPS is not available")
else:
    print("MPS is available")
    mps_device = torch.device("mps")
    print(mps_device)



MPS is available
mps


## Define global variables

In [32]:
from applyllm.accelerators import (
    DirectorySetting,
    TokenHelper as th,
)
    
dir_mode_map = {
    "kf_notebook": DirectorySetting(),
    "mac_local": DirectorySetting(home_dir="/Users/yingding", transformers_cache_home="MODELS", huggingface_token_file="MODELS/.huggingface_token"),
}

model_map = {
    "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat" :   "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat" :   "meta-llama/Llama-2-70b-chat-hf",
    "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
    "mixtral8x7B-inst01":   "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "gemma7b-it": "google/gemma-7b-it",
    "gemma7b" : "google/gemma-7b",
    "gemma7b-it-1.1": "google/gemma-1.1-7b-it",
    "gemma2b-it": "google/gemma-2b-it",
    "gemma2b" : "google/gemma-2b",
    "gemma2b-it-1.1": "google/gemma-1.1-2b-it",
}

default_model_type = "mistral7B-01"
default_dir_mode = "mac_local"

dir_setting = dir_mode_map[default_dir_mode]

os.environ["WORLD_SIZE"] = "1" 
os.environ['XDG_CACHE_HOME'] = dir_setting.get_cache_home()

print(os.environ['XDG_CACHE_HOME'])

/Users/yingding/MODELS


In [33]:
import transformers
import torch

print(transformers.__version__)
print(torch.__version__)

4.40.2
2.3.0


## Choose LLM model

In [34]:
# model_type = default_model_type
# model_type = "gemma7b-it"
# model_type = "gemma7b-it-1.1"
# model_type = "gemma2b-it"
# model_type = "gemma2b-it-1.1"
model_type = "mistral7B-inst02"
# model_type = "llama7B-chat"
# model_type = "llama13B-chat"

model_name = model_map.get(model_type, default_model_type)
print(model_name)

mistralai/Mistral-7B-Instruct-v0.2


### Fast tokenizer

* https://github.com/huggingface/transformers/issues/23889#issuecomment-1584090357

### Load LLM Model and then Tokenizer

In [35]:
from applyllm.pipelines import (
    ModelCatalog,
    KwargsBuilder
)
token_kwargs = th.gen_token_kwargs(model_type=model_type, dir_setting=dir_setting)
print(f"token_kwargs: {token_kwargs}")

# data_type = torch.bfloat16
data_type = torch.float16
device_map = "mps" # "auto"  
# auto caste not working for mps 4.38.2
# https://github.com/huggingface/transformers/issues/29431 

# mixtral model has no max_new_tokens limit, so it is not set here.
model_kwargs = {
    "torch_dtype": data_type, #bfloat16 is not supported on MPS backend, float16 only on GPU accelerator
    # torch_dtype=torch.float32,
    # max_length=MAX_LENGTH,
    "device_map": device_map,
    "max_length" : None, # remove the total length of the generated response
}
print(f"model_kwargs: {model_kwargs}")

# set the transformers.pipeline kwargs
# the torch_dtype shall be set both for the model and the pipeline, due to a transformer issue.
# otherwise it will cause unnecessary more memory usage in the pipeline of transformers
# https://github.com/huggingface/transformers/issues/28817
# https://github.com/mlflow/mlflow/pull/10979

# Set transformers.pipeline only to return generated text return_full_text=False
# https://github.com/huggingface/transformers/issues/17117#issuecomment-1120809167
pipeline_kwargs = {
    "task": "text-generation",
    "max_new_tokens" : 200,
    "do_sample" : True, # do_sample True is required for temperature
    "temperature" : 0.001, 
    "device_map" : device_map, # use the MPS device if available
    "top_k": 3,
    "top_p": 0.85, #0.95
    # "num_return_sequences": 1,
    "framework": "pt", # use pytorch as framework
    "return_full_text": False, # return only the generated text, not the input text with the generated text
}

gemma_pipeline_kwargs = {
    "add_special_tokens": True,
    "torch_dtype": data_type,
}

# pipeline_kwargs override the model_kwargs during the merge
pipeline_kwargs = KwargsBuilder([model_kwargs]).override(pipeline_kwargs).build()

if model_name.startswith(ModelCatalog.GOOGLE_FAMILY):
    pipeline_kwargs = KwargsBuilder([pipeline_kwargs]).override(gemma_pipeline_kwargs).build()

print(f"pipeline_kwargs: {pipeline_kwargs}")


huggingface token is NOT needed
token_kwargs: {}
model_kwargs: {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None}
pipeline_kwargs: {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.001, 'top_k': 3, 'top_p': 0.85, 'framework': 'pt', 'return_full_text': False}


### Max memory to offload parts of LLM model to the CPU memory
* https://huggingface.co/docs/accelerate/concept_guides/big_model_inference#designing-a-device-map

Note:
* Max Memory offload to CPU is CUDA implementation only



In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from applyllm.utils import time_func
from applyllm.pipelines import ModelConfig, LocalCausalLMConfig


base_lm_config = ModelConfig(
  model_config = {
    "pretrained_model_name_or_path": model_name,
    "device_map": device_map,
  }
)

# No bitsandbytes qunatization support for MPS backend yet, set quantized to False
kwargs = {
  "quantized": False,
  "model_config": base_lm_config.get_config(),
  "quantization_config": {
    "quantization_config": transformers.BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16
      )
    },
    "trust_remote_code": True,
}

lm_config = LocalCausalLMConfig(**kwargs)

@time_func
def load_model():
  return AutoModelForCausalLM.from_pretrained(    
    **lm_config.get_config(),
    **token_kwargs,  
  )

model = load_model()



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

executed: load_model() python function
walltime: 57.30163502693176 in secs.


In [37]:
tokenizer_kwargs = {
    "model_config": {
        "pretrained_model_name_or_path": model_name,
        "device": "cpu",
        # "device_map": "auto", # put to GPU if GPU is available
        # "max_position_embeddings": MAX_LENGTH,
        # "max_length": MAX_LENGTH,
    },
}
tokenizer_config = ModelConfig(**tokenizer_kwargs)

tokenizer = AutoTokenizer.from_pretrained(
    **tokenizer_config.get_config(),
    **token_kwargs
)

In [38]:
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### Testing token
* https://huggingface.co/docs/tokenizers/pipeline

In [39]:
print(model_name)

mistralai/Mistral-7B-Instruct-v0.2


In [40]:
from applyllm.pipelines import (
    ModelCatalog,
    PromptHelper
)

model_info = ModelCatalog.get_model_info(model_name)
prompt_helper = PromptHelper(model_info)

if model_info.model_family == ModelCatalog.GOOGLE_FAMILY:
    query = """BEGIN EXAMPLE
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
END EXAMPLE

Your turn:            
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? 
"""
    inputs=[prompt_helper.gen_prompt(query)]
else: 
    inputs=["""
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
"""]

In [41]:
input_test_encoded = tokenizer.encode(inputs[0])
print(f"{len(input_test_encoded)}")
print(input_test_encoded)

122
[1, 28705, 13, 28824, 28747, 14115, 659, 28705, 28770, 19552, 16852, 28723, 650, 957, 846, 28705, 28750, 680, 277, 509, 302, 19552, 16852, 28723, 7066, 541, 659, 28705, 28781, 19552, 16852, 28723, 1602, 1287, 19552, 16852, 1235, 400, 506, 1055, 28804, 13, 28741, 28747, 14115, 2774, 395, 28705, 28770, 16852, 28723, 28705, 28750, 277, 509, 302, 28705, 28781, 19552, 16852, 1430, 349, 28705, 28783, 19552, 16852, 28723, 28705, 28770, 648, 28705, 28783, 327, 28705, 28740, 28740, 28723, 415, 4372, 349, 28705, 28740, 28740, 28723, 13, 28824, 28747, 415, 18302, 1623, 515, 553, 28705, 28750, 28770, 979, 2815, 28723, 1047, 590, 1307, 28705, 28750, 28734, 298, 1038, 9957, 304, 7620, 28705, 28784, 680, 28725, 910, 1287, 979, 2815, 511, 590, 506, 28804, 13]


In [42]:
response_test_decoded = tokenizer.decode(input_test_encoded)
print(response_test_decoded)

<s> 
Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?
A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.
Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?



### Load LLM

In [43]:
# bitsandbytes quantization does not work with MPS backend
print(pipeline_kwargs)

# transformer pipeline kwargs
tp_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
}

tp_config = ModelConfig(model_config = tp_kwargs)

generator = transformers.pipeline(
    **tp_config.get_config(),
    **pipeline_kwargs,
    **token_kwargs,
    # **compression_kwargs,
)

{'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.001, 'top_k': 3, 'top_p': 0.85, 'framework': 'pt', 'return_full_text': False}


##### Install autopep8 or black extension in VSCode
`shift + opt + F` to auto format python code

In [44]:
from applyllm.accelerators import AcceleratorStatus

gpu_status = AcceleratorStatus.create_accelerator_status()
gpu_status.gpu_usage()

--------------------
Allocated memory : 67.420227 GB
--------------------


In [45]:
import pydantic
pydantic.__version__

'2.7.1'

In [46]:
from pprint import pprint
from langchain import PromptTemplate, LLMChain
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline


llm = HuggingFacePipeline(
    pipeline=generator 
)

def config_tokenizer(model_name: str, config: dict, pad_token_id = 2):
    if model_name.startswith(ModelCatalog.MISTRAL_FAMILY):
        return {**config, "pad_token_id": pad_token_id}
    else:
        return config

llm.model_id = model_name
llm.model_kwargs = config_tokenizer(model_name=model_name, config=model_kwargs, pad_token_id=tokenizer.eos_token_id)
llm.model_kwargs["trust_remote_code"] = True
llm.pipeline_kwargs = config_tokenizer(model_name=model_name, config=pipeline_kwargs, pad_token_id=tokenizer.eos_token_id)

print(llm)

# MAX_LENGTH = 1024
# MAX_NEW_TOKENS = 200

# pipeline_kwargs_config = {
#     "device_map": "auto",
#     "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
#     "max_new_tokens": MAX_NEW_TOKENS, # this is not taken by the model ?
#     "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
#     "temperature": 0.01,
#     "repetition_penalty": 1.15, # 1.15,
# }
# model_kwargs_config = {
#     "do_sample": True, # also making trouble with langchain (optional)
#     "top_k": 3, # this param result in trouble with langchain (optional)
#     "num_return_sequences": 1, # (optional)
#     "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
#     "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
#     "max_new_tokens": MAX_NEW_TOKENS, # this is not taken by the model ?
#     "temperature": 0.01,
#     "top_p": 0.8, # 0.95 # alternative to top_k summerized probability while do_sample=True
#     "repetition_penalty": 1.15, # 1.15,
# }

template = prompt_helper.gen_prompt("{input}")
prompt = PromptTemplate(template=template, input_variables=["input"])


@time_func
def chat(input) -> str:
    """
    Args: 
        input: str - the input text to chat with the model, e.g. inputs[0]
    """
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    # print(repr(llm_chain))
    dict_response = llm_chain.invoke(input={"input": input})
    return dict_response.get("text", "")

# pprint(response, indent=0, width=100)

# response = chat(input=inputs[0])
# print(response)

[1mHuggingFacePipeline[0m
Params: {'model_id': 'mistralai/Mistral-7B-Instruct-v0.2', 'model_kwargs': {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'pad_token_id': 2, 'trust_remote_code': True}, 'pipeline_kwargs': {'torch_dtype': torch.float16, 'device_map': 'mps', 'max_length': None, 'task': 'text-generation', 'max_new_tokens': 200, 'do_sample': True, 'temperature': 0.001, 'top_k': 3, 'top_p': 0.85, 'framework': 'pt', 'return_full_text': False, 'pad_token_id': 2}}


In [47]:
repeat = 1
for i in range(repeat):
    response = chat(input=inputs[0])
    print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:LLMChain] Entering Chain run with input:
[0m{
  "input": "\nQ: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n"
}
[32;1m[1;3m[llm/start][0m [1m[chain:LLMChain > llm:HuggingFacePipeline] Entering LLM run with input:
[0m{
  "prompts": [
    "<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don't know the answer to a questi

In [48]:
import gc
def clear_mps_memory(tokenizer, generator):
    """clear the MPS memory"""
    if tokenizer is not None:
        del tokenizer
    if generator is not None:
        # need to move the model to cpu before delete.
        generator.model.cpu()
        del generator
    gc.collect()
    torch.mps.empty_cache()
    # report the GPU usage
    gpu_status.gpu_usage()


In [49]:
gpu_status.gpu_usage()

--------------------
Allocated memory : 67.453430 GB
--------------------


In [50]:
# inputs2 = ["Which animal is the largest mammal?"]
# inputs2 = ["Can you tell me something about chron's disease?"]

# hallucination https://www.findacode.com/snomed/34000006--crohns-disease.html

# real answer is 34000006, probably need a RAG 
# inputs2 = ["Which snomed ct code has chron's disease?"]

# inputs2 = ["Can you tell me more about the company nordcloud?"]
inputs2 = ["Can you tell me more about the company nordcloud in munich?"]

In [51]:
print(chat(input=inputs2[0]))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:LLMChain] Entering Chain run with input:
[0m{
  "input": "Can you tell me more about the company nordcloud in munich?"
}
[32;1m[1;3m[llm/start][0m [1m[chain:LLMChain > llm:HuggingFacePipeline] Entering LLM run with input:
[0m{
  "prompts": [
    "<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don't know the answer to a question, please don't share false information. Just return \"</s>\"\n\nCan you tell me more about the company nordcloud in munich?\n[/INST]"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[chain:LLMChain > llm:HuggingFacePipeline] [10.83s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "

## Define agent
* https://python.langchain.com/docs/modules/agents/
* https://python.langchain.com/docs/modules/agents/quick_start/
* https://python.langchain.com/docs/modules/agents/how_to/custom_agent/

## Gemma with custom langchain tool 
* https://github.com/Ashufet/LangChain_ReAct-Agent-with-Function-Calling_Ollama-Gemma-LLM_LangSmith
* ReAct Agent: https://www.youtube.com/watch?v=exYUJcz4uZs

## Ollama local host endpoint with LangChain
* https://medium.com/the-constellar-digital-technology-blog/geek-out-time-play-with-langchain-2-locally-with-gemma-96c6ca370649

## ReAct Agent
* https://python.langchain.com/docs/modules/agents/agent_types/react/

## Retrieval Agent examples
* https://github.com/pinecone-io/examples/blob/master/learn/generation/langchain/handbook/08-langchain-retrieval-agent.ipynb

In [52]:
embed_model_map = {
    "sentence-transformers": "sentence-transformers/all-MiniLM-L12-v2", # 384
    "baai" : "BAAI/bge-base-en-v1.5" # 768 embedding dims
}

embed_model_vendor = "sentence-transformers"
# embed_model_vendor = "baai"

In [53]:
embed_model_name = embed_model_map[embed_model_vendor]

In [54]:
from langchain_community.embeddings import HuggingFaceEmbeddings

model_config = {
    "model_name" : embed_model_name,
    "model_kwargs": {'device': 'cpu'},
    "encode_kwargs": {'normalize_embeddings': True}
}
embed_config = ModelConfig(model_config=model_config)

# is downloaded at "{MODEL_CACHE_DIR}/models/torch/sentence_transformer" folder
embed_model = HuggingFaceEmbeddings(
    **embed_config.get_config()
)

embed_config



ModelConfig(model_config={'model_name': 'sentence-transformers/all-MiniLM-L12-v2', 'model_kwargs': {'device': 'cpu'}, 'encode_kwargs': {'normalize_embeddings': True}})

In [55]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from applyllm.utils import token_size
from applyllm.pipelines import ModelConfig

MAX_POSITION_EMBEDDINGS = 1000 # 4096
CHUNK_SIZE = (MAX_POSITION_EMBEDDINGS // 1000) * 1000

# Config splitter
model_config = {
    # Set a really small chunk size, just to show.
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": 200,
    "length_function": token_size, # len,
    "is_separator_regex": False,
}

splitter_config = ModelConfig(model_config=model_config)

loader = WebBaseLoader("https://nordcloud.com/company/", encoding="utf-8")
docs = loader.load()
documents = RecursiveCharacterTextSplitter(
    **splitter_config.get_config()
).split_documents(docs)

vector = FAISS.from_documents(documents, embed_model)
retriever = vector.as_retriever()

In [56]:
# retriever.get_relevant_documents(inputs2[0])[0]
retriever.invoke(input=inputs2[0])[0]

Document(page_content='Tuomas Toropainen\nCFO Office\nTuomas is our numbers guy. With a finance career spanning a wide range of industries, he’s on a mission to challenge traditional perceptions of finance leaders. Like Nordcloud, he’s forward-thinking, and the words ‘But that’s how we’ve always done it!’ will never leave his mouth.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProud to be cloud native.\n\n\n\n\n\n\n\n\nWhy NordcloudWe help you use the cloud to become stronger, fitter and faster.\nLearn more \n\nOur ApproachWe empower your business to drive value, velocity and growth with the public cloud.\nLearn more \n\n\n\n\n\nCompany Timeline. \n\n\n\n2006A cloud-native infrastructure and web application development company is born in Finland!\n2011Nordcloud was established – with a focus on helping customers leverage public cloud infrastructure and DevOps. Growth skyrockets.\nEsa Kinnunen appointed as the first Nordcloud CEO.\n2013One of our original founders, Pyry Lehdonvirta, becomes a p

In [57]:
from langchain.chains import RetrievalQA

chain_type = "map_reduce"
# chain_type = "stuff"
# chain_type = "refine" 
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=chain_type,
    retriever=retriever,
    # combine_docs_chain_kwargs={'prompt': reduce_prompt_template},
    # chain_type_kwargs={"map_prompt": map_prompt_template},
    return_source_documents=True,
    verbose=True,
    )

qa



In [58]:
import langchain

query = inputs2[0]
langchain.debug = True
qa.invoke({"query": query})
langchain.debug = False

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Can you tell me more about the company nordcloud in munich?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Tuomas Toropainen\nCFO Office\nTuomas is our numbers guy. With a finance career spanning a wide range of industries, he’s on a mission to challenge traditional perceptions of finance leaders. Like Nordcloud, he’s forward-thinking, and the words ‘But that’s how we’ve always done it!’ will never leave his mouth.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nProud to be cloud native.\n\n\n\n\n\n\n\n\nWhy NordcloudWe help you use the cloud to become stronger, fitter and faster.\nLearn more \n\nOur ApproachWe empower your busin

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1107ded10>>
Traceback (most recent call last):
  File "/Users/yingding/VENV/agents3.11/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQA > chain:MapReduceDocumentsChain > chain:LLMChain > llm:HuggingFacePipeline] [326.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nNordcloud Deutschland GmbH (IBM)\nMies-van-der-Rohe-Straße 6\nTower 1 / 28 OG\n80807 München\n\nThis text indicates that Nordcloud has an office located in Munich, Germany, at the address Mies-van-der-Rohe-Straße 6, Tower 1 / 28 OG, 80807 München.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[llm/end][0m [1m[chain:RetrievalQA > chain:MapReduceDocumentsChain > chain:LLMChain > llm:HuggingFacePipeline] [326.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nIlja Summala is the Group CTO at Nordcloud in Munich.\nNordcloud is a European leader in cloud implementation, application development, and managed services.\nNordcloud helps customers 

Token indices sequence length is longer than the specified maximum sequence length for this model (1955 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQA > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Can you tell me more about the company nordcloud in munich?",
  "summaries": "\nNordcloud Deutschland GmbH (IBM)\nMies-van-der-Rohe-Straße 6\nTower 1 / 28 OG\n80807 München\n\nThis text indicates that Nordcloud has an office located in Munich, Germany, at the address Mies-van-der-Rohe-Straße 6, Tower 1 / 28 OG, 80807 München.\n\n\nIlja Summala is the Group CTO at Nordcloud in Munich.\nNordcloud is a European leader in cloud implementation, application development, and managed services.\nNordcloud helps customers manage infrastructure and develop apps in the cloud.\nNordcloud partners with Amazon Web Services, Microsoft Azure, Google Cloud, SAP, IBM Multicloud, and others.\nNordcloud has offices in multiple countries, including Munich.\nNordcloud's newsletter provides unique professional insights from their cloud native experts on th

# Tools
* https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1864918697
* https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1876906769

In [None]:
from langchain.agents import Tool

# tools = [
#     Tool(
#         name='Knowledge Base',
#         func=qa.invoke,
#         # func=qa.invoke,
#         description=(
#             'use this tool when answering general knowledge queries to get '
#             'more information about the topic'
#         )
#     )
# ]

tools = [
    Tool(
        name='northcloud_search',
        func=qa.invoke,
        description=(
            'Search for information about NorthCloud. '
            'For any questions about NorthCloud, you must use this tool!'
        )
    )
]

In [None]:
from langchain.agents import initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain.agents import AgentType 

agent = initialize_agent(
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    output_key = "result",
    handle_parsing_errors=True,
    early_stopping_method='generate',
    memory = ConversationBufferMemory(memory_key = 'chat_history')   
)

In [None]:
agent.invoke({"input": inputs2[0]})

In [None]:
# from langchain.agents import AgentExecutor
# https://github.com/langchain-ai/langchain/issues/14954#issuecomment-1864918697
# agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, return_intermediate_steps=False, handle_parsing_errors=True)

In [None]:
# agent_executor.invoke({"input": inputs2[0]})
# agent_executor.invoke({"input": "hi"})

In [None]:
CLEAR_MEMORY = False
# CLEAR_MEMORY = True

if CLEAR_MEMORY:
    clear_mps_memory(tokenizer=tokenizer, generator=generator)