# Setup LLM pipeline with LangChain

In [4]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

In [10]:
template = """You are an expert in physics and astronomy.

Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)

In [11]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [12]:
# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path="../../resources/models/OLMo-7B-Instruct-GGUF/OLMo-7B-Instruct-Q4_K_M.gguf",
    temperature=0.75,
    max_tokens=2000,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 21 key-value pairs and 226 tensors from ../../resources/models/OLMo-7B-Instruct-GGUF/OLMo-7B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = olmo
llama_model_loader: - kv   1:                               general.name str              = OLMo-7B-Instruct-hf
llama_model_loader: - kv   2:                           olmo.block_count u32              = 32
llama_model_loader: - kv   3:                        olmo.context_length u32              = 2048
llama_model_loader: - kv   4:                      olmo.embedding_length u32              = 4096
llama_model_loader: - kv   5:                   olmo.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                  olmo.attention.head_count u32              = 32
llama_model_loader: - kv   7

In [13]:
question = """
Question: Solar system is
"""
llm.invoke(question)

  a) A stationary object
  b) An ever-moving object
  Answer: a) A stationary object. (True)


llama_print_timings:        load time =    4162.74 ms
llama_print_timings:      sample time =       4.24 ms /    29 runs   (    0.15 ms per token,  6836.40 tokens per second)
llama_print_timings: prompt eval time =    4162.10 ms /     7 tokens (  594.59 ms per token,     1.68 tokens per second)
llama_print_timings:        eval time =    4457.53 ms /    28 runs   (  159.20 ms per token,     6.28 tokens per second)
llama_print_timings:       total time =    8743.09 ms /    35 tokens


'  a) A stationary object\n  b) An ever-moving object\n  Answer: a) A stationary object. (True)'

## References
1. https://python.langchain.com/docs/integrations/llms/llamacpp/
2. https://github.com/abetlen/llama-cpp-python/blob/main/examples/high_level_api/langchain_custom_llm.py