## Introduction
The plan:
- load and use llama-cpp with langchain
- add custom CallbackHandler to track token usage

### Imports and installs

In [1]:
%pip install -qqq langchain==0.0.304 --progress-bar off
%pip install -qqq llama-cpp-python==0.2.7 --progress-bar off

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import sys
import warnings

warnings.simplefilter("ignore")

from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks import get_openai_callback
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # handle std out of llm in jupyterNB

***
## Load Llama2-13b

In [None]:
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q4_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = 32  # Change this value based on your model and your GPU VRAM pool.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

llm = LlamaCpp(
    model_path=model_path,
    max_tokens=2500,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    n_ctx=2500, # Context window
    verbose=True, # Verbose is required to pass to the callback manager
)

## Create prompt template and run chain

In [4]:
template = """Question: {question}

Answer: Answer briefly in a sentence!"""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [5]:
chain = LLMChain(prompt=prompt, llm=llm)

with get_openai_callback() as cb:
    result = chain.run({'question': "When was Einstein born? Give year, month day.",})

 Albert Einstein was born on March 14th (3/14), 1879.


llama_print_timings:        load time = 20219.05 ms
llama_print_timings:      sample time =    20.39 ms /    24 runs   (    0.85 ms per token,  1177.22 tokens per second)
llama_print_timings: prompt eval time = 20218.95 ms /    25 tokens (  808.76 ms per token,     1.24 tokens per second)
llama_print_timings:        eval time = 133941.85 ms /    23 runs   ( 5823.56 ms per token,     0.17 tokens per second)
llama_print_timings:       total time = 154335.42 ms


## Custom CallbackHandler

In [6]:
from typing import Any, Dict, List
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import AgentAction, AgentFinish, LLMResult
from langchain.pydantic_v1 import BaseModel
from langchain.schema.messages import BaseMessage


class CustomTokenCounter(BaseCallbackHandler):
    """Callback Handler that tracks token info."""
    
    llama_model: LlamaCpp = None
    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    successful_requests: int = 0
    total_cost: float = 0.0
    llprompts: List[str] = []
    llres: List[str] = []
        
    def __repr__(self) -> str:
        return (
            f"\tTokens Used: {self.total_tokens}\n"
            f"\tPrompt Tokens: {self.prompt_tokens}\n"
            f"\tCompletion Tokens: {self.completion_tokens}\n"
        )

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Run when LLM starts running."""
        self.llprompts.append(prompts)

    def on_chat_model_start(
        self,
        serialized: Dict[str, Any],
        messages: List[List[BaseMessage]],
        **kwargs: Any
    ) -> None:
        """Run when LLM starts running."""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        self.total_tokens += 1

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Run when LLM ends running."""
        if response.llm_output is None:  
            return None
        else:
            text_responses = [gen.text for gens in response.generations for gen in gens]
            self.llres.append(text_responses)

    def on_llm_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when LLM errors."""

    def on_chain_start(
        self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
    ) -> None:
        """Run when chain starts running."""
        print(f'questions: {inputs["question"]}')
        print(f"on_chain_start {serialized['name']}")

    def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
        """Run when chain ends running."""
        print(f'Chain output: {outputs["text"]}')

    def on_chain_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when chain errors."""

    def on_tool_start(
        self, serialized: Dict[str, Any], input_str: str, **kwargs: Any
    ) -> None:
        """Run when tool starts running."""
        print(f"on_tool_start {serialized['name']}")

    def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
        """Run on agent action."""
        print(f"on_agent_action {action}")

    def on_tool_end(self, output: str, **kwargs: Any) -> None:
        """Run when tool ends running."""

    def on_tool_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when tool errors."""

    def on_text(self, text: str, **kwargs: Any) -> None:
        """Run on arbitrary text."""

    def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None:
        """Run on agent end."""

In [13]:
tkns = CustomTokenCounter()
chain = LLMChain(prompt=prompt, llm=llm, callbacks=[tkns])

with get_openai_callback() as cb:
#     result = chain.run({'question': "When was Einstein born? Give year, month day.",})
    result = chain.run({'question': "What is the Fermi paradox?",})

questions: What is the Fermi paradox?


Llama.generate: prefix-match hit


 The Fermi Paradox is that, given the vast numbers of stars with planets and the probability that some have intelligent life, we should have seen evidence of extraterrestrial civilizations by now.output:  The Fermi Paradox is that, given the vast numbers of stars with planets and the probability that some have intelligent life, we should have seen evidence of extraterrestrial civilizations by now.



llama_print_timings:        load time = 20219.05 ms
llama_print_timings:      sample time =    28.78 ms /    45 runs   (    0.64 ms per token,  1563.59 tokens per second)
llama_print_timings: prompt eval time =  7734.64 ms /    20 tokens (  386.73 ms per token,     2.59 tokens per second)
llama_print_timings:        eval time = 23120.62 ms /    44 runs   (  525.47 ms per token,     1.90 tokens per second)
llama_print_timings:       total time = 31049.75 ms


In [14]:
tkns

	Tokensss Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0