## Introduction
The plan:
- install and load llama-cpp, then use it through LangChain
- add a custom `CallbackHandler` to track token usage

### Imports and installs

In [1]:
import os
import sys

import logging
import warnings
from importlib import reload

# Silence every Python warning so library noise does not clutter cell output.
warnings.simplefilter(action="ignore")

# Re-import the logging module to start from a fresh logging state, then send
# INFO-and-above records to stdout so they render as normal notebook output
# (and not in a red stderr box).
reload(logging)
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='',
    datefmt=None,
)

# Notebook-wide logger, e.g. log.info("This prints as normal output in Jupyter").
log = logging.getLogger(__name__)

In [2]:
%%capture
# Install exact, pinned versions of langchain and llama-cpp-python into the
# kernel's environment. The %%capture cell magic above (it must stay on the
# first line of the cell) hides the installer output; -qqq and
# --progress-bar off keep pip itself quiet.
%pip install -qqq langchain==0.0.304 --progress-bar off
%pip install -qqq llama-cpp-python==0.2.7 --progress-bar off

In [None]:
from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks import get_openai_callback
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # handle std out of llm in jupyterNB

***
## Load Llama2-13b

In [None]:
# Hugging Face repository and the quantized GGUF file to fetch from it.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q4_K_M.gguf"
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
)

# Tunables: adjust to your model and the VRAM available on your GPU.
n_gpu_layers = 32  # Number of layers to offload to the GPU.
n_batch = 512  # Should be between 1 and n_ctx.

# Stream generated tokens to stdout as they are produced.
stdout_streamer = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([stdout_streamer])

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=2500,  # Context window
    max_tokens=2500,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

## Create prompt template and run chain

In [5]:
# Prompt with a single placeholder, {question}, followed by an instruction
# asking the model for a one-sentence answer.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Answer briefly in a sentence!"
)

prompt = PromptTemplate(input_variables=["question"], template=template)

In [6]:
# %%capture --no-stdout
# Wire the prompt template and the model into one chain.
chain = LLMChain(llm=llm, prompt=prompt)

# Run one question through the chain inside the OpenAI callback context;
# the answer text is kept in `result`.
chain_inputs = {'question': "When was Einstein born? Give year, month day."}
with get_openai_callback() as cb:
    result = chain.run(chain_inputs)



Einstein was born on March 14, 1879.


llama_print_timings:        load time = 20707.13 ms
llama_print_timings:      sample time =    13.37 ms /    20 runs   (    0.67 ms per token,  1496.00 tokens per second)
llama_print_timings: prompt eval time = 20707.02 ms /    25 tokens (  828.28 ms per token,     1.21 tokens per second)
llama_print_timings:        eval time = 14672.60 ms /    19 runs   (  772.24 ms per token,     1.29 tokens per second)
llama_print_timings:       total time = 35483.85 ms


## Custom CallbackHandler

In [7]:
from typing import Any, Dict, List
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import AgentAction, AgentFinish, LLMResult
from langchain.pydantic_v1 import BaseModel
from langchain.schema.messages import BaseMessage


class CustomTokenCounter(BaseCallbackHandler):
    """Callback handler that records LLM prompts/completions and counts their tokens.

    ``on_llm_start`` stores each batch of prompts and ``on_llm_end`` stores the
    generated texts of each response. Nothing is tokenized while the model
    runs; call :meth:`count` afterwards with the model whose
    ``get_num_tokens`` should be used. That call fills ``prompt_tokens``,
    ``completion_tokens`` and ``total_tokens`` and prints a summary.

    Attributes:
        llama_model: Placeholder for a model reference; not read by this class.
        total_tokens: ``prompt_tokens + completion_tokens`` after :meth:`count`.
        prompt_tokens: Tokens in all recorded prompts after :meth:`count`.
        completion_tokens: Tokens in all recorded completions after :meth:`count`.
        successful_requests: Declared for reporting; not updated by this handler.
        total_cost: Declared for reporting; not updated by this handler.
        llprompts: One entry per LLM call, each the list of prompts of that call.
        llres: One entry per LLM call, each the list of generated texts of that call.
    """

    llama_model: LlamaCpp = None
    total_tokens: int = 0
    prompt_tokens: int = 0
    completion_tokens: int = 0
    successful_requests: int = 0
    total_cost: float = 0.0
    # Annotation only: the lists are created per instance in __init__. A
    # class-level ``= []`` default would be shared by every handler instance,
    # so one handler would see (and count) another handler's history.
    llprompts: List[List[str]]
    llres: List[List[str]]

    def __init__(self) -> None:
        """Create a handler with its own, empty prompt and completion history."""
        super().__init__()
        self.llprompts = []
        self.llres = []

    def __repr__(self) -> str:
        """Return the token totals as a tab-indented, three-line summary."""
        return (
            f"\tTokens Used: {self.total_tokens}\n"
            f"\tPrompt Tokens: {self.prompt_tokens}\n"
            f"\tCompletion Tokens: {self.completion_tokens}\n"
        )

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
    ) -> None:
        """Record the batch of prompts sent to the LLM."""
        # Store a copy so later mutation of the caller's list cannot change
        # what was recorded.
        self.llprompts.append(list(prompts))

    def on_chat_model_start(
        self,
        serialized: Dict[str, Any],
        messages: List[List[BaseMessage]],
        **kwargs: Any
    ) -> None:
        """Run when a chat model starts running. Nothing is recorded."""

    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        """Run on new LLM token. Only available when streaming is enabled."""
        # Tokens are counted after the fact in count(), not while streaming.

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        """Record every generated text of the finished LLM call."""
        if response.generations is None:
            return
        self.llres.append(
            [gen.text for gens in response.generations for gen in gens]
        )

    def on_llm_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when LLM errors."""

    def on_chain_start(
        self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any
    ) -> None:
        """Print the chain's question and name when a chain starts."""
        # Not every chain has a "question" input or a serialized "name";
        # fall back to a placeholder instead of raising KeyError in a callback.
        question = inputs.get("question", inputs)
        chain_name = (serialized or {}).get("name", "<unnamed>")
        print(f'questions: {question}')
        print(f"on_chain_start {chain_name}")

    def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None:
        """Print the chain's text output when a chain ends."""
        # Chains with a different output key have no "text"; show all outputs then.
        text = outputs.get("text", outputs)
        print(f'Chain output: {text}')

    def on_chain_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when chain errors."""

    def on_tool_start(
        self, serialized: Dict[str, Any], input_str: str, **kwargs: Any
    ) -> None:
        """Print the tool's name when a tool starts."""
        tool_name = (serialized or {}).get("name", "<unnamed>")
        print(f"on_tool_start {tool_name}")

    def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
        """Print the agent action."""
        print(f"on_agent_action {action}")

    def on_tool_end(self, output: str, **kwargs: Any) -> None:
        """Run when tool ends running."""

    def on_tool_error(self, error: BaseException, **kwargs: Any) -> None:
        """Run when tool errors."""

    def on_text(self, text: str, **kwargs: Any) -> None:
        """Run on arbitrary text."""

    def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None:
        """Run on agent end."""

    def count(self, llama_model: LlamaCpp) -> None:
        """Tokenize the recorded history and print the token totals.

        Args:
            llama_model: Model whose ``get_num_tokens`` is applied to every
                recorded prompt and every recorded completion text.

        Updates ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``
        on this handler, then prints the same summary that ``repr()`` returns.
        Totals are recomputed from the full history on each call, so calling
        this repeatedly does not double count.
        """
        # Walk every prompt/text of every call, not only the first one, so
        # multi-prompt calls and multi-generation responses are fully counted
        # and an empty batch cannot raise IndexError.
        self.prompt_tokens = sum(
            llama_model.get_num_tokens(prompt_text)
            for batch in self.llprompts
            for prompt_text in batch
        )
        self.completion_tokens = sum(
            llama_model.get_num_tokens(generated_text)
            for texts in self.llres
            for generated_text in texts
        )
        self.total_tokens = self.prompt_tokens + self.completion_tokens
        print(repr(self))

In [None]:
# Same Hugging Face repository and GGUF file as the first load.
model_name_or_path = "TheBloke/Llama-2-13B-chat-GGUF"
model_basename = "llama-2-13b-chat.Q4_K_M.gguf"
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
)

# Tunables: adjust to your model and the VRAM available on your GPU.
n_gpu_layers = 32  # Number of layers to offload to the GPU.
n_batch = 512  # Should be between 1 and n_ctx.

# Two handlers this time: one streams tokens to stdout, the other records
# prompts and completions so their tokens can be counted afterwards.
tokencnt = CustomTokenCounter()
streamstd = StreamingStdOutCallbackHandler()
callback_manager = CallbackManager([streamstd, tokencnt])

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=2500,  # Context window
    max_tokens=2500,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

In [9]:
# Rebuild the chain around the model that carries the token-counting handler.
chain = LLMChain(llm=llm, prompt=prompt)

# Questions to send through the chain, in order.
questions = (
    "Explain 2nd law of thermodynamics.",
    "When was Einstein born? Give year, month day.",
    "What is the Fermi paradox?",
)

with get_openai_callback() as cb:
    for question in questions:
        chain.run({'question': question})

 The second law of thermodynamics states that entropy, or disorder, always increases over time in any isolated system, which means that energy can never be fully converted into another form without producing some waste heat or other forms of dissipation.  


llama_print_timings:        load time = 16644.78 ms
llama_print_timings:      sample time =    34.44 ms /    52 runs   (    0.66 ms per token,  1510.09 tokens per second)
llama_print_timings: prompt eval time = 16644.72 ms /    25 tokens (  665.79 ms per token,     1.50 tokens per second)
llama_print_timings:        eval time = 43265.92 ms /    51 runs   (  848.35 ms per token,     1.18 tokens per second)
llama_print_timings:       total time = 60177.49 ms
Llama.generate: prefix-match hit


 Albert Einstein was born on March 14, 1879.


llama_print_timings:        load time = 16644.78 ms
llama_print_timings:      sample time =    11.81 ms /    18 runs   (    0.66 ms per token,  1524.39 tokens per second)
llama_print_timings: prompt eval time = 14568.79 ms /    22 tokens (  662.22 ms per token,     1.51 tokens per second)
llama_print_timings:        eval time = 14828.62 ms /    17 runs   (  872.27 ms per token,     1.15 tokens per second)
llama_print_timings:       total time = 29490.92 ms
Llama.generate: prefix-match hit




The Fermi Paradox refers to the apparent disconnect between our expectation of the likelihood of extraterrestrial life existing in the universe and the lack of empirical evidence or direct observation of such existence.


llama_print_timings:        load time = 16644.78 ms
llama_print_timings:      sample time =    32.24 ms /    47 runs   (    0.69 ms per token,  1457.86 tokens per second)
llama_print_timings: prompt eval time = 12789.03 ms /    20 tokens (  639.45 ms per token,     1.56 tokens per second)
llama_print_timings:        eval time = 40482.68 ms /    46 runs   (  880.06 ms per token,     1.14 tokens per second)
llama_print_timings:       total time = 53518.63 ms


In [10]:
# Tokenize the recorded prompts and completions with `llm` and print the token totals.
tokencnt.count(llm)

	Tokens Used: 193
	Prompt Tokens: 73
	Completion Tokens: 120



In [11]:
# Show the prompts captured by on_llm_start; each entry is the list of prompts from one LLM call.
tokencnt.llprompts

[['Question: Explain 2nd law of thermodynamics.\n\nAnswer: Answer briefly in a sentence!'],
 ['Question: When was Einstein born? Give year, month day.\n\nAnswer: Answer briefly in a sentence!'],
 ['Question: What is the Fermi paradox?\n\nAnswer: Answer briefly in a sentence!']]