In [77]:
import llama_cpp , sys , os, multiprocessing
from llama_cpp import Llama
sys.path.append('//home/zjc1002/Mounts/code/MyModules/utils')
from hf_utils import download_model_hfhub
from pathlib import Path 
from transformers import AutoTokenizer


#### Model Information *(We will be using the below model and its prompt templates for this demo)*
- **Model Name:** Mistral-7B-Instruct V0.2
- **Model Card:** [hf model card](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/blob/main/)
- **Context Window:** 32K 
- **Rope-thea:** 1e6
- No Sliding Window Attention 

#### Model Prompt Templates 
- Description: In order to leverage instruction fine-tuning, your prompt should be surrounded by [INST] and [/INST] tokens. The very first instruction should begin with a begin of sentence id. The next instructions should not. The assistant generation will be ended by the end-of-sentence token id.

- Standard Instruction Template

    ```python
    ext = "<s>[INST] What is your favourite condiment? [/INST]"
    "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!</s> "
    "[INST] Do you have mayonnaise recipes? [/INST]"
    ```

- Chat & Generation Templates: *this hf funcitonality can eliminate abiguity around what prompt template is needed for each model. the model tokenizer actually contains a method to format input text into model specific prompt tempalte*
    - [Chat Templates HF](https://huggingface.co/docs/transformers/main/chat_templating)
        - Note: Generation template information can be found in same link, not all models require generation prompts

    - **MIstral-7B-Instruct V0.2 Jinja Template**    
        ```jinja
        {% if messages[0]['role'] == 'system' %}

            {% set loop_messages = messages[1:] %}
            {% set system_message = messages[0]['content'] %}

        {% elif false == true and not '<<SYS>>' in messages[0]['content'] %}
            {% set loop_messages = messages %}
            {% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}

        {% else %}
            {% set loop_messages = messages %}
            {% set system_message = false %}

        {% endif %}

        {% for message in loop_messages %}
            
            {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
                {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
            {% endif %}
            
            {% if loop.index0 == 0 and system_message != false %}
                {% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}
            {% else %}
                {% set content = message['content'] %}
            {% endif %}

            {% if message['role'] == 'user' %}
                {{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}
            {% elif message['role'] == 'system' %}
                {{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}
            {% elif message['role'] == 'assistant' %}
                {{ ' '  + content.strip() + ' ' + eos_token }}
            {% endif %}

        {% endfor %}"
        ```

In [86]:
non_gguf_modelname= "mistralai/Mistral-7B-Instruct-v0.2"
llamacpp_chat_format ="mistral-instruct"

local_dir = "/home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF"
filename = "mistral-7b-instruct-v0.2.Q4_0.gguf"
context_window = 1024 #MISTRAL 7B 

## Download model to local directory
download_model_hfhub(repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
    , filename = filename
    , repo_type = "model"
    , local_dir = local_dir 
    , local_dir_use_symlinks = False
    )

local_dir:/home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF already exists, skipping download


### Llama CPP Model Object Parameter Overview

In [87]:
model = Llama(

    model_path =  Path(local_dir,filename).as_posix()  # The path to the Llama model file being used
    , n_ctx = context_window  # The context size of the model
    #, n_gpu_layers = -1    # number of layers to offload to GPU (if -1 all layers are offloaded to GPU)
    # , split_mode= 0     # 0: split the model into layers, 1: split the model into blocks
    # , n_batch = None    # prompt processing maximum batch size
    # , n_threads= multiprocessing.cpu_count()-4   # number of threads to use for generation 
    # , n_threads_batch = None #number of threads to use for batch processing
    # , seed = llama_cpp.LLAMA_DEFAULT_SEED
    # , rope_scaling_type = llama_cpp.LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED #Rows on Processor Elements (ROPE): Objective is to distribute the rows of the model layers across multiple GPUS or other processing elements. This can help scale ttraining by allowing it to take advantage of parallel processing capabilities.  
    # , pooling_type = llama_cpp.LLAMA_POOLING_TYPE_UNSPECIFIED  #None, Mean , CLS
    # , logits_all  = False #return logits for all tokens, not just the last token (MUST BE TRUE FOR COMPLETION to return logprobs) *IMPORTANT*
    # , embedding = False #embedding mode only 
    # , last_n_tokens_size = 64 # Maximum number of of tokens to keep in the last_n_tokens deque
    # , lora_base = None #Optional path to base model, useful if using a quantized base model and you want to applyu LoRa to an f16 model 
    # , lora_path = None #path to LoRa file to apply the model 
     , chat_format = llamacpp_chat_format #String specifying the chat format to use when calling create_chat_completion. 
    # , chat_handler = None #Optional chat handler to use when calling create_chat_completion 
    # , draft_model = None #OPtional draft model to use for speculative decoding 
    # , tokenizer = None #optional tokenizer to override the default tokenizer from llama.cpp
)

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32    

Number of CPU threads available: 12


In [88]:
#load mistral tokenizer from huggyface to enable use of chat templates 
tokenizer = AutoTokenizer.from_pretrained(non_gguf_modelname)
print(f'default JNIJA chat template for {non_gguf_modelname}')
print(tokenizer.default_chat_template)

#SAMPLE MESSAGES
chat = [

  {"role": "user", "content": "Hello, how are you?"},

  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},

  {"role": "user", "content": "I'd like to show off how chat templating works!"},

]


print('Default Chat Prompt')
print(tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False))

print('Default Generation Prompt')
print(tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True))



model.create_chat_completion(messages = chat)

default JNIJA chat template for mistralai/Mistral-7B-Instruct-v0.2
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\'t know the answer to a question, please don\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.


llama_print_timings:        load time =    3050.23 ms
llama_print_timings:      sample time =     267.62 ms /   709 runs   (    0.38 ms per token,  2649.25 tokens per second)
llama_print_timings: prompt eval time =    3050.17 ms /    49 tokens (   62.25 ms per token,    16.06 tokens per second)
llama_print_timings:        eval time =  118205.79 ms /   708 runs   (  166.96 ms per token,     5.99 tokens per second)
llama_print_timings:       total time =  123241.75 ms /   757 tokens


{'id': 'chatcmpl-3434bb33-afa9-4832-a168-2755abcf6c88',
 'object': 'chat.completion',
 'created': 1713401967,
 'model': '/home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_0.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': ' Sure thing! Chat templating is a way to create reusable templates for common messages or responses in a conversational interface, such as a chatbot or messaging application. This can save time and effort by allowing you to define a set of pre-written messages that can be easily inserted into conversations.\n\nHere\'s an example of how you might use chat templating in Python using the ChatterBot library:\n\nFirst, let\'s define some templates for common greetings and farewells:\n```python\nimport chatterbot\n\nclass GreetingChatBot(chatterbot.ChatBot):\n    def __init__(self, *args, **kwargs):\n        super().__init__(*args, **kwargs)\n\n        self.greetings = [\n            "Hello! How 

In [84]:
def generate_text_from_prompt(user_prompt,
                             model,
                             tokenizer,
                             max_tokens = 1000,
                             temperature = 0.3,
                             top_p = 0.1,
                             echo = True,
                             stop = ["Q", "\n"]):



    messages = [
        {"role": "user", "content": user_prompt},
    ]

    tempalted_messages = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(tempalted_messages)


   # Define the parameters
    model_output = model(
       # user_prompt,
       tempalted_messages,
       max_tokens=max_tokens,
       temperature=temperature,
       top_p=top_p,
       echo=echo,
       stop=stop,
    )

    return model_output


my_prompt = "Generate a short biography about Ronald Dahl?. Please do not exceed 3 sentences in your response."

generate_text_from_prompt(my_prompt, model, tokenizer, stop=None)

<s>[INST] Generate a short biography about Ronald Dahl?. Please do not exceed 3 sentences in your response. [/INST]



llama_print_timings:        load time =    1837.83 ms
llama_print_timings:      sample time =      59.11 ms /   153 runs   (    0.39 ms per token,  2588.39 tokens per second)
llama_print_timings: prompt eval time =    1837.71 ms /    30 tokens (   61.26 ms per token,    16.32 tokens per second)
llama_print_timings:        eval time =   24162.72 ms /   152 runs   (  158.97 ms per token,     6.29 tokens per second)
llama_print_timings:       total time =   26362.08 ms /   182 tokens


{'id': 'cmpl-280c0a0d-57c2-4143-82b8-718eb2aff802',
 'object': 'text_completion',
 'created': 1713401769,
 'model': '/home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_0.gguf',
 'choices': [{'text': '<s>[INST] Generate a short biography about Ronald Dahl?. Please do not exceed 3 sentences in your response. [/INST] Ronald Dahl (September 13, 1916 – November 23, 1990) was a British novelist, short story writer, poet, and screenwriter whose works have been translated into over 60 languages. He is best known for his children\'s books, which include "James and the Giant Peach," "Charlie and the Chocolate Factory," "Matilda," and "The BFG." These fantastical stories, filled with quirky characters and rollicking adventure, have delighted readers of all ages for generations. Dahl\'s unique storytelling style and imaginative plots continue to inspire and captivate audiences around the world.',
   'index': 0,
   'logprobs': None,
   'finish_reason': 'sto

In [73]:
#Text Completion: quick teste of text completion / awnsering differeent genres of questions
model("(4+4453)*32=",stop=["."], max_tokens=1000)["choices"][0]['text']
model("the best LLM is",stop=["."], max_tokens=1000)["choices"][0]['text']


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1004.72 ms
llama_print_timings:      sample time =      12.28 ms /    35 runs   (    0.35 ms per token,  2849.70 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    5568.45 ms /    35 runs   (  159.10 ms per token,     6.29 tokens per second)
llama_print_timings:       total time =    5649.20 ms /    36 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    1004.72 ms
llama_print_timings:      sample time =       5.74 ms /    18 runs   (    0.32 ms per token,  3134.80 tokens per second)
llama_print_timings: prompt eval time =     356.78 ms /     5 tokens (   71.36 ms per token,    14.01 tokens per second)
llama_print_timings:        eval time =    2688.38 ms /    17 runs   (  158.14 ms per token,     6.32 tokens per second)
llama_print_timings:       to

' a subjective matter and depends on your career goals, academic interests, and personal circumstances'

In [107]:
model = Llama(

    model_path =  Path(local_dir,filename).as_posix() , n_ctx = 40  )
tokenized_text = model.tokenize(b'Explain the solor system', add_bos=True, special=False)
model.detokenize(tokenized_text)


for token in model.generate(tokenized_text, top_k=50 ,  top_p=.98 , temp=.9, frequency_penalty=.5, ):
    print(model.detokenize([token]))


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /home/zjc1002/Mounts/llms/TheBloke_Mistral-7B-Instruct-v0.2-GGUF/mistral-7b-instruct-v0.2.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32    

TypeError: 'list' object is not callable