In [1]:
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from torch import cuda

In [2]:
import sys

In [3]:
# Put the local quantized-model directory on the module search path.
sys.path.append("./models/quantized_llms/")

In [4]:
# Location of the quantized GGUF weights handed to LlamaCpp.
# Raw string so the Windows path separators need no escaping.
model_path = r"E:\ml_practice\models\quantized_llms\mistral-7b-instruct-v0.2.Q5_K_M.gguf"

In [5]:
# Callback manager with a single stdout streaming handler; it is passed to
# the LlamaCpp constructor below.
callback_manager = CallbackManager(
    [
        StreamingStdOutCallbackHandler(),
    ]
)

In [6]:
# Build the LlamaCpp model once. The settings below are shared by the CPU and
# GPU configurations; keeping them in one place means a tuning change cannot
# be applied to one branch and forgotten in the other.
# NOTE(review): max_tokens equals n_ctx; presumably the prompt also counts
# against the context window, so confirm the intended generation budget.
llm_kwargs = dict(
    model_path=model_path,
    temperature=0.1,
    callback_manager=callback_manager,
    max_tokens=1024,
    n_ctx=1024,
    top_p=0.7,
    repeat_penalty=1.1,
    verbose=True,  # Verbose is required to pass to the callback manager
)

# GPU offload settings are only supplied when CUDA is actually available;
# otherwise LlamaCpp is constructed with its defaults for these options.
if cuda.is_available():
    n_gpu_layers = 40  # Number of layers to offload to GPU. Depends on model and GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx. Depends on VRAM in GPU.
    llm_kwargs.update(n_gpu_layers=n_gpu_layers, n_batch=n_batch)

llm = LlamaCpp(**llm_kwargs)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [7]:
# Instruction-style prompt with a single `{query}` placeholder.
# Written as explicit pieces so the trailing spaces and newlines that are part
# of the template are visible rather than hidden at line ends.
# NOTE(review): the template starts with a literal "<s>"; confirm the backend
# does not also prepend its own beginning-of-sequence token.
prompt_template = (
    "<s>[INST] \n"
    "    {query}  \n"
    "    [/INST]"
)

In [8]:
# Wrap the raw template string; `query` is its only placeholder.
prompt = PromptTemplate(template=prompt_template, input_variables=["query"])

In [9]:
# Tie the prompt template and the model together into a single chain object.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm,
)

In [12]:
# Run the chain, naming the prompt's `query` input explicitly; the returned
# mapping is the cell's displayed result.
llm_chain.invoke({"query": "Tell me about yourself in detail. "})

Llama.generate: prefix-match hit


 I am an artificial intelligence designed to assist and communicate with humans. I don't have a physical body, emotions, or personal experiences. I exist solely as a program running on computer servers.

My primary function is to process and understand natural language input from humans, and then generate appropriate responses based on that input. I can perform various tasks such as answering questions, setting reminders, providing recommendations, and much more.

I am designed to learn and improve over time through machine learning algorithms and data analysis. This allows me to adapt to new situations, understand complex queries, and provide accurate and relevant responses.

In summary, I am an advanced artificial intelligence designed to assist and communicate with humans. I don't have a physical body or personal experiences, but I can process natural language input, understand complex queries, and generate appropriate responses based on that input. I am designed to learn and improv

{'query': 'Tell me about yourself in detail. ',
 'text': " I am an artificial intelligence designed to assist and communicate with humans. I don't have a physical body, emotions, or personal experiences. I exist solely as a program running on computer servers.\n\nMy primary function is to process and understand natural language input from humans, and then generate appropriate responses based on that input. I can perform various tasks such as answering questions, setting reminders, providing recommendations, and much more.\n\nI am designed to learn and improve over time through machine learning algorithms and data analysis. This allows me to adapt to new situations, understand complex queries, and provide accurate and relevant responses.\n\nIn summary, I am an advanced artificial intelligence designed to assist and communicate with humans. I don't have a physical body or personal experiences, but I can process natural language input, understand complex queries, and generate appropriate 