In [None]:
# Import the required packages
import os
import sys
import s3fs
from llama_cpp import Llama
from langchain.callbacks.manager import CallbackManager 
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.globals import set_verbose
from langchain.llms import LlamaCpp
# Turn on LangChain's global verbose setting for the rest of the notebook.
set_verbose(True)


In [None]:
from langchain.callbacks.base import BaseCallbackHandler
class StreamDisplayHandler(BaseCallbackHandler):
    """Callback handler that pushes streamed LLM tokens into a display container.

    Every token is appended to a running text buffer, and the full buffer is
    handed to the container method whose name is given by ``display_method``.
    """

    def __init__(self, container, initial_text="", display_method='markdown'):
        """Remember the target container and set up the text buffers.

        Args:
            container: Object exposing the display method that is looked up
                by name each time a token arrives.
            initial_text: Starting content of the displayed text buffer.
            display_method: Name of the attribute on ``container`` that is
                called with the accumulated text.
        """
        self.container = container
        self.display_method = display_method
        self.text = initial_text
        self.new_sentence = ""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Append ``token`` to the buffers and re-render the accumulated text.

        Raises:
            ValueError: If ``container`` has no attribute named
                ``display_method`` (or that attribute is ``None``).
        """
        # Both buffers are extended before the renderer lookup, so they keep
        # the token even when the lookup below fails.
        self.text += token
        self.new_sentence += token

        renderer = getattr(self.container, self.display_method, None)
        if renderer is None:
            raise ValueError(f"Invalid display_method: {self.display_method}")
        renderer(self.text)

    def on_llm_end(self, response, **kwargs) -> None:
        """Empty the displayed text buffer; ``new_sentence`` is left as is."""
        self.text = ""

In [None]:
# Fetch the quantised Mistral model file from S3-compatible object storage.
# The endpoint host is read from the AWS_S3_ENDPOINT environment variable.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
bucket = 'vlapegue/mistral-7b-instruct-v0.2.Q4_K_M'
files = fs.ls(bucket)[-3:]
# NOTE(review): taking index 1 of the last three listed entries relies on the
# bucket's listing order — confirm it always resolves to the .gguf file.
mistral_remote_path = files[1]
mistral_local_path = 'mistral-7b-instruct-v0.2.Q4_K_M.gguf'
# Re-running the notebook should not transfer the model again: download only
# when the local copy is missing or its size differs from the remote object
# (which also covers an interrupted earlier download).
if (
    not os.path.exists(mistral_local_path)
    or os.path.getsize(mistral_local_path) != fs.size(mistral_remote_path)
):
    fs.download(mistral_remote_path, mistral_local_path)

In [None]:
# Panel supplies the chat widget built further down; pn.extension() initialises
# Panel for this notebook session before any component is created.
import panel as pn
pn.extension()

In [None]:
# Load the quantised Mistral model directly through the llama-cpp-python bindings.
#
# `llama_cpp.Llama` is not LangChain's `LlamaCpp` wrapper. Its constructor only
# takes model-loading and context options; `max_tokens`, `temperature`,
# `streaming` and `callback_manager` are options of the LangChain wrapper and
# had no effect on generation when passed here, so they have been removed
# (together with the callback manager that was built only to be passed in).
# Sampling options such as `max_tokens` and `temperature` must be given on each
# call instead, e.g. `llm(prompt, max_tokens=..., temperature=...)`.
# `f16_kv=True` is dropped as well: it matched the library default where the
# option existed and is not a constructor option in recent releases.
n_batch = 512
llm = Llama(
    model_path='./mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    n_gpu_layers=0,
    n_ctx=2048,
    n_batch=n_batch,
    n_threads=8,
    use_mlock=True,
    verbose=True,
)

In [None]:
def run_mistral(query):
    """Complete ``query`` with the module-level Mistral model ``llm``.

    Generation stops at the next "Q:" marker or at the first newline, and the
    prompt is echoed back as part of the completion.

    The sampling temperature is passed here because the ``llama_cpp.Llama``
    constructor does not take one: the value configured when ``llm`` was
    built never reached generation.

    Args:
        query: Prompt text forwarded unchanged to the model.

    Returns:
        Whatever ``llm`` returns for the call (the full completion object,
        not only the generated text).
    """
    return llm(
        query,
        stop=["Q:", "\n"],
        echo=True,
        temperature=0.1,
    )

In [None]:

# Build the Panel chat widget: each submitted message is handed to run_mistral,
# and replies are attributed to the user name "Mistral".
# NOTE(review): run_mistral returns the model's full completion object rather
# than only the generated text — confirm that is what the chat should show.
chat_interface = pn.chat.ChatInterface(
    callback=run_mistral, 
    callback_user="Mistral"
)



In [None]:
# Display the chat widget as this cell's output.
chat_interface

In [None]:
# Call the chat callback directly with a sample prompt to check it outside the widget.
run_mistral("Q: Who is the president of USA? A: ")

In [None]:
# One-off completion with the Mistral model, capped at 32 new tokens and stopped
# at the next "Q:" marker or newline; the prompt is echoed in the result.
# The temperature is passed on the call because the llama_cpp.Llama constructor
# does not take one, so the value given when `llm` was built was never applied.
output = llm(
    "Q: USA president in 1993 ? A:",
    max_tokens=32,
    temperature=0.1,
    stop=["Q:", "\n"],
    echo=True,
)


In [None]:
# Show the whole completion object, then only the text of its first choice.
print(output)
print(output['choices'][0]['text'])

In [None]:
# Llama 2: fetch the quantised chat model file from S3-compatible object storage.
# The endpoint host is read from the AWS_S3_ENDPOINT environment variable.
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})
bucket = 'vlapegue/llama-2-7b-chat.Q5_K_M'
files = fs.ls(bucket)[-3:]
# NOTE(review): taking index 1 of the last three listed entries relies on the
# bucket's listing order — confirm it always resolves to the .gguf file.
llama2_remote_path = files[1]
llama2_local_path = 'llama-2-7b-chat.Q5_K_M.gguf'
# Re-running the notebook should not transfer the model again: download only
# when the local copy is missing or its size differs from the remote object
# (which also covers an interrupted earlier download).
if (
    not os.path.exists(llama2_local_path)
    or os.path.getsize(llama2_local_path) != fs.size(llama2_remote_path)
):
    fs.download(llama2_remote_path, llama2_local_path)

In [None]:
# Build the Llama 2 chat model through LangChain's LlamaCpp wrapper, with a
# callback manager whose only handler is the stdout streaming handler.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
n_batch = 512
llm_llama2 = LlamaCpp(
    # Model file and runtime placement
    model_path='./llama-2-7b-chat.Q5_K_M.gguf',
    n_gpu_layers=0,
    n_threads=8,
    use_mlock=True,
    f16_kv=True,
    # Context window and batching
    n_ctx=2048,
    n_batch=n_batch,
    # Generation settings
    max_tokens=8000,
    temperature=0.1,
    # Streaming output; verbose is required to pass to the callback manager
    streaming=True,
    callback_manager=callback_manager,
    verbose=True,
)

In [None]:
# Ask the Llama 2 model the same question used for the Mistral model above,
# with the same 32-token cap and stop sequences.
# NOTE(review): `output` is rebound here; presumably the LangChain wrapper
# returns plain text rather than a llama_cpp completion object — confirm
# before indexing into it.
output = llm_llama2(
   "Q: USA president in 1993 ? A:",
   max_tokens=32,
   stop=["Q:", "\n"],
   echo=True
)

In [None]:
# Show the result most recently stored in `output` (the Llama 2 call above when run in order).
print(output)