In [None]:
from llama_cpp import Llama
import time
import src.constants as c

# Instantiate the quantised LFM2 model once; the later cells reuse this `llm` object.
llm = Llama(
    # Which weights to load
    model_path="./models/LFM2-2.6B-Q4_0.gguf",
    # Sizing knobs taken from the project constants
    n_ctx=c.Hyperparameters.CONTEXT_SIZE,
    n_gpu_layers=c.Hyperparameters.GPU_LAYERS,
    n_threads=8,
    # Runtime switches
    flash_attn=True,
    use_mlock=True,
    verbose=False,
    # Wall-clock milliseconds folded into [0, 2**31): the seed differs on every load
    seed=int(time.time() * 1000) % (1 << 31),
)

In [None]:
def generate_response_stream(prompt, system_prompt=None):
    """Stream a one-sentence completion from the module-level ``llm``.

    The prompt is wrapped in ``<|im_start|>`` / ``<|im_end|>`` turn markers
    (an optional system turn, the user turn, then an open assistant turn),
    the model state is reset, and the completion is requested in streaming
    mode with the sampling settings from ``c.Hyperparameters``.

    Args:
        prompt: Text placed in the user turn.
        system_prompt: Optional text placed in a leading system turn; the
            system turn is omitted when this is falsy.

    Yields:
        str: The text accumulated so far, once per streamed chunk, with the
        characters ``"``, ``«`` and ``»`` removed. As soon as a chunk
        contains ``.``, ``!`` or ``?`` the text is cut right after that
        punctuation (a contiguous run such as ``...`` or ``?!`` is kept
        whole), yielded stripped of surrounding whitespace, and the generator
        stops. If the stream ends without such punctuation, the last yielded
        value is the full accumulated text (not stripped).
    """
    sentence_enders = ".!?"
    # Built once instead of on every chunk: deletes straight and French quotes.
    quote_stripper = str.maketrans('', '', '"«»')

    if system_prompt:
        formatted_prompt = (
            f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
    else:
        formatted_prompt = (
            f"<|im_start|>user\n{prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )

    # Reset the model state
    llm.reset()

    # Generate response with streaming
    stream = llm(
        prompt=formatted_prompt,
        max_tokens=c.Hyperparameters.MAX_TOKENS,
        temperature=c.Hyperparameters.TEMPERATURE,
        repeat_penalty=c.Hyperparameters.REPETITION_PENALTY,
        stream=True,
        stop=["<|im_end|>", "<|im_start|>"]  # Stop tokens for LFM2
    )

    accumulated_text = ""

    for output in stream:
        # Tolerate a missing or empty "choices" list and a missing/None "text".
        choices = output.get("choices") or [{}]
        new_token = (choices[0].get("text") or "").translate(quote_stripper)

        # A chunk is not guaranteed to be a lone punctuation character
        # (e.g. " !", ".\n", "..."), so look for the first sentence
        # terminator anywhere inside it rather than comparing the whole chunk.
        end_index = min(
            (new_token.find(mark) for mark in sentence_enders if mark in new_token),
            default=-1,
        )
        if end_index >= 0:
            # Keep a contiguous run of terminators ("...", "?!") together.
            while (
                end_index + 1 < len(new_token)
                and new_token[end_index + 1] in sentence_enders
            ):
                end_index += 1
            # Drop whatever follows the sentence end and stop streaming.
            yield (accumulated_text + new_token[:end_index + 1]).strip()
            return

        accumulated_text += new_token
        yield accumulated_text

In [None]:
# Demo run: the system turn sets the NPC role, the user turn asks for a fetch quest.
system_prompt = (
    "Tu es un PNJ dans un RPG. "
    "Tu demandes de l'aide au joueur en une seule phrase."
)
prompt = (
    "Demande au joueur de récupérer un objet. "
    "Indique l'objet, où il se trouve, et pourquoi tu en as besoin."
)

for partial_response in generate_response_stream(prompt, system_prompt=system_prompt):
    # Ending with "\r" instead of a newline is meant to let each longer
    # partial text overwrite the previous one on the same output line.
    print(partial_response, end="\r")