In [None]:
from llama_cpp import Llama
import time
import src.constants as c

# Instantiate the quantized LFM2 model once; the cells below reuse `llm`.
llm = Llama(
    model_path="./models/LFM2-2.6B-Q4_0.gguf",
    # Context window and GPU offload are taken from the project constants.
    n_ctx=c.Hyperparameters.CONTEXT_SIZE,
    n_gpu_layers=c.Hyperparameters.GPU_LAYERS,
    n_threads=8,
    flash_attn=True,
    use_mlock=True,
    # Millisecond wall clock reduced modulo 2**31, so every load of this cell
    # starts from a different seed.
    seed=int(time.time() * 1000) % 2**31,
    verbose=False,
)

In [None]:
# Translation table that deletes straight and French quotation marks from model output.
CHAR_FILTER = str.maketrans('', '', '"«»')

# Turn markers of the chat template, passed as stop sequences (stop tokens for LFM2).
_STOP_SEQUENCES = ("<|im_end|>", "<|im_start|>")


def _format_chat_prompt(prompt, system_prompt=None):
    """Wrap the prompts in the <|im_start|>/<|im_end|> chat template.

    The system turn is emitted only when ``system_prompt`` is truthy; the
    result always ends with an open assistant turn for the model to complete.
    """
    system_turn = (
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n" if system_prompt else ""
    )
    return (
        f"{system_turn}"
        f"<|im_start|>user\n{prompt}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )


def _run_completion(prompt, system_prompt, stream):
    """Reset the model, then start one completion with the shared sampling settings."""
    llm.reset()
    return llm(
        prompt=_format_chat_prompt(prompt, system_prompt),
        max_tokens=c.Hyperparameters.MAX_TOKENS,
        temperature=c.Hyperparameters.TEMPERATURE,
        repeat_penalty=c.Hyperparameters.REPETITION_PENALTY,
        stream=stream,
        # Fresh list per call so the module-level tuple can never be mutated.
        stop=list(_STOP_SEQUENCES),
    )


def _choice_text(payload):
    """Return the text of the first choice in a completion payload, or ''."""
    # `or` fallbacks also cover an empty `choices` list and a None text,
    # which a bare `[0]` / `.get` chain would not.
    choices = payload.get("choices") or [{}]
    return choices[0].get("text") or ""


def _first_line(raw_text):
    """Clean raw model text down to its first line.

    Quotation marks are deleted first so that whitespace hidden behind them is
    stripped as well.

    Returns:
        (line, finished): ``line`` is the first non-blank line without
        surrounding whitespace; ``finished`` is True once a newline has been
        seen after that line's content, i.e. the line cannot grow any more.
    """
    cleaned = raw_text.translate(CHAR_FILTER).lstrip()
    line, newline, _ = cleaned.partition("\n")
    return line.rstrip(), bool(newline)


def generate_response_stream(prompt, system_prompt=None):
    """Stream the first line of the model's answer as it is generated.

    Args:
        prompt: User message.
        system_prompt: Optional system message; omitted from the template when falsy.

    Yields:
        The cleaned first line accumulated so far (quotes removed, no
        surrounding whitespace), once each time it changes.

    Leading blank chunks are skipped instead of ending the stream, and a
    newline anywhere inside a chunk ends the line, so the last value yielded
    matches what ``generate_response`` returns for the same raw text.
    """
    accumulated_text = ""
    last_yielded = None
    for output in _run_completion(prompt, system_prompt, stream=True):
        new_token = _choice_text(output)
        if not new_token:
            continue

        accumulated_text += new_token
        line, finished = _first_line(accumulated_text)

        # Skip chunks that only added quotes or whitespace: nothing new to show.
        if line and line != last_yielded:
            last_yielded = line
            yield line
        if finished:
            break


def generate_response(prompt, system_prompt=None):
    """Generate an answer in one shot and return its cleaned first line.

    Args:
        prompt: User message.
        system_prompt: Optional system message; omitted from the template when falsy.

    Returns:
        The first non-blank line of the completion with quotation marks
        removed and surrounding whitespace stripped ('' if there is none).
    """
    response = _run_completion(prompt, system_prompt, stream=False)
    line, _ = _first_line(_choice_text(response))
    return line

In [None]:
# Demo: generate one short RPG world description, first streamed, then in one shot.
system_prompt = (
    "Tu crées des mondes pour un RPG 2D. "
    "Chaque monde doit contenir un détail original vivant ou intrigant qui peut servir de point de départ pour des quêtes ou des histoires."
)
# NOTE(review): "et un ou élément ou événement" below reads like a typo for
# "et un élément ou événement"; left untouched here because editing the prompt
# changes what the model receives.
prompt = (
    "En une seule phrase très courte, décris un monde RPG 2D avec son ambiance, ses habitants, "
    "et un ou élément ou événement intéressant pour des quêtes."
)

# No explicit llm.reset() here: generate_response_stream resets the model itself.
for partial_response in generate_response_stream(prompt, system_prompt):
    # "\r" returns to the start of the line so each longer partial overwrites the
    # previous one; flush so the text shows up while it is generated rather than
    # only when a newline is finally printed.
    print(partial_response, end="\r", flush=True)

print("\n\n")

print(generate_response(prompt, system_prompt))