In [None]:
from llama_cpp import Llama
import time

# --- Sampling settings shared by the generation helpers ---
MAX_TOKENS = 100           # passed as `max_tokens` on every completion call
TEMPERATURE = 0.8          # passed as `temperature`
REPETITION_PENALTY = 1.2   # passed as `repeat_penalty`

# --- Model loading settings (tune here rather than inside the Llama(...) call) ---
MODEL_PATH = "./models/LFM2-2.6B-Q4_0.gguf"  # relative to the notebook's working directory
N_GPU_LAYERS = 20
# NOTE(review): 128000 is far more context than the short prompt + MAX_TOKENS
# used in this notebook appear to need; presumably chosen to match the model's
# maximum. Consider lowering it if memory is tight -- confirm before changing.
N_CTX = 128000

# Load the model
llm = Llama(
    model_path=MODEL_PATH,
    n_gpu_layers=N_GPU_LAYERS,
    verbose=False,
    n_ctx=N_CTX,
    flash_attn=True,
    # Time-derived seed: differs on every (re)load, so generations are not
    # reproducible across kernel restarts. The modulo keeps it below 2**31.
    seed=int(time.time() * 1000) % (2**31),
)

In [None]:
def generate_response(prompt):
    """Generate a one-line reply to ``prompt`` using the module-level ``llm``.

    The prompt is suffixed with a ``Réponse:`` cue, the model state is reset,
    and a single (non-streaming) completion is requested with the notebook's
    sampling constants.

    Args:
        prompt: Instruction text sent to the model.

    Returns:
        The first line of the completion with all double quotes removed and
        surrounding whitespace stripped. Returns ``""`` when the response
        carries no choices or no text.
    """
    # Simplified prompt formatting for dialogue
    formatted_prompt = f"{prompt}\nRéponse:"

    # Reset the model state
    llm.reset()

    # Generate response
    response = llm(
        prompt=formatted_prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        repeat_penalty=REPETITION_PENALTY,
    )

    # `or [{}]` covers both a missing "choices" key and an empty list, so
    # indexing [0] cannot raise IndexError.
    choices = response.get("choices") or [{}]
    raw_text = choices[0].get("text", "")

    # Remove quotes BEFORE stripping: whitespace that sat just inside the
    # quotes (e.g. '" Bonjour "') would otherwise survive in the result.
    cleaned = raw_text.replace('"', '').strip()

    # Keep only the first line; `cleaned` has no leading whitespace here, so
    # that line is non-empty whenever any text was generated.
    return cleaned.split('\n', 1)[0].strip()

def generate_response_stream(prompt):
    """Stream a reply to ``prompt`` piece by piece using the module-level ``llm``.

    The prompt is suffixed with a ``Réponse :`` cue on its own line. Because
    this is a generator, the model reset and the completion request only
    happen once iteration starts.

    Args:
        prompt: Instruction text sent to the model.

    Yields:
        One string per chunk of the model stream, with double quotes removed.
        Pieces are individual fragments (not the accumulated text) and may be
        empty, e.g. when a chunk held only a quote or carried no text.
    """
    formatted_prompt = f"{prompt}\nRéponse :\n"

    # Reset the model state
    llm.reset()

    # Generate response with streaming
    stream = llm(
        prompt=formatted_prompt,
        max_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        repeat_penalty=REPETITION_PENALTY,
        stream=True,
    )

    for output in stream:
        # `or [{}]` covers both a missing "choices" key and an empty list, so
        # indexing [0] cannot raise IndexError mid-stream.
        choices = output.get("choices") or [{}]
        new_token = choices[0].get("text", "")

        # Remove any quotation marks from the new piece before handing it out
        yield new_token.replace('"', '')

In [None]:
# Instruction for the model (in French): act as an RPG NPC and ask the player,
# in one short first-person sentence, to fetch an item. The three sentences
# are joined with single spaces into one prompt string.
prompt = " ".join(
    (
        "Tu es un PNJ dans un RPG. Demande au joueur de récupérer un objet.",
        "Écris une phrase courte et fluide, à la première personne, indiquant l'objet, où le trouver et pourquoi tu en as besoin.",
        "Demande directement son aide. Une seule phrase suffit, pas de liste, pas de paragraphe.",
    )
)

# Example usage: print each streamed piece as soon as it arrives, without
# inserting newlines between pieces.
for token in generate_response_stream(prompt):
    print(token, end="", flush=True)

# Terminate the streamed line.
print()