In [None]:
from llama_cpp import Llama
import time

# Sampling settings shared by every generation call in this notebook
MAX_TOKENS = 100          # upper bound on tokens produced per response
TEMPERATURE = 0.8         # sampling temperature
REPETITION_PENALTY = 1.2  # forwarded to the model as repeat_penalty

# Instantiate the quantized GGUF model once; later cells reuse this `llm` object
llm = Llama(
    model_path="./model/LFM2-2.6B-Q4_0.gguf",
    n_ctx=128000,
    n_gpu_layers=20,
    flash_attn=True,
    # Millisecond wall-clock value folded into [0, 2**31), so the seed
    # differs each time this cell is executed
    seed=int(time.time() * 1000) % (1 << 31),
    verbose=False,
)

In [None]:
def generate_response(prompt, max_new_tokens=None, temperature=None, repetition_penalty=None):
    """
    Generate a complete response for a prompt in one non-streaming call.

    Args:
        prompt: Input text prompt. An "Answer:" cue is appended on a new
            line before the text is sent to the model.
        max_new_tokens: Maximum number of tokens to generate. Defaults to
            the module-level MAX_TOKENS when None.
        temperature: Sampling temperature (higher = more random). Defaults
            to the module-level TEMPERATURE when None.
        repetition_penalty: Penalty for repeating tokens. Defaults to the
            module-level REPETITION_PENALTY when None.

    Returns:
        str: The generated text with double quotation marks removed and
        surrounding whitespace stripped, or "" if the model returned no
        choices.
    """
    # Simplified prompt formatting for dialogue
    formatted_prompt = f"{prompt}\nAnswer:"

    # Reset the model state so earlier calls do not carry over
    llm.reset()

    # Resolve overrides at call time so edits to the config constants in the
    # notebook take effect without redefining this function
    response = llm(
        prompt=formatted_prompt,
        max_tokens=MAX_TOKENS if max_new_tokens is None else max_new_tokens,
        temperature=TEMPERATURE if temperature is None else temperature,
        repeat_penalty=(
            REPETITION_PENALTY if repetition_penalty is None else repetition_penalty
        ),
    )

    # A missing or empty "choices" list yields an empty string instead of
    # raising IndexError
    choices = response.get("choices") or [{}]
    generated_text = choices[0].get("text", "")

    # Remove quotation marks first, then strip: whitespace that was enclosed
    # by the quotes would otherwise be left at the edges
    return generated_text.replace('"', '').strip()

def generate_response_stream(prompt, max_new_tokens=None, temperature=None, repetition_penalty=None):
    """
    Generate a response from a prompt with streaming output.

    This is a generator: the model is not reset or invoked until the first
    item is requested.

    Args:
        prompt: Input text prompt. An "Answer:" cue is appended on a new
            line before the text is sent to the model.
        max_new_tokens: Maximum number of tokens to generate. Defaults to
            the module-level MAX_TOKENS when None.
        temperature: Sampling temperature (higher = more random). Defaults
            to the module-level TEMPERATURE when None.
        repetition_penalty: Penalty for repeating tokens. Defaults to the
            module-level REPETITION_PENALTY when None.

    Yields:
        str: The newly generated piece of text for each streamed chunk
        (not the accumulated text), with double quotation marks removed.
        A chunk that carries no choices or no text yields "".
    """
    # Simplified prompt formatting for dialogue
    formatted_prompt = f"{prompt}\nAnswer:"

    # Reset the model state so earlier calls do not carry over
    llm.reset()

    # Resolve overrides at call time so edits to the config constants in the
    # notebook take effect without redefining this function
    stream = llm(
        prompt=formatted_prompt,
        max_tokens=MAX_TOKENS if max_new_tokens is None else max_new_tokens,
        temperature=TEMPERATURE if temperature is None else temperature,
        repeat_penalty=(
            REPETITION_PENALTY if repetition_penalty is None else repetition_penalty
        ),
        stream=True,
    )

    # Relay each chunk's text as it arrives
    for output in stream:
        # A missing or empty "choices" list yields "" instead of raising
        # IndexError
        choices = output.get("choices") or [{}]
        new_token = choices[0].get("text", "")

        # Remove any quotation marks from the new token
        yield new_token.replace('"', '')

In [None]:
# Instruction handed to the model: a short, first-person NPC fetch request
prompt = "".join([
    "You are an NPC in an RPG. Ask the player in first person to retrieve an item. ",
    "Keep it very short and simple: state the item, where to find it, and why you need it. ",
    "Directly ask for their help. Avoid extra complications, hidden locations, or long explanations.",
])

# Demo: echo every streamed piece immediately, then finish the line
for token in generate_response_stream(prompt):
    print(token, flush=True, end="")
print()