In [None]:
from llama_cpp import Llama
import time

# Instantiate the shared model handle used by the functions in this cell.
# NOTE(review): n_ctx=128000 requests a very large context window -- confirm
# the model file actually supports it and that memory allows it.
llm = Llama(
    # Which model file to load.
    model_path="./model/LFM2-2.6B-Q4_0.gguf",
    # Runtime / resource settings.
    n_ctx=128000,
    n_gpu_layers=20,
    flash_attn=True,
    # Seed from the wall clock in milliseconds, reduced modulo 2**31, so
    # each load gets a different seed.
    seed=int(time.time() * 1000) % (1 << 31),
    verbose=False,
)

def generate_text(prompt: str, max_tokens: int = 100) -> str:
    """Generate a completion for ``prompt`` using the module-level ``llm``.

    Args:
        prompt: Text the model should continue.
        max_tokens: Forwarded unchanged as the ``max_tokens`` argument of
            the completion call.

    Returns:
        The ``"text"`` field of the first choice in the response, or ``""``
        when the response carries no choices or the choice has no text.
    """
    # Reset the model before every request, so each call starts the same way.
    llm.reset()
    response = llm(
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.3,
        min_p=0.15,
        repeat_penalty=1.05,
    )
    # A ``.get`` default only covers a *missing* key; an empty (or None)
    # ``choices`` value would make ``[0]`` raise, so fall back explicitly.
    choices = response.get("choices") or [{}]
    # Coerce a None text to "" so the declared ``str`` return type holds.
    return choices[0].get("text", "") or ""

# Demo: continue a classic story opener and print the completion.
text = generate_text(prompt="Once upon a time")
print(text)

In [None]:
from llama_cpp import Llama
import time

# Instantiate the shared model handle used by the streaming function below.
# NOTE(review): n_ctx=128000 requests a very large context window -- confirm
# the model file actually supports it and that memory allows it.
llm = Llama(
    # Which model file to load.
    model_path="./model/LFM2-2.6B-Q4_0.gguf",
    # Runtime / resource settings.
    n_ctx=128000,
    n_gpu_layers=20,
    flash_attn=True,
    # Seed from the wall clock in milliseconds, reduced modulo 2**31, so
    # each load gets a different seed.
    seed=int(time.time() * 1000) % (1 << 31),
    verbose=False,
)

def generate_text_streaming(prompt: str, max_tokens: int = 100):
    """Stream a completion for ``prompt`` using the module-level ``llm``.

    This is a generator function: nothing runs (including the model reset)
    until the caller starts iterating.

    Args:
        prompt: Text the model should continue.
        max_tokens: Forwarded unchanged as the ``max_tokens`` argument of
            the completion call.

    Yields:
        For every item produced by the stream, the ``"text"`` field of its
        first choice, or ``""`` when the item carries no choices or no text.
    """
    # Reset the model before every request, so each call starts the same way.
    llm.reset()

    stream = llm(
        prompt=prompt,
        max_tokens=max_tokens,
        stream=True,
        temperature=0.3,
        min_p=0.15,
        repeat_penalty=1.05,
    )

    for chunk in stream:
        # A ``.get`` default only covers a *missing* key; an empty (or None)
        # ``choices`` value would make ``[0]`` raise mid-stream, so fall
        # back explicitly and keep the stream going.
        choices = chunk.get("choices") or [{}]
        yield choices[0].get("text", "") or ""

# Demo: echo each streamed piece immediately so the text appears as it is
# generated, then finish the line.
print("Generating text: ", end="", flush=True)
for token in generate_text_streaming(prompt="Once upon a time"):
    print(token, end="", flush=True)
print("")