In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Cache of already-loaded (tokenizer, model, device) triples keyed by model
# name, so repeated calls do not reload the weights each time.
_MODEL_CACHE = {}


def _load_model_and_tokenizer(model_name: str):
    """Return a cached ``(tokenizer, model, device)`` triple for *model_name*.

    On the first request for a given name the tokenizer and model are loaded
    with ``from_pretrained`` and the model is moved to CUDA when available,
    otherwise to the CPU. Later requests reuse the loaded objects.
    """
    cached = _MODEL_CACHE.get(model_name)
    if cached is not None:
        return cached

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    loaded = (tokenizer, model, device)
    _MODEL_CACHE[model_name] = loaded
    return loaded


def generate_response(
    prompt: str,
    model_name: str = "LiquidAI/LFM2-350M",
    max_new_tokens: int = 150,
    temperature: float = 0.7,
    repetition_penalty: float = 1.2,
) -> str:
    """Generate a short answer to *prompt* with a causal language model.

    The prompt is wrapped in a fixed instruction template, tokenized, passed
    to ``model.generate`` and only the newly generated tokens are decoded.

    Args:
        prompt: The question to answer.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model. Loaded objects are cached per name.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to greedy (deterministic) decoding instead of sampling.
        repetition_penalty: Repetition penalty forwarded to ``generate``.

    Returns:
        The decoded continuation with special tokens removed and surrounding
        whitespace stripped.
    """
    tokenizer, model, device = _load_model_and_tokenizer(model_name)

    if device.type == "cpu":
        print("Warning: Running on CPU may be slow.")

    # Instruction-style prompt
    formatted_prompt = f"Please answer the following question concisely in one paragraph:\n{prompt}\nAnswer:"

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = temperature
    else:
        # Sampling needs a strictly positive temperature; treat 0 (or less)
        # as a request for deterministic greedy decoding.
        generation_kwargs["do_sample"] = False

    output_ids = model.generate(**inputs, **generation_kwargs)

    # Only decode new tokens (skip prompt)
    prompt_length = inputs.input_ids.shape[-1]
    generated_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()

In [None]:
# Try the helper on a sample question using its default model and settings.
generate_response(prompt="What is Brittany in France?")