In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def generate_response(prompt, model_name="LiquidAI/LFM2-350M", max_new_tokens=150, temperature=0.7, repetition_penalty=1.2):
    """Sample an answer to ``prompt`` from a causal LM and stream it to stdout.

    The prompt is wrapped in a fixed instruction template, tokenized, and handed
    to a single ``model.generate`` call. Text is printed while it is produced by
    a ``TextStreamer``, which decodes the accumulated tokens instead of each
    token in isolation, so characters split across several tokens and the
    spacing between words are rendered correctly.

    Args:
        prompt: The question to insert into the instruction template.
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_new_tokens: Upper bound on the number of generated tokens. Values
            ``<= 0`` generate nothing and only print a newline.
        temperature: Sampling temperature forwarded to ``model.generate``.
        repetition_penalty: Repetition penalty forwarded to ``model.generate``.

    Returns:
        None. The generated text is only printed, followed by a newline.

    Note:
        The tokenizer and model are loaded on every call; hoist the loading out
        of this function if it is called repeatedly.
    """
    # Function-scope import so this cell does not depend on edits to the import cell.
    from transformers import TextStreamer

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    formatted_prompt = f"Please answer the following question concisely in one paragraph:\n{prompt}\nAnswer:"
    encoded = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    # Nothing to generate: keep the historical behaviour of emitting just a newline.
    if max_new_tokens <= 0:
        print()
        return

    # skip_prompt hides the echoed input; skip_special_tokens hides EOS and similar markers.
    # The streamer terminates its output with a newline, so no trailing print() is needed.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # One generate() call lets the model reuse its cache across steps instead of
    # re-processing the whole sequence for every new token.
    model.generate(
        input_ids=encoded["input_ids"],
        # Explicit mask is required for reliable results when pad_token_id == eos_token_id.
        attention_mask=encoded.get("attention_mask"),
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

In [None]:
# Demo: ask the default model a sample question; the answer is printed to stdout.
generate_response(prompt="What is Brittany in France?")