In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# One checkpoint id feeds both loaders so the tokenizer and weights always match.
CHECKPOINT = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Request float16 weights and automatic device placement (device_map="auto").
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [9]:
def stream_chat(message, max_tokens=300, temperature=0.2, *, llm=None, tok=None):
    """Generate a chat reply token by token, printing it live as it is produced.

    Args:
        message: User message to send to the model.
        max_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature. Values <= 0 select greedy decoding
            (argmax) instead of dividing the logits by zero.
        llm: Causal language model to query. Defaults to the notebook-level
            ``model``.
        tok: Tokenizer matching ``llm``. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The decoded reply with special tokens removed.
    """
    llm = model if llm is None else llm
    tok = tokenizer if tok is None else tok

    # Let the tokenizer's own chat template format the turn, so the role
    # markers and end-of-turn token match what the checkpoint expects
    # (a hand-written "<|endoftext|>" is not this tokenizer's EOS token).
    prompt = tok.apply_chat_template(
        [{"role": "user", "content": message}],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Some templates already emit the BOS token; only let the tokenizer add
    # special tokens when the rendered prompt does not start with it.
    bos = getattr(tok, "bos_token", None)
    already_has_bos = bool(bos) and prompt.startswith(bos)
    inputs = tok(
        prompt, return_tensors="pt", add_special_tokens=not already_has_bos
    ).to(llm.device)
    input_ids = inputs["input_ids"]

    print(f"User: {message}")
    print("Assistant: ", end="", flush=True)

    generated_tokens = []
    printed_text = ""
    step_input = input_ids  # full prompt on the first step, one token afterwards
    past_key_values = None

    with torch.no_grad():
        for _ in range(max_tokens):
            # Reuse the attention cache so each step only processes the new
            # token instead of re-running the whole sequence.
            outputs = llm(step_input, past_key_values=past_key_values, use_cache=True)
            past_key_values = outputs.past_key_values

            # Sample in float32; half-precision softmax loses accuracy.
            logits = outputs.logits[:, -1, :].float()

            if temperature > 0:
                probs = torch.softmax(logits / temperature, dim=-1)
                next_token = torch.multinomial(probs, 1)
            else:
                next_token = torch.argmax(logits, dim=-1, keepdim=True)

            token_id = next_token[0].item()
            if token_id == tok.eos_token_id:
                break
            generated_tokens.append(token_id)

            # Decode everything generated so far so spacing comes out right,
            # then print only the part that has not been shown yet.
            current_text = tok.decode(generated_tokens, skip_special_tokens=True)

            # A trailing U+FFFD means the last token is only part of a
            # multi-byte character; wait for the rest before printing.
            if not current_text.endswith("\ufffd"):
                print(current_text[len(printed_text):], end="", flush=True)
                printed_text = current_text

            step_input = next_token

    final_text = tok.decode(generated_tokens, skip_special_tokens=True)
    # Flush anything that was held back, then end the line.
    print(final_text[len(printed_text):])
    return final_text

# Smoke test: stream one reply for a sample question and keep the returned text.
sample_question = "Tell me a tiny short poem about a cat and a dog that become friends?"
response = stream_chat(sample_question)

User: Tell me a tiny short poem about a cat and a dog that become friends?
Assistant: Here's a tiny short poem about a cat and a dog that become friends:

Once upon a time, in a small town,
Lived a cat and a dog, who were best friends.
Their names were Luna and Max,
And they had a love that was true.

Luna was a little shy,
But Max was bold and brave.
They would play fetch,
And chase each other around.

One day, Luna got lost,
And Max was worried,
But Max was quick to help,
And he found her, safe and sound.

Luna was grateful,
For Max's quick thinking,
And they became best friends,
For life was a little sweeter.

Together, they went on walks,
And played in the park,
They snuggled up on the couch,
And watched TV with a smile.

Luna and Max were inseparable,
For they had found their true home,
And they knew that they would always be,
Best friends, forever and ever.
