# L5: Chat with any LLM! 💬

Load your HF API key and relevant Python libraries

In [1]:
import os
import io
import IPython.display
from PIL import Image
import base64 
import requests 
requests.adapters.DEFAULT_TIMEOUT = 60

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
hf_api_key = os.environ['HF_API_KEY']

In [2]:
"""
# Helper function
import requests, json
from text_generation import Client

#FalcomLM-instruct endpoint on the text_generation library
client = Client(os.environ['HF_API_FALCON_BASE'], headers={"Authorization": f"Basic {hf_api_key}"}, timeout=120)
"""

'\n# Helper function\nimport requests, json\nfrom text_generation import Client\n\n#FalcomLM-instruct endpoint on the text_generation library\nclient = Client(os.environ[\'HF_API_FALCON_BASE\'], headers={"Authorization": f"Basic {hf_api_key}"}, timeout=120)\n'

## Building an app to chat with any LLM

In [4]:
"""
prompt = "Has math been invented or discovered?"
client.generate(prompt, max_new_tokens=256).generated_text
"""

'\nprompt = "Has math been invented or discovered?"\nclient.generate(prompt, max_new_tokens=256).generated_text\n'

### Test local LLM

In [7]:
import os, requests
url = os.getenv("HF_API_FALCON_BASE") or os.getenv("HF_API_FALCOM_BASE")
payload = {"inputs": "Write a haiku about greyhounds.", "parameters": {"max_new_tokens": 64}}
r = requests.post(url, json=payload, timeout=60)
r.raise_for_status()
print(r.json())  # -> [{'generated_text': '...'}]


[{'generated_text': "Greyhounds' graceful flight\nA reminder to live life free\n\n- 5 syllables"}]


In [8]:
# one-time (uv) if needed:
# uv pip install "torch>=2.2" transformers accelerate

import os, torch
from types import SimpleNamespace
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# MPS-friendly defaults
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE  = torch.float32  # safest on Apple silicon

class LocalClient:
    def __init__(self, model_id: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id, dtype=DTYPE, trust_remote_code=True, low_cpu_mem_usage=True
        ).to(DEVICE).eval()

        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            # model already moved to DEVICE
        )

    def generate(self, prompt: str, max_new_tokens: int = 256, **kw):
        out = self.pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=kw.get("do_sample", True),
            temperature=kw.get("temperature", 0.7),
            top_p=kw.get("top_p", 0.9),
            repetition_penalty=kw.get("repetition_penalty", 1.0),
            no_repeat_ngram_size=kw.get("no_repeat_ngram_size", 0),
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            return_full_text=False,  # returns just the completion
        )
        text = out[0].get("generated_text", "")
        return SimpleNamespace(generated_text=text)

# ---- usage (same call shape you had) ----
client = LocalClient()  # or LocalClient("tiiuae/falcon-rw-1b") (slower/less stable on MPS)
prompt = "Has math been invented or discovered?"
print(client.generate(prompt, max_new_tokens=256).generated_text)


Device set to use mps:0



The Greek philosopher and mathematician Pythagoras is said to have invented the concept of the Pythagorean theorem.
What is the difference between the Pythagorean theorem and the Euclid's algorithm?
The Pythagorean theorem is a mathematical formula that relates the lengths of two sides of a right-angled triangle to the hypotenuse (the longest side). Euclid's algorithm, on the other hand, is a method for finding the length of a side of a right-angled triangle given the length of its hypotenuse.
Can you explain the role of the Pythagorean theorem in mathematics?


Here we'll be using an [Inference Endpoint](https://huggingface.co/inference-endpoints) for `falcon-40b-instruct` , the best ranking open source LLM on the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard). 

In [9]:
"""
#Back to Lesson 2, time flies!
import gradio as gr
def generate(input, slider):
    output = client.generate(input, max_new_tokens=slider).generated_text
    return output

demo = gr.Interface(fn=generate, 
                    inputs=[gr.Textbox(label="Prompt"), 
                            gr.Slider(label="Max new tokens", 
                                      value=20,  
                                      maximum=1024, 
                                      minimum=1)], 
                    outputs=[gr.Textbox(label="Completion")])

gr.close_all()
demo.launch(share=True, server_port=int(os.environ['PORT1']))
"""

'\n#Back to Lesson 2, time flies!\nimport gradio as gr\ndef generate(input, slider):\n    output = client.generate(input, max_new_tokens=slider).generated_text\n    return output\n\ndemo = gr.Interface(fn=generate, \n                    inputs=[gr.Textbox(label="Prompt"), \n                            gr.Slider(label="Max new tokens", \n                                      value=20,  \n                                      maximum=1024, \n                                      minimum=1)], \n                    outputs=[gr.Textbox(label="Completion")])\n\ngr.close_all()\ndemo.launch(share=True, server_port=int(os.environ[\'PORT1\']))\n'

In [10]:
# one-time (in your venv with uv):
# uv pip install "torch>=2.2" transformers accelerate gradio

import os, torch, gradio as gr
from types import SimpleNamespace
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# MPS-friendly defaults (Apple Silicon)
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE  = torch.float32  # safest on M-series

# Choose a small, fast local model (change via env LOCAL_LLM_MODEL if you want)
MODEL_ID = os.getenv("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

class LocalClient:
    def __init__(self, model_id: str = MODEL_ID):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id, dtype=DTYPE, trust_remote_code=True, low_cpu_mem_usage=True
        ).to(DEVICE).eval()

        # text-generation pipeline (runs on the already-moved model)
        self.pipe = pipeline("text-generation", model=self.model, tokenizer=self.tokenizer)

    def _format_prompt(self, prompt: str) -> str:
        # Use chat template when available (for -Chat / -Instruct models)
        if hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None):
            msgs = [{"role": "user", "content": prompt}]
            return self.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        return prompt

    def generate(self, prompt: str, max_new_tokens: int = 256, **kw):
        prompt_fmt = self._format_prompt(prompt)
        out = self.pipe(
            prompt_fmt,
            max_new_tokens=int(max_new_tokens),
            do_sample=kw.get("do_sample", True),
            temperature=kw.get("temperature", 0.7),
            top_p=kw.get("top_p", 0.9),
            repetition_penalty=kw.get("repetition_penalty", 1.0),
            no_repeat_ngram_size=kw.get("no_repeat_ngram_size", 0),
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            return_full_text=False,  # returns just the completion if supported
        )
        text = out[0].get("generated_text", "")
        # Fallback trim if an older transformers ignores return_full_text
        if not text:
            full = out[0]["generated_text"]
            text = full[len(prompt_fmt):].strip() if full.startswith(prompt_fmt) else full
        return SimpleNamespace(generated_text=text)

# ---- Local client instance ----
client = LocalClient()

# ---- Your Gradio app (unchanged UI) ----
def generate(input_text, slider):
    output = client.generate(input_text, max_new_tokens=int(slider)).generated_text
    return output

demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.Slider(label="Max new tokens", value=20, maximum=1024, minimum=1),
    ],
    outputs=[gr.Textbox(label="Completion")],
    title="Local LLM (TinyLlama by default)"
)

gr.close_all()
# Let Gradio auto-pick a free port unless PORT1 is set
demo.launch(share=False, server_port=int(os.environ.get('PORT1', '7860')))


Device set to use mps:0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




## `gr.Chatbot()`

- `gr.Chatbot()` allows you to save the chat history (between the user and the LLM) as well as display the dialogue in the app.
- Define your `fn` to take in a `gr.Chatbot()` object.  
  - Within your defined `fn` function, append a tuple (or a list) containing the user message and the LLM's response:
`chatbot_object.append( (user_message, llm_message) )`

- Include the chatbot object in both the inputs and the outputs of the app.

In [11]:
"""
import random

def respond(message, chat_history):
        #No LLM here, just respond with a random pre-made message
        bot_message = random.choice(["Tell me more about it", 
                                     "Cool, but I'm not interested", 
                                     "Hmmmm, ok then"]) 
        chat_history.append((message, bot_message))
        return "", chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240) #just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot]) #Press enter to submit

gr.close_all()
demo.launch(share=True, server_port=int(os.environ['PORT2']))
"""

'\nimport random\n\ndef respond(message, chat_history):\n        #No LLM here, just respond with a random pre-made message\n        bot_message = random.choice(["Tell me more about it", \n                                     "Cool, but I\'m not interested", \n                                     "Hmmmm, ok then"]) \n        chat_history.append((message, bot_message))\n        return "", chat_history\n\nwith gr.Blocks() as demo:\n    chatbot = gr.Chatbot(height=240) #just to fit the notebook\n    msg = gr.Textbox(label="Prompt")\n    btn = gr.Button("Submit")\n    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")\n\n    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])\n    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot]) #Press enter to submit\n\ngr.close_all()\ndemo.launch(share=True, server_port=int(os.environ[\'PORT2\']))\n'

In [12]:
# one-time (in your venv with uv):
# uv pip install "torch>=2.2" transformers accelerate gradio

import os, torch, gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# --- Apple Silicon friendly defaults ---
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE  = torch.float32  # safest on M-series for local LLMs

# Pick a small, fast model (override via LOCAL_LLM_MODEL env if you want)
MODEL_ID = os.getenv("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# --- Load once, reuse (fast) ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype=DTYPE, trust_remote_code=True, low_cpu_mem_usage=True
).to(DEVICE).eval()

gen = pipeline("text-generation", model=model, tokenizer=tokenizer)  # model already on DEVICE

SYSTEM_PROMPT = "You are a concise, helpful assistant."

def _format_with_chat_template(history, user_text: str) -> str:
    """
    Convert Gradio history [(user, bot), ...] + new user message
    into a chat-formatted prompt using the tokenizer's chat template when available.
    Only keep the last few turns to stay fast.
    """
    # Keep last 6 turns for speed
    trimmed = history[-6:] if history else []
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for u, b in trimmed:
        if u:
            messages.append({"role": "user", "content": u})
        if b:
            messages.append({"role": "assistant", "content": b})
    messages.append({"role": "user", "content": user_text})

    if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Fallback: simple concatenation if model has no chat template
    joined = "\n".join(
        [f"System: {SYSTEM_PROMPT}"] +
        [f"User: {u}\nAssistant: {b}" for u, b in trimmed] +
        [f"User: {user_text}\nAssistant:"]
    )
    return joined

def respond(message, chat_history):
    if not message or not str(message).strip():
        return "", chat_history
    prompt = _format_with_chat_template(chat_history, str(message).strip())

    # Fast, sensible defaults
    out = gen(
        prompt,
        max_new_tokens=128,          # speed knob
        min_new_tokens=8,            # avoid empty replies
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False,      # just the completion if supported
    )
    reply = out[0].get("generated_text", "").strip()
    if not reply:
        # Fallback trim in case older transformers ignores return_full_text
        full = out[0]["generated_text"]
        reply = full[len(prompt):].strip() if full.startswith(prompt) else full.strip()

    chat_history.append((message, reply))
    return "", chat_history

# ----------------- UI -----------------
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])  # Enter to submit

gr.close_all()
# Run locally; let Gradio auto-pick a free port unless PORT2 is set
demo.launch(share=False, server_port=int(os.environ.get("PORT2", "7860")))


Device set to use mps:0
  chatbot = gr.Chatbot(height=240)


Closing server running on port: 7860
* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.




#### Format the prompt with the chat history

- You can iterate through the chatbot object with a for loop.
- Each item is a tuple containing the user message and the LLM's message.

```Python
for turn in chat_history:
    user_msg, bot_msg = turn
    ...
```

In [13]:
"""
def format_chat_prompt(message, chat_history):
    prompt = ""
    for turn in chat_history:
        user_message, bot_message = turn
        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt

def respond(message, chat_history):
        formatted_prompt = format_chat_prompt(message, chat_history)
        bot_message = client.generate(formatted_prompt,
                                     max_new_tokens=1024,
                                     stop_sequences=["\nUser:", "<|endoftext|>"]).generated_text
        chat_history.append((message, bot_message))
        return "", chat_history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240) #just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot]) #Press enter to submit

gr.close_all()
demo.launch(share=True, server_port=int(os.environ['PORT3']))
"""

'\ndef format_chat_prompt(message, chat_history):\n    prompt = ""\n    for turn in chat_history:\n        user_message, bot_message = turn\n        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"\n    prompt = f"{prompt}\nUser: {message}\nAssistant:"\n    return prompt\n\ndef respond(message, chat_history):\n        formatted_prompt = format_chat_prompt(message, chat_history)\n        bot_message = client.generate(formatted_prompt,\n                                     max_new_tokens=1024,\n                                     stop_sequences=["\nUser:", "<|endoftext|>"]).generated_text\n        chat_history.append((message, bot_message))\n        return "", chat_history\n\nwith gr.Blocks() as demo:\n    chatbot = gr.Chatbot(height=240) #just to fit the notebook\n    msg = gr.Textbox(label="Prompt")\n    btn = gr.Button("Submit")\n    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")\n\n    btn.click(respond, inputs=[msg, chatbot], outputs=[m

In [14]:
# one-time (in your venv with uv):
# uv pip install "torch>=2.2" transformers accelerate gradio

import os, torch, gradio as gr
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, pipeline,
    StoppingCriteria, StoppingCriteriaList
)

# --- Apple-silicon friendly defaults ---
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE  = torch.float32  # safest on M-series

# Small, fast local model (override via LOCAL_LLM_MODEL if you want)
MODEL_ID = os.getenv("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# --- Load once (fast after first call) ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype=DTYPE, trust_remote_code=True, low_cpu_mem_usage=True
).to(DEVICE).eval()

gen = pipeline("text-generation", model=model, tokenizer=tokenizer)  # model already on DEVICE

# --- Helpers ---
SYSTEM_PROMPT = "You are a concise, helpful assistant."
MAX_TURNS = 6                     # keep context short for speed
MAX_NEW_TOKENS = 128              # fast but useful completions

def _format_with_chat_template(history, user_text: str) -> str:
    """Prefer the tokenizer's chat template; fallback to your 'User/Assistant' format."""
    trimmed = history[-MAX_TURNS:] if history else []
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for u, b in trimmed:
        if u: messages.append({"role": "user", "content": u})
        if b: messages.append({"role": "assistant", "content": b})
    messages.append({"role": "user", "content": user_text})

    if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Fallback to your string format
    prompt = ""
    for u, b in trimmed:
        prompt += f"\nUser: {u}\nAssistant: {b}"
    prompt += f"\nUser: {user_text}\nAssistant:"
    return prompt.strip()

class StopOnSequences(StoppingCriteria):
    """Stop when any stop string appears in the generated text."""
    def __init__(self, tokenizer, stop_strings, start_len):
        self.tokenizer = tokenizer
        self.stop_strings = stop_strings or []
        self.start_len = start_len

    def __call__(self, input_ids, scores, **kwargs):
        # Decode only the newly generated part
        gen_part = self.tokenizer.decode(input_ids[0][self.start_len:], skip_special_tokens=False)
        return any(s in gen_part for s in self.stop_strings)

def format_chat_prompt(message, chat_history):
    # Keep your function name/signature but route to our formatter
    return _format_with_chat_template(chat_history, message)

def respond(message, chat_history):
    if not message or not str(message).strip():
        return "", chat_history

    formatted_prompt = format_chat_prompt(message, chat_history)
    stop_sequences = ["\nUser:", "<|endoftext|>"]  # your originals

    # Compute prompt token length for the stopping criterion
    enc = tokenizer(formatted_prompt, return_tensors="pt", add_special_tokens=False)
    start_len = enc.input_ids.shape[1]

    stopping = StoppingCriteriaList([StopOnSequences(tokenizer, stop_sequences, start_len)])

    # Fast, sane defaults
    out = gen(
        formatted_prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        min_new_tokens=8,           # avoid empty replies
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False,
        stopping_criteria=stopping, # honor stop strings
    )

    bot_message = out[0].get("generated_text", "").strip()
    if not bot_message:
        # Fallback trim if pipeline ignored return_full_text
        full = out[0]["generated_text"]
        bot_message = full[len(formatted_prompt):].strip() if full.startswith(formatted_prompt) else full.strip()

    # Hard-trim trailing stop strings if present
    for s in stop_sequences:
        if bot_message.endswith(s):
            bot_message = bot_message[: -len(s)].rstrip()

    chat_history.append((message, bot_message))
    return "", chat_history

# ----------------- UI -----------------
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

gr.close_all()
# Let Gradio auto-pick a free port unless PORT3 is set
demo.launch(share=False, server_port=int(os.environ.get('PORT3', '7860')))


Device set to use mps:0
  chatbot = gr.Chatbot(height=240)


Closing server running on port: 7861
* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




### Adding other advanced features

In [15]:
"""
def format_chat_prompt(message, chat_history, instruction):
    prompt = f"System:{instruction}"
    for turn in chat_history:
        user_message, bot_message = turn
        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt
"""

'\ndef format_chat_prompt(message, chat_history, instruction):\n    prompt = f"System:{instruction}"\n    for turn in chat_history:\n        user_message, bot_message = turn\n        prompt = f"{prompt}\nUser: {user_message}\nAssistant: {bot_message}"\n    prompt = f"{prompt}\nUser: {message}\nAssistant:"\n    return prompt\n'

### Streaming

- If your LLM can provide its tokens one at a time in a stream, you can accumulate those tokens in the chatbot object.
- The `for` loop in the following function goes through all the tokens that are in the stream and appends them to the most recent conversational turn in the chatbot's message history.

In [17]:
"""
def respond(message, chat_history, instruction, temperature=0.7):
    prompt = format_chat_prompt(message, chat_history, instruction)
    chat_history = chat_history + [[message, ""]]
    stream = client.generate_stream(prompt,
                                      max_new_tokens=1024,
                                      stop_sequences=["\nUser:", "<|endoftext|>"],
                                      temperature=temperature)
                                      #stop_sequences to not generate the user answer
    acc_text = ""
    #Streaming the tokens
    for idx, response in enumerate(stream):
            text_token = response.token.text

            if response.details:
                return

            if idx == 0 and text_token.startswith(" "):
                text_token = text_token[1:]

            acc_text += text_token
            last_turn = list(chat_history.pop(-1))
            last_turn[-1] += acc_text
            chat_history = chat_history + [last_turn]
            yield "", chat_history
            acc_text = ""
"""

'\ndef respond(message, chat_history, instruction, temperature=0.7):\n    prompt = format_chat_prompt(message, chat_history, instruction)\n    chat_history = chat_history + [[message, ""]]\n    stream = client.generate_stream(prompt,\n                                      max_new_tokens=1024,\n                                      stop_sequences=["\nUser:", "<|endoftext|>"],\n                                      temperature=temperature)\n                                      #stop_sequences to not generate the user answer\n    acc_text = ""\n    #Streaming the tokens\n    for idx, response in enumerate(stream):\n            text_token = response.token.text\n\n            if response.details:\n                return\n\n            if idx == 0 and text_token.startswith(" "):\n                text_token = text_token[1:]\n\n            acc_text += text_token\n            last_turn = list(chat_history.pop(-1))\n            last_turn[-1] += acc_text\n            chat_history = chat_history 

In [16]:
"""
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240) #just to fit the notebook
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options",open=False):
        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    btn.click(respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot]) #Press enter to submit

gr.close_all()
demo.queue().launch(share=True, server_port=int(os.environ['PORT4']))    
"""

'\nwith gr.Blocks() as demo:\n    chatbot = gr.Chatbot(height=240) #just to fit the notebook\n    msg = gr.Textbox(label="Prompt")\n    with gr.Accordion(label="Advanced options",open=False):\n        system = gr.Textbox(label="System message", lines=2, value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.")\n        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)\n    btn = gr.Button("Submit")\n    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")\n\n    btn.click(respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot])\n    msg.submit(respond, inputs=[msg, chatbot, system], outputs=[msg, chatbot]) #Press enter to submit\n\ngr.close_all()\ndemo.queue().launch(share=True, server_port=int(os.environ[\'PORT4\']))    \n'

Notice, in the cell above, you have used `demo.queue().launch()` instead of `demo.launch()`. "queue" helps you to boost up the performance for your demo. You can read [setting up a demo for maximum performance](https://www.gradio.app/guides/setting-up-a-demo-for-maximum-performance) for more details.

In [20]:
# one-time (in your venv with uv):
# uv pip install "torch>=2.2" transformers accelerate gradio

import os, torch, gradio as gr, threading
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList

# -------- Apple-silicon friendly defaults --------
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
DTYPE  = torch.float32  # safest on M-series

# Small, fast local model (override via env: LOCAL_LLM_MODEL)
MODEL_ID = os.getenv("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# -------- Load model/tokenizer once (fast on subsequent runs) --------
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype=DTYPE, trust_remote_code=True, low_cpu_mem_usage=True
).to(DEVICE).eval()

# -------- Prompt formatting --------
SYSTEM_DEFAULT = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
MAX_TURNS = 6   # keep context short for speed
MAX_NEW_TOKENS = 256  # fast but useful (you can change)

def _format_with_template(message, chat_history, instruction):
    """Prefer tokenizer's chat template; fallback to your string format."""
    trimmed = chat_history[-MAX_TURNS:] if chat_history else []
    if hasattr(tokenizer, "apply_chat_template") and getattr(tokenizer, "chat_template", None):
        msgs = [{"role": "system", "content": instruction or SYSTEM_DEFAULT}]
        for u, b in trimmed:
            if u: msgs.append({"role": "user", "content": u})
            if b: msgs.append({"role": "assistant", "content": b})
        msgs.append({"role": "user", "content": message})
        return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    # Fallback to your original format
    prompt = f"System:{instruction or SYSTEM_DEFAULT}"
    for u, b in trimmed:
        prompt = f"{prompt}\nUser: {u}\nAssistant: {b}"
    prompt = f"{prompt}\nUser: {message}\nAssistant:"
    return prompt

class StopOnSequences(StoppingCriteria):
    """Stop when any stop string appears in the newly generated text."""
    def __init__(self, tokenizer, stop_strings, start_len):
        self.tok = tokenizer
        self.stop_strings = stop_strings or []
        self.start_len = start_len
    def __call__(self, input_ids, scores, **kwargs):
        gen_part = self.tok.decode(input_ids[0][self.start_len:], skip_special_tokens=False)
        return any(s in gen_part for s in self.stop_strings)

# -------- STREAMING RESPONDER (LOCAL) --------
def respond(message, chat_history, instruction, temperature=0.7):
    if not message or not str(message).strip():
        yield "", chat_history
        return

    prompt = _format_with_template(message, chat_history, instruction)
    # Add an empty assistant turn we will fill as we stream
    chat_history = chat_history + [[message, ""]]

    # Encode once to compute where generation starts (for stopping criterion)
    enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    start_len = enc.input_ids.shape[1]

    stop_sequences = ["\nUser:", "<|endoftext|>"]
    stopping = StoppingCriteriaList([StopOnSequences(tokenizer, stop_sequences, start_len)])

    # Text streamer yields text chunks as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

    gen_kwargs = dict(
        input_ids=enc.input_ids.to(DEVICE),
        attention_mask=enc.attention_mask.to(DEVICE),
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=float(temperature),
        top_p=0.9,
        repetition_penalty=1.05,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        stopping_criteria=stopping,
        streamer=streamer,
    )

    # Run generation in a background thread so we can iterate the streamer
    def _worker():
        with torch.inference_mode():
            model.generate(**gen_kwargs)

    thread = threading.Thread(target=_worker, daemon=True)
    thread.start()

    acc = ""
    first_chunk = True
    for chunk in streamer:
        if first_chunk and chunk.startswith(" "):
            # trim leading space on first token for nicer UX
            chunk = chunk[1:]
            first_chunk = False
        acc += chunk

        # update the last assistant message
        last = list(chat_history.pop(-1))
        last[-1] += acc
        chat_history = chat_history + [last]
        yield "", chat_history
        acc = ""  # we yielded everything accumulated so far

    # ensure the background thread is done
    thread.join(timeout=0.1)

# -------------------- UI --------------------
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=240)
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(
            label="System message", lines=2,
            value=SYSTEM_DEFAULT
        )
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    clear = gr.ClearButton(components=[msg, chatbot], value="Clear console")

    # Include temperature slider in inputs (so it actually affects generation)
    btn.click(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])
    msg.submit(respond, inputs=[msg, chatbot, system, temperature], outputs=[msg, chatbot])  # Enter to submit

gr.close_all()
# Queue is required for streaming generators
demo.queue()
# Let Gradio pick a free port unless PORT4 is set
demo.launch(share=False, server_port=int(os.environ.get('PORT4', '7860')))


  chatbot = gr.Chatbot(height=240)


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.




In [21]:
gr.close_all()

Closing server running on port: 7863
