
Commit 3e75069

[DOC] Add reasoning capability to vLLM streamlit code (#19557)
1 parent ee35e96 commit 3e75069

File tree

1 file changed: +165 −43 lines changed

examples/online_serving/streamlit_openai_chatbot_webserver.py

Lines changed: 165 additions & 43 deletions
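The heart of the change is vLLM's reasoning stream: when the chat template is asked to think, the server returns thinking tokens in a separate `reasoning_content` delta field alongside the usual `content`. Below is a minimal sketch of that request pattern outside Streamlit; the base URL, API key, and model name are placeholder assumptions, not part of this diff.

from openai import OpenAI

# Placeholder endpoint/key for a local vLLM OpenAI-compatible server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",  # placeholder; any reasoning-capable model
    messages=[{"role": "user", "content": "What is 17 * 23?"}],
    stream=True,
    # vLLM-specific: ask the chat template to emit a thinking section
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},
)

for chunk in stream:
    delta = chunk.choices[0].delta
    # Thinking tokens arrive in `reasoning_content`, answer tokens in `content`
    if getattr(delta, "reasoning_content", None):
        print(delta.reasoning_content, end="", flush=True)
    elif getattr(delta, "content", None):
        print(delta.content, end="", flush=True)

This split between the two delta fields is exactly what the diff's `get_llm_response` exploits to render the thinking process above the answer.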
@@ -11,6 +11,7 @@
 - Streaming response display
 - Configurable API endpoint
 - Real-time chat history
+- Reasoning Display: Optional thinking process visualization
 
 Requirements:
     pip install streamlit openai
@@ -51,13 +52,33 @@
 if "active_session" not in st.session_state:
     st.session_state.active_session = None
 
+# Add new session state for reasoning
+if "show_reasoning" not in st.session_state:
+    st.session_state.show_reasoning = {}
+
 # Initialize session state for API base URL
 if "api_base_url" not in st.session_state:
     st.session_state.api_base_url = openai_api_base
 
 
 def create_new_chat_session():
-    """Create a new chat session with timestamp as ID"""
+    """Create a new chat session with timestamp as unique identifier.
+
+    This function initializes a new chat session by:
+    1. Generating a timestamp-based session ID
+    2. Creating an empty message list for the new session
+    3. Setting the new session as both current and active session
+    4. Resetting the messages list for the new session
+
+    Returns:
+        None
+
+    Session State Updates:
+        - sessions: Adds new empty message list with timestamp key
+        - current_session: Sets to new session ID
+        - active_session: Sets to new session ID
+        - messages: Resets to empty list
+    """
     session_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     st.session_state.sessions[session_id] = []
     st.session_state.current_session = session_id
@@ -66,30 +87,98 @@ def create_new_chat_session():
 
 
 def switch_to_chat_session(session_id):
-    """Switch to a different chat session"""
+    """Switch the active chat context to a different session.
+
+    Args:
+        session_id (str): The timestamp ID of the session to switch to
+
+    This function handles chat session switching by:
+    1. Setting the specified session as current
+    2. Updating the active session marker
+    3. Loading the messages history from the specified session
+
+    Session State Updates:
+        - current_session: Updated to specified session_id
+        - active_session: Updated to specified session_id
+        - messages: Loaded from sessions[session_id]
+    """
     st.session_state.current_session = session_id
     st.session_state.active_session = session_id
     st.session_state.messages = st.session_state.sessions[session_id]
 
 
-def get_llm_response(messages, model):
-    """Get streaming response from llm
+def get_llm_response(messages, model, reason, content_ph=None, reasoning_ph=None):
+    """Generate and stream LLM response with optional reasoning process.
 
     Args:
-        messages: List of message dictionaries
-        model: Name of model
+        messages (list): List of conversation message dicts with 'role' and 'content'
+        model (str): The model identifier to use for generation
+        reason (bool): Whether to enable and display reasoning process
+        content_ph (streamlit.empty): Placeholder for streaming response content
+        reasoning_ph (streamlit.empty): Placeholder for streaming reasoning process
 
     Returns:
-        Streaming response object or error message string
+        tuple: (str, str)
+            - First string contains the complete response text
+            - Second string contains the complete reasoning text (if enabled)
+
+    Features:
+        - Streams both reasoning and response text in real-time
+        - Handles model API errors gracefully
+        - Supports live updating of thinking process
+        - Maintains separate content and reasoning displays
+
+    Raises:
+        Exception: Wrapped in error message if API call fails
+
+    Note:
+        The function uses streamlit placeholders for live updates.
+        When reason=True, the reasoning process appears above the response.
     """
+    full_text = ""
+    think_text = ""
+    live_think = None
+    # Build request parameters
+    params = {"model": model, "messages": messages, "stream": True}
+    if reason:
+        params["extra_body"] = {"chat_template_kwargs": {"enable_thinking": True}}
+
     try:
-        response = client.chat.completions.create(
-            model=model, messages=messages, stream=True
-        )
-        return response
+        response = client.chat.completions.create(**params)
+        if isinstance(response, str):
+            if content_ph:
+                content_ph.markdown(response)
+            return response, ""
+
+        # Prepare reasoning expander above content
+        if reason and reasoning_ph:
+            exp = reasoning_ph.expander("💭 Thinking Process (live)", expanded=True)
+            live_think = exp.empty()
+
+        # Stream chunks
+        for chunk in response:
+            delta = chunk.choices[0].delta
+            # Stream reasoning first
+            if reason and hasattr(delta, "reasoning_content") and live_think:
+                rc = delta.reasoning_content
+                if rc:
+                    think_text += rc
+                    live_think.markdown(think_text + "▌")
+            # Then stream content
+            if hasattr(delta, "content") and delta.content and content_ph:
+                full_text += delta.content
+                content_ph.markdown(full_text + "▌")
+
+        # Finalize displays: reasoning remains above, content below
+        if reason and live_think:
+            live_think.markdown(think_text)
+        if content_ph:
+            content_ph.markdown(full_text)
+
+        return full_text, think_text
     except Exception as e:
         st.error(f"Error details: {str(e)}")
-        return f"Error: {str(e)}"
+        return f"Error: {str(e)}", ""
 
 
 # Sidebar - API Settings first
@@ -108,6 +197,7 @@ def get_llm_response(messages, model):
 if st.sidebar.button("New Session"):
     create_new_chat_session()
 
+
 # Display all sessions in reverse chronological order
 for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
     # Mark the active session with a pinned button
@@ -143,47 +233,79 @@ def get_llm_response(messages, model):
     create_new_chat_session()
     st.session_state.active_session = st.session_state.current_session
 
-# Display chat history for current session
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.write(message["content"])
+# Update the chat history display section
+for idx, msg in enumerate(st.session_state.messages):
+    # Render user messages normally
+    if msg["role"] == "user":
+        with st.chat_message("user"):
+            st.write(msg["content"])
+    # Render assistant messages with reasoning above
+    else:
+        # If reasoning exists for this assistant message, show it above the content
+        if idx in st.session_state.show_reasoning:
+            with st.expander("💭 Thinking Process", expanded=False):
+                st.markdown(st.session_state.show_reasoning[idx])
+        with st.chat_message("assistant"):
+            st.write(msg["content"])
+
+
+# Setup & Cache reasoning support check
+@st.cache_data(show_spinner=False)
+def server_supports_reasoning():
+    """Check if the current model supports reasoning capability.
+
+    Returns:
+        bool: True if the model supports reasoning, False otherwise
+    """
+    resp = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": "Hi"}],
+        stream=False,
+    )
+    return hasattr(resp.choices[0].message, "reasoning_content") and bool(
+        resp.choices[0].message.reasoning_content
+    )
 
-# Handle user input and generate llm response
+
+# Check support
+supports_reasoning = server_supports_reasoning()
+
+# Add reasoning toggle in sidebar if supported
+reason = False  # Default to False
+if supports_reasoning:
+    reason = st.sidebar.checkbox("Enable Reasoning", value=False)
+else:
+    st.sidebar.markdown(
+        "<span style='color:gray;'>Reasoning unavailable for this model.</span>",
+        unsafe_allow_html=True,
+    )
+    # reason remains False
+
+# Update the input handling section
 if prompt := st.chat_input("Type your message here..."):
-    # Save user message to session
+    # Save and display user message
     st.session_state.messages.append({"role": "user", "content": prompt})
     st.session_state.sessions[st.session_state.current_session] = (
         st.session_state.messages
     )
-
-    # Display user message
     with st.chat_message("user"):
         st.write(prompt)
 
-    # Prepare messages for llm
-    messages_for_llm = [
+    # Prepare LLM messages
+    msgs = [
        {"role": m["role"], "content": m["content"]} for m in st.session_state.messages
     ]
 
-    # Generate and display llm response
+    # Stream assistant response
     with st.chat_message("assistant"):
-        message_placeholder = st.empty()
-        full_response = ""
-
-        # Get streaming response from llm
-        response = get_llm_response(messages_for_llm, model)
-        if isinstance(response, str):
-            message_placeholder.markdown(response)
-            full_response = response
-        else:
-            for chunk in response:
-                if hasattr(chunk.choices[0].delta, "content"):
-                    content = chunk.choices[0].delta.content
-                    if content:
-                        full_response += content
-                        message_placeholder.markdown(full_response + "▌")
-
-        message_placeholder.markdown(full_response)
-
-    # Save llm response to session history
-    st.session_state.messages.append({"role": "assistant", "content": full_response})
+        # Placeholders: reasoning above, content below
+        reason_ph = st.empty()
+        content_ph = st.empty()
+        full, think = get_llm_response(msgs, model, reason, content_ph, reason_ph)
+        # Determine index for this new assistant message
+        message_index = len(st.session_state.messages)
+        # Save assistant reply
+        st.session_state.messages.append({"role": "assistant", "content": full})
+        # Persist reasoning in session state if any
+        if reason and think:
+            st.session_state.show_reasoning[message_index] = think
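For reference, the cached `server_supports_reasoning()` check added above reduces to a single cheap probe request. A standalone sketch follows; the base URL and API key are assumptions, and picking the first served model via `models.list()` mirrors how the example app selects `model`.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = client.models.list().data[0].id  # first model served by vLLM

resp = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Hi"}],
    stream=False,
)
msg = resp.choices[0].message
# A non-empty reasoning_content means the sidebar toggle will be shown
print(bool(getattr(msg, "reasoning_content", None)))

Caching the probe with `@st.cache_data` matters here: Streamlit reruns the whole script on every interaction, so without the cache each user action would cost an extra round-trip to the server.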
