# wordslab-notebooks-lib.chat

> Chat with local and remote LLMs in the context of the wordslab-notebooks environment 

In [1]:
#| default_exp chat

In [97]:
#| export
from abc import ABC, abstractmethod
from collections.abc import Sequence as SequenceType
from typing import Callable, Optional, Union, Sequence, Mapping, Any, Literal
from datetime import datetime
import inspect, json, re, time, traceback

from toolslm.funccall import get_schema, call_func
from IPython.display import display, Markdown, clear_output

from ollama import Client
from ollama._types import Options
from openai import OpenAI

from wordslab_notebooks_lib.env import WordslabEnv

## Observable conversation turns

In [81]:
#| export
class ChatTurn:
    """Accumulate the streamed parts of one model turn and render them as markdown.

    A turn collects thinking chunks, content chunks and tool calls. After every
    mutation the optional ``refresh_display`` callback is invoked so that an
    observer can re-render the conversation.

    NOTE: tool calls are stored in a dict keyed by tool name, so two calls to the
    same tool inside one turn share a single entry (the later one overwrites the
    earlier one).
    """

    def __init__(self, refresh_display: Optional[Callable[[], None]] = None):
        """Create an empty turn.

        Args:
            refresh_display: optional zero-argument callback invoked after each update.
        """
        self.thinking_chunks = []
        self.content_chunks = []
        self.tool_calls = {}
        self.refresh_display = refresh_display

    def _notify(self):
        """Invoke the refresh callback, if one was provided."""
        if self.refresh_display:
            self.refresh_display()

    def append_thinking(self, chunk: str):
        """Record a new chunk of thinking text."""
        self.thinking_chunks.append(chunk)
        self._notify()

    @property
    def thinking(self):
        """All thinking chunks received so far, concatenated."""
        return "".join(self.thinking_chunks)

    def append_content(self, chunk: str):
        """Record a new chunk of answer text."""
        self.content_chunks.append(chunk)
        self._notify()

    @property
    def content(self):
        """All content chunks received so far, concatenated."""
        return "".join(self.content_chunks)

    def append_tool_call(self, tool_name: str, params: dict):
        """Register that the model requested `tool_name` with `params`."""
        self.tool_calls[tool_name] = {"params": params}
        self._notify()

    def start_tool_call(self, tool_name: str):
        """Timestamp the start of a previously registered tool call.

        Raises:
            KeyError: if `tool_name` was not registered with `append_tool_call`.
        """
        self.tool_calls[tool_name]["start_time"] = time.time()
        self._notify()

    def end_tool_call(self, tool_name: str, result: object):
        """Timestamp the end of a tool call and store `str(result)`.

        Raises:
            KeyError: if `tool_name` was not registered with `append_tool_call`.
        """
        tool_call = self.tool_calls[tool_name]
        tool_call["end_time"] = time.time()
        tool_call["result"] = str(result)
        self._notify()

    @staticmethod
    def _shorten(text: str, limit: int) -> str:
        """Return `text` unchanged if it fits in `limit` chars, else cut it and end with '...'."""
        return text if len(text) <= limit else text[:limit - 3] + "..."

    def _tool_call_details(self, tool_name: str, tool_call: dict) -> str:
        """Render every known stage of one tool call as a quoted markdown list."""
        lines = ["> [Tool call]\n"]
        if "params" in tool_call:
            lines.append(f"> - model wants to call `{tool_name}` with parameters `{tool_call['params']}`\n")
        if "start_time" in tool_call:
            started = datetime.fromtimestamp(tool_call["start_time"]).strftime("%H:%M:%S")
            lines.append(f"> - agent called {tool_name} at {started}\n")
        if "end_time" in tool_call:
            shown = self._shorten(tool_call["result"], 100)
            if "start_time" in tool_call:
                elapsed = tool_call["end_time"] - tool_call["start_time"]
                lines.append(f"> - {tool_name} returned `{shown}` in {elapsed:.3f} sec\n")
            else:
                # No start timestamp was recorded: report the result without a duration
                lines.append(f"> - {tool_name} returned `{shown}`\n")
        lines.append("\n")
        return "".join(lines)

    def _tool_call_summary(self, tool_name: str, tool_call: dict) -> str:
        """Render only the most advanced stage of one tool call on a single line."""
        if "end_time" in tool_call:
            shown = self._shorten(tool_call["result"], 50)
            return f"> [Tool call] ... `{tool_name}` returned `{shown}`\n\n"
        if "start_time" in tool_call:
            return f"> [Tool call] ... agent is calling `{tool_name}`\n\n"
        if "params" in tool_call:
            return f"> [Tool call] ... model wants to call `{tool_name}`\n\n"
        return ""

    def to_markdown(self, hide_thinking: bool = True, hide_tool_calls: bool = True):
        """Render the turn as markdown: thinking, then content, then tool calls.

        Args:
            hide_thinking: if True, replace the thinking text by a one-line summary
                with an approximate word count (spaces and newlines are counted).
            hide_tool_calls: if True, show one summary line per tool call instead of
                the detailed list of stages.

        Returns:
            The markdown text; an empty string when nothing was recorded yet.
        """
        parts = []
        if self.thinking_chunks:
            if hide_thinking:
                approx_words = sum(s.count(" ") + s.count("\n") for s in self.thinking_chunks)
                parts.append(f"> [Thinking] ... thought in {approx_words} words\n\n")
            else:
                quoted = "\n".join(f"> {line}" for line in self.thinking.splitlines())
                # The trailing blank line closes the quote so the content below
                # is not glued to the last quoted line
                parts.append("> [Thinking]\n\n" + quoted + "\n\n")
        if self.content_chunks:
            parts.append(self.content + "\n\n")
        render = self._tool_call_summary if hide_tool_calls else self._tool_call_details
        for tool_name, tool_call in self.tool_calls.items():
            parts.append(render(tool_name, tool_call))
        return "".join(parts)

class ChatTurns:
    """Ordered list of conversation turns that re-renders itself in the notebook.

    Every turn created with `new_turn` gets `refresh_notebook_display` as its
    refresh callback, so each streamed update redraws the whole conversation.
    """

    def __init__(self, notebook_display: bool = True, hide_thinking: bool = True, hide_tool_calls: bool = True):
        """Create an empty conversation.

        Args:
            notebook_display: if False, `refresh_notebook_display` does nothing.
            hide_thinking: passed to `to_markdown` of each turn when rendering.
            hide_tool_calls: passed to `to_markdown` of each turn when rendering.
        """
        self.chat_turns = []
        self.notebook_display = notebook_display
        self.hide_thinking = hide_thinking
        self.hide_tool_calls = hide_tool_calls

    def new_turn(self):
        """Create a turn wired to this object's refresh method, append it and return it."""
        turn = ChatTurn(self.refresh_notebook_display)
        self.chat_turns.append(turn)
        return turn

    def refresh_notebook_display(self):
        """Replace the current cell output with the markdown of all turns."""
        # Honour the constructor flag: previously it was stored but never checked
        if not self.notebook_display:
            return
        clear_output(wait=True)
        rendered = "".join(
            turn.to_markdown(self.hide_thinking, self.hide_tool_calls)
            for turn in self.chat_turns
        )
        display(Markdown(rendered))

In [71]:
# Demo: simulate a three-turn conversation to watch the notebook display refresh.
# The short sleeps only make the successive refreshes visible.
turns = ChatTurns()

# Turn 1: thinking, some content, then one complete tool call (request, start, end)
turn = turns.new_turn()
turn.append_thinking("I think a lot longer.\nIn sentences.\n\nWith line breaks.")
time.sleep(0.5)
turn.append_content("I need to call 2 tools.")
time.sleep(0.5)
turn.append_tool_call("myfunc", {"param1": "value1", "param2": "value2"})
turn.start_tool_call("myfunc")
time.sleep(0.5)
turn.end_tool_call("myfunc", 17.43)
# Keep a reference to the first turn: the following cells render it on its own
turn1 = turn

# Turn 2: thinking followed by a tool call whose result is long enough to be truncated
turn = turns.new_turn()
turn.append_thinking("Ok, I got the first result, now call the second tool.")
time.sleep(0.5)
turn.append_tool_call("myfunc2", {})
turn.start_tool_call("myfunc2")
time.sleep(0.5)
turn.end_tool_call("myfunc2", "The weather is nice today but clouds an wind are coming for tommorow and the rest of the week will be awful")

# Turn 3: thinking and the final answer, no tool call
turn = turns.new_turn()
turn.append_thinking("Ok, I got the second result, now I can answer the question.")
time.sleep(0.5)
turn.append_content("This is the incredible result.")

> [Thinking] ... thought in 10 words

I need to call 2 tools.

> [Tool call] ... `myfunc` returned `17.43`

> [Thinking] ... thought in 10 words

> [Tool call] ... `myfunc2` returned `The weather is nice today but clouds an wind ar...`

> [Thinking] ... thought in 11 words

This is the incredible result.



In [5]:
turn1.thinking, turn1.content

('I think a lot longer.\nIn sentences.\n\nWith line breaks.',
 'I need to call 2 tools.')

In [6]:
Markdown(turn1.to_markdown())

> [Thinking] ... thought in 10 words

I need to call 2 tools.

> [Tool call] ... `myfunc` returned `17.43`



In [7]:
Markdown(turn1.to_markdown(hide_thinking=False, hide_tool_calls=False))

> [Thinking]
>
> I think a lot longer.
In sentences.
>
> With line breaks.

I need to call 2 tools.

> [Tool call]
> - model wants to call `myfunc` with parameters `{'param1': 'value1', 'param2': 'value2'}`
> - agent called myfunc at 15:58:43
> - myfunc returned `17.43` in 0.501 sec



## Native tool calling

Use python functions as tools callable by Large Language Models.

The python functions must be fully documented:
- type annotations are mandatory on all parameters and on the return type
- a docstring after the function definition is mandatory, it should explain the return value
- a descriptive comment after each parameter is also mandatory
- the expected format is: one parameter per line, with a regular Python comment at the end of the line

In [8]:
def add(a: int,  # The first number
        b: int   # The second number
       ) -> int: # The sum of the two numbers
    """Add two numbers"""
    total = a + b
    return total


def multiply(a: int,  # The first number 
             b: int   # The second number
            ) -> int: # The product of the two numbers
    """Multiply two numbers"""
    product = a * b
    return product

**Tool description format for ollama API**

Here is the code used to process the tools parameter:

```python
for unprocessed_tool in tools or []:
    yield convert_function_to_tool(unprocessed_tool) if callable(unprocessed_tool) else Tool.model_validate(unprocessed_tool)
```

So we can pass either a list of Python functions or a list of dictionaries conforming to a specific tool schema.

Here are the expectations for the python functions documentation:

```python
def convert_function_to_tool(func: Callable) -> Tool:
 
  -> def _parse_docstring(doc_string: Union[str, None]) -> dict[str, str]:
  ...
  for line in doc_string.splitlines():
    ...
    if lowered_line.startswith('args:'):
      key = 'args'
    elif lowered_line.startswith(('returns:', 'yields:', 'raises:')):
      key = '_'
  ...
  for line in parsed_docstring['args'].splitlines():
    ...
    if ':' in line:
      # Split the line on either:
      # 1. A parenthetical expression like (integer) - captured in group 1
      # 2. A colon :
      # Followed by optional whitespace. Only split on first occurrence.
      ...
```

This is much less robust and readable than what `toolslm.funccall.get_schema` does, so we will preprocess the list of python functions ourselves.

Now let's see what tool description schema is expected by ollama.

pydantic Tool.model_validate() accepts:
- dict
- Pydantic model instances
- Objects with attributes (ORM-style, if configured)

Here is the ollama schema:

```python
class Tool(SubscriptableBaseModel):
  type: Optional[str] = 'function'

  class Function(SubscriptableBaseModel):
    name: Optional[str] = None
    description: Optional[str] = None

    class Parameters(SubscriptableBaseModel):
      model_config = ConfigDict(populate_by_name=True)
      type: Optional[Literal['object']] = 'object'
      defs: Optional[Any] = Field(None, alias='$defs')
      items: Optional[Any] = None
      required: Optional[Sequence[str]] = None
```

This is the same schema as the one expected by the OpenAI Chat Completions API, as we will see below.

**Tool description formats for the openai API**

The legacy openai completions API: `client.chat.completions.create(...)` expects tools to be described in a json format that uses the “wrapped function” schema:

```
tool = {
  type: "function",
  function: {
    name,
    description,
    parameters
  }
}
```

This is the canonical format for Chat Completions and is what OpenAI examples historically used.

This format does not work for the new API: `client.responses.create(...)`

The Responses API uses a flattened tool schema.

```
{
  "type": "function",
  "name": "...",
  "description": "..",
  "parameters": {
    "type": "object",
    "properties": {
      "a": {"type": "integer"},
      "b": {"type": "integer"}
    },
    "required": ["a", "b"]
  }
}
```

If you pass your wrapped version (function: {...}) to responses.create, you’ll get a schema validation error.

- Chat Completions treats tools as message-level actions → nested function
- Responses API treats tools as first-class model capabilities → flattened schema
- The Responses API also supports non-function tools (web search, file search, computer use), which drove the redesign

If you want maximum forward compatibility:
- Use the flattened format
- Even when working with Chat Completions, it’s easy to convert

In [9]:
#| export
def _get_function_schema(func: Callable, responsesAPIFormat: bool = False):
    """Build the tool description of a python function whose parameters are all commented.

    With `responsesAPIFormat=True` the schema fields sit next to `type`
    (flattened layout); otherwise they are nested under a `function` key
    (wrapped layout).
    """
    described = get_schema(func, pname='parameters')
    if not responsesAPIFormat:
        return {'type': 'function', 'function': described}
    return {'type': 'function', **described}


def get_tools_schemas_and_functions(funcs: Sequence[Callable], responsesAPIFormat: bool = False):
    """Map each function name to a `(json schema, function)` pair usable for native tool calling."""
    registry = {}
    for func in funcs:
        registry[func.__name__] = (_get_function_schema(func, responsesAPIFormat), func)
    return registry

In [10]:
get_tools_schemas_and_functions([add, multiply])

{'add': ({'type': 'function',
   'function': {'name': 'add',
    'description': 'Add two numbers\n\nReturns:\n- type: integer',
    'parameters': {'type': 'object',
     'properties': {'a': {'type': 'integer',
       'description': 'The first number'},
      'b': {'type': 'integer', 'description': 'The second number'}},
     'required': ['a', 'b']}}},
  <function __main__.add(a: int, b: int) -> int>),
 'multiply': ({'type': 'function',
   'function': {'name': 'multiply',
    'description': 'Multiply two numbers\n\nReturns:\n- type: integer',
    'parameters': {'type': 'object',
     'properties': {'a': {'type': 'integer',
       'description': 'The first number'},
      'b': {'type': 'integer', 'description': 'The second number'}},
     'required': ['a', 'b']}}},
  <function __main__.multiply(a: int, b: int) -> int>)}

In [11]:
get_tools_schemas_and_functions([add, multiply], responsesAPIFormat=True)

{'add': ({'type': 'function',
   'name': 'add',
   'description': 'Add two numbers\n\nReturns:\n- type: integer',
   'parameters': {'type': 'object',
    'properties': {'a': {'type': 'integer', 'description': 'The first number'},
     'b': {'type': 'integer', 'description': 'The second number'}},
    'required': ['a', 'b']}},
  <function __main__.add(a: int, b: int) -> int>),
 'multiply': ({'type': 'function',
   'name': 'multiply',
   'description': 'Multiply two numbers\n\nReturns:\n- type: integer',
   'parameters': {'type': 'object',
    'properties': {'a': {'type': 'integer', 'description': 'The first number'},
     'b': {'type': 'integer', 'description': 'The second number'}},
    'required': ['a', 'b']}},
  <function __main__.multiply(a: int, b: int) -> int>)}

In [12]:
#| export
class ToolExecutionError(Exception):
    """Raised when a tool cannot be executed safely."""

class Tools:
    """Execute tools implemented as python functions with Large Language Models.
    The python functions must be fully documented:
    - type annotations are mandatory on all parameters and on the return type
    - a docstring after the function definition is mandatory
    - a descriptive comment after each parameter and the return type is also mandatory
    - the expected format is: one parameter per line, a traditional python comment at the end of the line
    """
    def __init__(self, python_functions:Sequence[Callable], responsesAPIFormat:bool=False):
        """Build the registry mapping each function name to its `(schema, function)` pair."""
        self.schemas_and_functions = get_tools_schemas_and_functions(python_functions, responsesAPIFormat=responsesAPIFormat)

    def has_tool(self, tool_name:str):
        """Return True if a tool with this name is registered."""
        return tool_name in self.schemas_and_functions

    def get_schemas(self):
        """Return the json schemas of all registered tools, in registration order."""
        return [schema for schema, _ in self.schemas_and_functions.values()]

    def get_schema(self, tool_name:str):
        """Return the json schema of one tool; raises KeyError if it is not registered."""
        return self.schemas_and_functions[tool_name][0]

    def get_functions(self):
        """Return the python functions of all registered tools, in registration order."""
        return [func for _, func in self.schemas_and_functions.values()]

    def get_function(self, tool_name:str):
        """Return the python function of one tool; raises KeyError if it is not registered."""
        return self.schemas_and_functions[tool_name][1]

    def call(self, tool_name:str, tool_arguments_dict:Mapping[str,Any]):
        """Execute a registered tool through `call_func` and return its result.

        Raises:
            ToolExecutionError: if the tool is not registered, or if its execution
                raised. The original exception is chained as `__cause__` so the
                underlying traceback is not lost.
        """
        # 1. Resolve the tool safely
        try:
            self.get_function(tool_name)
        except (KeyError, TypeError) as e:
            raise ToolExecutionError(f"Tool '{tool_name}' does not exist or could not be resolved.") from e

        # 2. Execute the tool with runtime protection
        try:
            return call_func(tool_name, tool_arguments_dict, self.get_functions(), raise_on_err=True)
        except Exception as e:
            raise ToolExecutionError(f"Tool '{tool_name}' raised an exception: {e}") from e

In [13]:
tools = Tools([add,multiply])

In [14]:
tools.has_tool("add"), tools.has_tool("toto")

(True, False)

In [15]:
tools.get_schemas()

[{'type': 'function',
  'function': {'name': 'add',
   'description': 'Add two numbers\n\nReturns:\n- type: integer',
   'parameters': {'type': 'object',
    'properties': {'a': {'type': 'integer', 'description': 'The first number'},
     'b': {'type': 'integer', 'description': 'The second number'}},
    'required': ['a', 'b']}}},
 {'type': 'function',
  'function': {'name': 'multiply',
   'description': 'Multiply two numbers\n\nReturns:\n- type: integer',
   'parameters': {'type': 'object',
    'properties': {'a': {'type': 'integer', 'description': 'The first number'},
     'b': {'type': 'integer', 'description': 'The second number'}},
    'required': ['a', 'b']}}}]

In [16]:
tools.get_schema("add")

{'type': 'function',
 'function': {'name': 'add',
  'description': 'Add two numbers\n\nReturns:\n- type: integer',
  'parameters': {'type': 'object',
   'properties': {'a': {'type': 'integer', 'description': 'The first number'},
    'b': {'type': 'integer', 'description': 'The second number'}},
   'required': ['a', 'b']}}}

In [17]:
tools.get_functions()

[<function __main__.add(a: int, b: int) -> int>,
 <function __main__.multiply(a: int, b: int) -> int>]

In [18]:
tools.get_function("add")

<function __main__.add(a: int, b: int) -> int>

In [19]:
tools.call("add", {"a": 1, "b": 2})

3

## Model client

In [20]:
#| export
class ModelClient(ABC):
    """Abstract base for chat model clients: stores the connection settings and defines the call signature."""

    def __init__(
        self,
        model: str,
        base_url: Optional[str] = None,
        api_key: Optional[str] = None,
        context_size: Optional[int] = None,
    ):
        # Only store the settings; creating the actual API client is left to subclasses
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.context_size = context_size

    @abstractmethod
    def __call__(
        self,
        messages: Sequence[Mapping[str, Any]],
        chat_turns: ChatTurns,
        tools: Optional[Tools] = None,
        think: Union[bool, Literal["low", "medium", "high"], None] = None,
        max_new_tokens: Optional[int] = None,
        seed: Optional[int] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        min_p: Optional[float] = None,
    ) -> bool:
        """
        Execute one model call for the conversation held in `messages`.

        The result is a boolean, not the model response. In the ollama
        implementation of this file, the response is appended to `messages`
        and streamed into a new turn of `chat_turns`; the return value is True
        when tool calls were processed (the caller should call the client
        again) and False when there were none. How the generation parameters
        (`think`, `max_new_tokens`, `seed`, `temperature`, `top_k`, `top_p`,
        `min_p`) are honoured depends on the concrete subclass.
        """
        raise NotImplementedError

### ollama model client

In [27]:
#| export
_whitespace_pattern = re.compile(r"\s+")

def _messages_words(messages):
    return sum([len(_whitespace_pattern.findall(message["content"])) for message in messages if message["role"] in {"user", "assitant"}])

class OllamaModelClient(ModelClient):
    """Model client for an ollama server, with streaming display and native tool calling."""

    def __init__(
        self,
        model: str,
        context_size: int = 32768, # This is the default value for the ollama server in wordslab-notebooks
        base_url: str = "http://localhost:11434",
        api_key: Optional[str] = None,  # If not provided, the optional key will be pulled from WordslabEnv
    ):
        """Create the ollama API client and send a one-token request to load the model.

        The warm-up request uses `num_ctx=context_size`, so the model is loaded
        with the requested context length before the first real call.
        """
        super().__init__(model, base_url, api_key, context_size)

        # Initialize API client
        if not api_key:
            env = WordslabEnv()
            api_key = env.cloud_ollama_api_key
        headers = {'Authorization': 'Bearer ' + api_key} if api_key else {}
        self.client = Client(host=self.base_url, headers=headers)

        # Load model in memory with the right context length.
        # flush=True so the message is visible before the blocking request below.
        print(f"ollama: loading model {self.model} with context size {self.context_size} ... ", end="", flush=True)
        self.client.chat(model=self.model, messages=[{'role': 'user', 'content': 'say yes'}], options=Options(num_ctx=self.context_size, num_predict=1))
        print("ok")

    def __call__(
        self,
        messages: Sequence[Mapping[str, Any]],
        chat_turns: ChatTurns,
        tools: Optional[Tools] = None,
        think: Union[bool, Literal["low", "medium", "high"], None] = None,
        max_new_tokens: Optional[int] = None,
        seed: Optional[int] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        min_p: Optional[float] = None,
    ) -> bool:
        """Run one streamed model call and execute the tool calls it requests.

        `messages` is modified in place (it must support `append`): the
        assistant message is appended, then one `tool` message per tool call.
        The streamed thinking, content and tool calls are recorded in a new
        turn of `chat_turns`.

        Returns:
            True if tool calls were processed, so the caller should call the
            client again with the same `messages`; False if there were none.

        Raises:
            TypeError: if `tools` is provided but is not a `Tools` instance.
            ToolExecutionError: propagated from `tools.call` when a tool fails.
        """
        # Check tools parameter type
        if tools and not isinstance(tools, Tools):
            raise TypeError("Argument tools must be of type wordslab_notebooks_lib.chat.Tools. Create a tools object with the syntax: Tools([func1, func2, func3]), where the parameters are documented python functions.")

        # Immediate user feedback
        print(f"ollama: processing {_messages_words(messages)} words with `{self.model}` ...")

        # Observable conversation turn
        chat_turn = chat_turns.new_turn()

        stream = self.client.chat(
            model = self.model,
            messages = messages,
            tools = tools.get_schemas() if tools else None,
            stream = True,
            think = think,
            options = Options(num_ctx = self.context_size, num_predict = max_new_tokens, seed = seed,
                              temperature = temperature, top_k=top_k, top_p=top_p, min_p=min_p)
        )

        # Streaming: accumulate the partial fields
        tool_calls = []
        for chunk in stream:
            message = chunk.message
            if message.thinking:
                chat_turn.append_thinking(message.thinking)
            if message.content:
                chat_turn.append_content(message.content)
            if message.tool_calls:
                tool_calls.extend(message.tool_calls)
                for tc in message.tool_calls:
                    chat_turn.append_tool_call(tc.function.name, tc.function.arguments)

        # Append accumulated fields to the messages
        if chat_turn.thinking or chat_turn.content or tool_calls:
            messages.append({'role': 'assistant', 'thinking': chat_turn.thinking, 'content': chat_turn.content, 'tool_calls': tool_calls})

        # End the loop if there are no more tool calls
        if not tool_calls:
            return False

        # Execute tool calls
        for tc in tool_calls:
            tool_name = tc.function.name
            # `tools` may be None if the model emits a tool call although no tool was offered
            if tools and tools.has_tool(tool_name):
                chat_turn.start_tool_call(tool_name)
                result = tools.call(tool_name, tc.function.arguments)
                chat_turn.end_tool_call(tool_name, result)
            else:
                result = 'Unknown tool'

            # Append tool call result to the messages
            messages.append({'role': 'tool', 'tool_name': tool_name, 'content': str(result)})

        # Continue the loop after tool calls
        return True

In [29]:
# `env` is not defined by any earlier cell, so create it here to keep the notebook runnable top to bottom
env = WordslabEnv()
model = env.default_model_code
model

'qwen3:30b'

In [30]:
oclient = OllamaModelClient(model, context_size=65000)

ollama: loading model qwen3:30b with context size 65000 ... ok


In [31]:
# Single call without tools: the client appends the assistant message to `messages`
# and streams the turn into `turns`. No tools are passed, so the returned flag
# is expected to be False.
messages = [{'role': 'user', 'content': 'In one sentence: why is the sky blue?'}]
turns = ChatTurns()
tool_calls_to_process = oclient(messages, turns, think=True, max_new_tokens=1000, seed=42, temperature=2)

> [Thinking] ... thought in 205 words

Sunlight scatters in Earth's atmosphere, with shorter blue wavelengths scattering more effectively than other colors, causing the sky to appear blue.



In [32]:
messages, tool_calls_to_process

([{'role': 'user', 'content': 'In one sentence: why is the sky blue?'},
  {'role': 'assistant',
   'thinking': 'Okay, the user is asking why the sky is blue in one sentence. Hmm, they probably want a quick, straightforward explanation without any extra fluff. Maybe they\'re in a hurry or just need a simple fact for a conversation.  \n\nI remember the main science behind it: Rayleigh scattering. Sunlight is white light, made of all colors, but blue light scatters more because its wavelengths are shorter. That scattered blue light reaches our eyes from all directions, making the sky look blue.  \n\nGotta make sure it\'s concise though—just one sentence. Let me draft: "Sunlight scatters in Earth\'s atmosphere, with shorter blue wavelengths scattering more than other colors, making the sky appear blue." That covers it without technical jargon.  \n\nWait, is the user a kid? A curious adult? Either way, keep it simple. No need for "molecules" or "wavelengths" unless necessary. The key is "bl

In [33]:
# First call with tools: the model requests `add` and `multiply`, the client executes them
# and appends their results to `messages`. The returned flag is True, so the client
# must be called again to get the final answer.
messages = [{'role': 'user', 'content': "Using only the provided tools to make no mistake, what is (11545468+78782431)*418742?"}]
turns = ChatTurns()
tools = Tools([add, multiply])
tool_calls_to_process = oclient(messages, turns, tools=tools, think=True)

> [Thinking] ... thought in 2910 words

> [Tool call] ... `add` returned `90327899`

> [Tool call] ... `multiply` returned `37824085083058`



In [34]:
tool_calls_to_process

True

In [35]:
tool_calls_to_process = oclient(messages, turns, tools=tools, think=True)

> [Thinking] ... thought in 2910 words

> [Tool call] ... `add` returned `90327899`

> [Tool call] ... `multiply` returned `37824085083058`

> [Thinking] ... thought in 155 words

The result of (11545468 + 78782431) * 418742 is **37824085083058**.



In [36]:
tool_calls_to_process

False

## openrouter chat client

In [38]:
env = WordslabEnv()

In [40]:
client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=env.cloud_openrouter_api_key)

In [50]:
# Stream a reasoning-enabled completion and print each raw delta to inspect its structure
model = "google/gemini-3-flash-preview"
messages = [{'role': 'user', 'content': 'What is the smallest number palindrome greater than 130?'}]
stream = client.chat.completions.create(model=model, messages=messages, stream=True, extra_body={"reasoning": {"enabled": True}})
for chunk in stream:
    # The final streamed chunk can carry only usage data, with an empty `choices` list
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta
    print(delta)

ChoiceDelta(content='', function_call=None, refusal=None, role='assistant', tool_calls=None, reasoning="**Identifying a Solution**\n\nI've homed in on the core challenge: pinpointing the smallest palindrome exceeding 130. The constraints are clear, and I'm strategizing how to efficiently generate and validate candidate numbers. I am starting by looking at the numbers directly after 130.\n\n\n", reasoning_details=[{'index': 0, 'type': 'reasoning.text', 'text': "**Identifying a Solution**\n\nI've homed in on the core challenge: pinpointing the smallest palindrome exceeding 130. The constraints are clear, and I'm strategizing how to efficiently generate and validate candidate numbers. I am starting by looking at the numbers directly after 130.\n\n\n", 'format': 'google-gemini-v1'}], annotations=[])
ChoiceDelta(content='', function_call=None, refusal=None, role='assistant', tool_calls=None, reasoning="**Determining the Answer**\n\nI've directly confirmed that 131 fulfills all criteria. No 

In [52]:
# Same streaming inspection, this time offering the tool schemas to the model
messages = [{'role': 'user', 'content': "Using only the provided tools to make no mistake, what is (11545468+78782431)*418742?"}]
tools = Tools([add, multiply])
stream = client.chat.completions.create(model=model, messages=messages, tools = tools.get_schemas(), stream=True, extra_body={"reasoning": {"enabled": True}})
for chunk in stream:
    # The final streamed chunk can carry only usage data, with an empty `choices` list
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta
    print(delta)

ChoiceDelta(content='', function_call=None, refusal=None, role='assistant', tool_calls=None, reasoning="**Calculating the Total Sum**\n\nI'm currently focused on the first step: summing the initial numbers. I've successfully employed the `add` function, and the intermediate result is now readily available. It's a significant figure, and I'm ready to proceed to the next stage after a brief review.\n\n\n", reasoning_details=[{'index': 0, 'type': 'reasoning.text', 'text': "**Calculating the Total Sum**\n\nI'm currently focused on the first step: summing the initial numbers. I've successfully employed the `add` function, and the intermediate result is now readily available. It's a significant figure, and I'm ready to proceed to the next stage after a brief review.\n\n\n", 'format': 'google-gemini-v1'}], annotations=[])
ChoiceDelta(content='', function_call=None, refusal=None, role='assistant', tool_calls=None, reasoning="**Initiating Multiplication Operations**\n\nI've got the total sum fr

**Openrouter API reference**

Note: as of January 2026 - the OpenAI-compatible Responses API (Beta) is in beta stage and may have breaking changes. Use with caution in production environments.

=> we will use the Chat Completions API for now

https://openrouter.ai/docs/api/reference/overview

REQUEST SCHEMA

```typescript
// Definitions of subtypes are below
type Request = {
  // Either "messages" or "prompt" is required
  messages?: Message[];
  prompt?: string;
  // If "model" is unspecified, uses the user's default
  model?: string; // See "Supported Models" section
  // Allows to force the model to produce specific output format.
  // See models page and note on this docs page for which models support it.
  response_format?: { type: 'json_object' };
  stop?: string | string[];
  stream?: boolean; // Enable streaming
  // See LLM Parameters (openrouter.ai/docs/api/reference/parameters)
  max_tokens?: number; // Range: [1, context_length)
  temperature?: number; // Range: [0, 2]
  // Tool calling
  // Will be passed down as-is for providers implementing OpenAI's interface.
  // For providers with custom interfaces, we transform and map the properties.
  // Otherwise, we transform the tools into a YAML template. The model responds with an assistant message.
  // See models supporting tool calling: openrouter.ai/models?supported_parameters=tools
  tools?: Tool[];
  tool_choice?: ToolChoice;
  // Advanced optional parameters
  seed?: number; // Integer only
  top_p?: number; // Range: (0, 1]
  top_k?: number; // Range: [1, Infinity) Not available for OpenAI models
  frequency_penalty?: number; // Range: [-2, 2]
  presence_penalty?: number; // Range: [-2, 2]
  repetition_penalty?: number; // Range: (0, 2]
  logit_bias?: { [key: number]: number };
  top_logprobs: number; // Integer only
  min_p?: number; // Range: [0, 1]
  top_a?: number; // Range: [0, 1]
  // Reduce latency by providing the model with a predicted output
  // https://platform.openai.com/docs/guides/latency-optimization#use-predicted-outputs
  prediction?: { type: 'content'; content: string };
  // OpenRouter-only parameters
  // See "Prompt Transforms" section: openrouter.ai/docs/guides/features/message-transforms
  transforms?: string[];
  // See "Model Routing" section: openrouter.ai/docs/guides/features/model-routing
  models?: string[];
  route?: 'fallback';
  // See "Provider Routing" section: openrouter.ai/docs/guides/routing/provider-selection
  provider?: ProviderPreferences;
  user?: string; // A stable identifier for your end-users. Used to help detect and prevent abuse.
  
  // Debug options (streaming only)
  debug?: {
    echo_upstream_body?: boolean; // If true, returns the transformed request body sent to the provider
  };
};

// Subtypes:
type TextContent = {
  type: 'text';
  text: string;
};
type ImageContentPart = {
  type: 'image_url';
  image_url: {
    url: string; // URL or base64 encoded image data
    detail?: string; // Optional, defaults to "auto"
  };
};
type ContentPart = TextContent | ImageContentPart;
type Message =
  | {
      role: 'user' | 'assistant' | 'system';
      // ContentParts are only for the "user" role:
      content: string | ContentPart[];
      // If "name" is included, it will be prepended like this
      // for non-OpenAI models: `{name}: {content}`
      name?: string;
    }
  | {
      role: 'tool';
      content: string;
      tool_call_id: string;
      name?: string;
    };
type FunctionDescription = {
  description?: string;
  name: string;
  parameters: object; // JSON Schema object
};
type Tool = {
  type: 'function';
  function: FunctionDescription;
};
type ToolChoice =
  | 'none'
  | 'auto'
  | {
      type: 'function';
      function: {
        name: string;
      };
``` 

RESPONSE SCHEMA

```typescript
// Definitions of subtypes are below
type Response = {
  id: string;
  // Depending on whether you set "stream" to "true" and
  // whether you passed in "messages" or a "prompt", you
  // will get a different output shape
  choices: (NonStreamingChoice | StreamingChoice | NonChatChoice)[];
  created: number; // Unix timestamp
  model: string;
  object: 'chat.completion' | 'chat.completion.chunk';
  system_fingerprint?: string; // Only present if the provider supports it
  // Usage data is always returned for non-streaming.
  // When streaming, you will get one usage object at
  // the end accompanied by an empty choices array.
  usage?: ResponseUsage;
};
// If the provider returns usage, we pass it down
// as-is. Otherwise, we count using the GPT-4 tokenizer.
type ResponseUsage = {
  /** Including images and tools if any */
  prompt_tokens: number;
  /** The tokens generated */
  completion_tokens: number;
  /** Sum of the above two fields */
  total_tokens: number;
};


// Subtypes:
type NonChatChoice = {
  finish_reason: string | null;
  text: string;
  error?: ErrorResponse;
};
type NonStreamingChoice = {
  finish_reason: string | null;
  native_finish_reason: string | null;
  message: {
    content: string | null;
    role: string;
    tool_calls?: ToolCall[];
  };
  error?: ErrorResponse;
};
type StreamingChoice = {
  finish_reason: string | null;
  native_finish_reason: string | null;
  delta: {
    content: string | null;
    role?: string;
    tool_calls?: ToolCall[];
  };
  error?: ErrorResponse;
};
type ErrorResponse = {
  code: number; // See "Error Handling" section
  message: string;
  metadata?: Record<string, unknown>; // Contains additional error information such as provider details, the raw error message, etc.
};
type ToolCall = {
  id: string;
  type: 'function';
  function: FunctionCall;
};
```

In [174]:
#| export
class OpenRouterModelClient(ModelClient):
    """Model client that streams chat completions from OpenRouter through the `openai` SDK.

    Calling the instance runs ONE model turn: the streamed answer is pushed into a new
    observable chat turn, the assistant message is appended to `messages`, and any tool
    calls requested by the model are executed and their results appended to `messages`.
    """

    # Values accepted by the OpenRouter `reasoning.effort` request field (OpenAI-style)
    _REASONING_EFFORTS = ("xhigh", "high", "medium", "low", "minimal", "none")

    def __init__(
        self,
        model: str,
        context_size: Optional[int] = None, # For OpenRouter this parameter is ignored, we inherit the remote model config
        base_url: str = "https://openrouter.ai/api/v1",
        api_key: Optional[str] = None, # If not provided, the mandatory key will be pulled from WordslabEnv
    ):
        """Create the API client and send a tiny test request to validate model name and key.

        Any exception raised by the test request propagates to the caller.
        """
        super().__init__(model, base_url, api_key, context_size)

        # Initialize API client
        if not api_key:
            env = WordslabEnv()
            api_key = env.cloud_openrouter_api_key
        self.client = OpenAI(base_url=base_url, api_key=api_key)

        # Check connection
        print(f"openrouter: testing model {self.model} ... ", end="")
        self.client.chat.completions.create(model=self.model, messages=[{'role': 'user', 'content': 'say yes'}], max_tokens=16)
        print("ok")

    @staticmethod
    def _reasoning_extra_body(think):
        """Translate the `think` argument into the OpenRouter `reasoning` request field.

        Returns a dict to pass as `extra_body` to the openai SDK, or None when no
        reasoning configuration must be sent (think is None or an unsupported value).
        """
        # bool must be tested before int: isinstance(False, int) is True in Python,
        # so think=False would otherwise be sent as a token budget
        if isinstance(think, bool):
            return {"reasoning": {"enabled": think}}  # reasoning on/off
        if isinstance(think, str) and think in OpenRouterModelClient._REASONING_EFFORTS:
            return {"reasoning": {"effort": think}}  # OpenAI-style effort level
        if isinstance(think, int):
            return {"reasoning": {"max_tokens": think}}  # Anthropic-style token budget
        return None

    def __call__(
        self,
        messages: Sequence[Mapping[str, Any]],
        chat_turns: ChatTurns,
        tools: Tools = None,
        think: Union[bool, Literal["xhigh", "high", "medium", "low", "minimal", "none"], int, None] = None,
        max_new_tokens: Optional[int] = None,
        seed: Optional[int] = None,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,  # Ignored, not supported by the openai chat completions API
        top_p: Optional[float] = None,
        min_p: Optional[float] = None,  # Ignored, not supported by the openai chat completions API
    ) -> bool:
        """Run one streamed model turn and execute the tool calls it requests.

        Parameters:
            messages: conversation history in OpenAI chat format. It is MUTATED in place
                (`.append`), so it must be a list-like object.
            chat_turns: container that provides the observable turn receiving the streamed chunks.
            tools: optional `Tools` object describing the callable python functions.
            think: True/False to switch reasoning on/off, an effort level string,
                or an int token budget for reasoning. None sends no reasoning config.
            max_new_tokens, seed, temperature, top_p: forwarded to the completion request.
            top_k, min_p: accepted for interface compatibility, ignored.

        Returns:
            True when tool calls were processed (the caller should call again so the
            model can use the results), False when the model produced no tool call.

        Raises:
            TypeError: if `tools` is provided but is not a `Tools` instance.
        """
        # Check tools parameter type
        if tools and not isinstance(tools, Tools):
            raise TypeError("Argument tools must be of type wordslab_notebooks_lib.chat.Tools. Create a tools object with the syntax: Tools([func1, func2, func3]), where the parameters are documented python functions.")

        # Immediate user feedback
        print(f"openrouter: processing {_messages_words(messages)} words with `{self.model}` ...")

        # Observable conversation turn
        chat_turn = chat_turns.new_turn()

        stream = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            tools=tools.get_schemas() if tools else None,
            stream=True,
            extra_body=self._reasoning_extra_body(think),
            max_tokens=max_new_tokens,
            seed=seed,
            temperature=temperature,
            top_p=top_p,
        )

        # Streaming: accumulate the partial fields
        tool_calls = {}
        for chunk in stream:
            # The final usage chunk of a stream comes with an empty choices array
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta
            if delta is None:
                continue

            reasoning_text = getattr(delta, "reasoning", None)
            if reasoning_text:
                chat_turn.append_thinking(reasoning_text)

            content_text = getattr(delta, "content", None)
            if content_text:
                chat_turn.append_content(content_text)

            for tool_call in getattr(delta, "tool_calls", None) or []:
                # Fragments of the same call share the same index
                entry = tool_calls.setdefault(
                    tool_call.index, {"id": None, "name": None, "arguments": ""}
                )
                # id and name are normally only present in the first fragment
                if tool_call.id and not entry["id"]:
                    entry["id"] = tool_call.id
                function = tool_call.function
                if function is not None:
                    if function.name and not entry["name"]:
                        entry["name"] = function.name
                    # Append streamed argument fragments
                    entry["arguments"] += function.arguments or ""

        # We need to wait the end of the stream to make sure the tool calls are complete
        for tc in tool_calls.values():
            chat_turn.append_tool_call(tc["name"], tc["arguments"])

        # Append accumulated fields to the messages
        if chat_turn.content or tool_calls:
            assistant_message = {"role": "assistant", "content": chat_turn.content}
            # Only send the key when there is something in it: an empty
            # tool_calls array in the history can be rejected by the API
            if tool_calls:
                assistant_message["tool_calls"] = [
                    {
                        "id": tc["id"],
                        "type": "function",
                        "function": {
                            "name": tc["name"],
                            "arguments": tc["arguments"],
                        },
                    }
                    for tc in tool_calls.values()
                ]
            messages.append(assistant_message)

        # end the loop if there is no more tool calls
        if not tool_calls:
            return False

        # execute tool calls
        for tc in tool_calls.values():
            if tools and tools.has_tool(tc["name"]):
                chat_turn.start_tool_call(tc["name"])
                try:
                    # A tool without parameters may be called with an empty arguments string
                    arguments = json.loads(tc["arguments"]) if tc["arguments"].strip() else {}
                except json.JSONDecodeError as err:
                    # Report the problem to the model instead of leaving the
                    # conversation with an unanswered tool call
                    result = f"Invalid tool arguments (not valid JSON): {err}"
                else:
                    result = tools.call(tc["name"], arguments)
                chat_turn.end_tool_call(tc["name"], result)
            else:
                result = 'Unknown tool'

            # append tool call result to the messages
            messages.append({"role": "tool", "tool_call_id": tc["id"], "content": str(result)})

        # continue the loop after tool calls
        return True

In [175]:
model = "anthropic/claude-sonnet-4.5"
orclient = OpenRouterModelClient(model)

openrouter: testing model anthropic/claude-sonnet-4.5 ... ok


In [176]:
messages = [{'role': 'user', 'content': 'What is the smallest number palindrome greater than 130?'}]
turns = ChatTurns()
tool_calls_to_process = orclient(messages, turns)

I need to find the smallest palindrome greater than 130.

Let me check numbers starting from 131:

- 131: Is this a palindrome? 1-3-1 → Yes, it reads the same forwards and backwards.

Therefore, the smallest palindrome greater than 130 is **131**.



In [179]:
messages = [{'role': 'user', 'content': 'What is the smallest number palindrome greater than 130?'}]
turns = ChatTurns()
tool_calls_to_process = orclient(messages, turns, think=1024, max_new_tokens=2000, seed=42, temperature=0.7)

> [Thinking] ... thought in 53 words

The smallest palindrome greater than 130 is **131**.

A palindrome reads the same forwards and backwards, and 131 satisfies this condition.



In [180]:
messages = [{'role': 'user', 'content': "Using only the provided tools to make no mistake, what is (11545468+78782431)*418742?"}]
turns = ChatTurns()
tools = Tools([add, multiply])
tool_calls_to_process = orclient(messages, turns, tools=tools, think=2014)

> [Thinking] ... thought in 40 words

I'll solve this step by step using the provided tools.

First, let me add 11545468 + 78782431:

> [Tool call] ... `add` returned `90327899`



In [181]:
tool_calls_to_process

True

In [182]:
messages

[{'role': 'user',
  'content': 'Using only the provided tools to make no mistake, what is (11545468+78782431)*418742?'},
 {'role': 'assistant',
  'content': "I'll solve this step by step using the provided tools.\n\nFirst, let me add 11545468 + 78782431:",
  'tool_calls': [{'id': 'toolu_vrtx_01Apanb8oW6scZnYaCXhAKQ1',
    'type': 'function',
    'function': {'name': 'add',
     'arguments': '{"a": 11545468, "b": 78782431}'}}]},
 {'role': 'tool',
  'tool_call_id': 'toolu_vrtx_01Apanb8oW6scZnYaCXhAKQ1',
  'content': '90327899'}]

In [183]:
tool_calls_to_process = orclient(messages, turns, tools=tools, think=1024)

> [Thinking] ... thought in 40 words

I'll solve this step by step using the provided tools.

First, let me add 11545468 + 78782431:

> [Tool call] ... `add` returned `90327899`

Now let me multiply that result by 418742:

> [Tool call] ... `multiply` returned `37824085083058`



In [184]:
tool_calls_to_process

True

In [185]:
tool_calls_to_process = orclient(messages, turns, tools=tools, think=True)

> [Thinking] ... thought in 40 words

I'll solve this step by step using the provided tools.

First, let me add 11545468 + 78782431:

> [Tool call] ... `add` returned `90327899`

Now let me multiply that result by 418742:

> [Tool call] ... `multiply` returned `37824085083058`

The answer is **(11545468 + 78782431) × 418742 = 37,824,085,083,058**



## Models providers

### Design concepts

#### User centric workflow

1. identify your self-hosted inference or inference as a service options
2. understand your task type, properties, privacy needs and scale
3. find the best model for your task, given your constraints
4. prepare and start your self hosted inference or connect to your inference as a service provider
5. monitor your resource usage and cost

#### Self-hosted inference or inference as a service

Model families
- architecture name
- parameter size
- training type: base / instruct / thinking
- version: release date
- quantization

Model constraints
- model capabilities
  - modalities in/out
  - context length
  - instruction
  - thinking
  - tools
- model usage
  - prompt template and special tokens
  - languages supported
  - recommended use cases
  - prompting guidelines 
- model license
  - use case restrictions
  - commercial usage restrictions
  - outputs usage restrictions 
- model transparency

Self-hosted inference constraints
- model requirements
  - size on disk -> download time / load time in vram
  - size in vram -> max context length / num parallel sequence
  - tensor flops -> input tokens/sec
  - memory bandwidth -> output tokens/sec
- inference machine constraints
  - download speed
  - disk size and speed
  - GPU vram, memory bandwidth, tensor flops
- rented machine constraints
  - GPU availability
  - price when you use per GPU
  - price when you don't use per GB (storage)

Inference as a service constraints
- router constraints
  - ... same as provider constraints below ... 
- provider constraints
  - terms of service
  - privacy options
  - inference quotas
  - service availability
- per model provider constraints
  - model capabilities exposed 
  - input/output tokens cost
  - input/output tokens/sec

### List, download and load models

#### Explore ollama API

Get ollama version

In [None]:
Request
curl http://localhost:11434/api/version
Response
{
  "version": "0.5.1"
}

List remote models

As of December 2025, there is no API to get the ollama catalog of models; web scraping is the only solution.

In [None]:
import httpx
import re
from html import unescape

def updated_to_months(updated):
    """
    Turn a relative age label into a whole number of months.

    Recognized labels look like "1 year ago", "2 years ago", "1 month ago",
    "3 weeks ago", "7 days ago", "yesterday" or "4 hours ago".
    Anything younger than a month yields 0; an empty or unrecognized
    label yields None.
    """
    if not updated:
        return None

    label = updated.lower().strip()
    if label == "yesterday":
        return 0

    # (pattern anchored at the start of the label, digits -> months)
    conversions = (
        (r'(\d+)\s+year', lambda digits: int(digits) * 12),
        (r'(\d+)\s+month', lambda digits: int(digits)),
        (r'(\d+)\s+week', lambda digits: max(0, int(digits) // 4)),
        (r'(\d+)\s+day', lambda digits: 0),
    )
    for pattern, to_months in conversions:
        found = re.match(pattern, label)
        if found:
            return to_months(found.group(1))

    # hours / minutes / seconds are all less than one month old
    for unit in ("hour", "minute", "second"):
        if unit in label:
            return 0

    return None

def pulls_to_int(pulls_str):
    """
    Convert a pulls string like:
        '5M', '655.8K', '49K', '73.7M', '957.4K', '27.7M', '1.2B'
    into an integer.

    Commas are treated as thousands separators. The suffixes K, M and B
    (case-insensitive) multiply by a thousand, a million and a billion.

    Returns None when the input is empty or does not start with a valid number.
    """
    if not pulls_str:
        return None

    match = re.match(r'([\d,.]+)\s*([KMB]?)', pulls_str.strip().upper())
    if not match:
        return None

    digits, suffix = match.groups()
    try:
        value = float(digits.replace(',', ''))
    except ValueError:
        # e.g. "." or "1.2.3": accepted by the regex but not a number
        return None

    multiplier = {'': 1, 'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}[suffix]

    # round() rather than int(): a binary floating point product can land
    # a hair below the exact integer, which int() would truncate by one
    return round(value * multiplier)

def parse_model_list_regex(html):
    """
    Extract one dict per model from the HTML of the ollama library page.

    Each `<li ... x-test-model ...>` element becomes a dict with the keys
    name, description, capabilities, cloud, sizes, pulls, tag_count,
    updated_months and url. A field that cannot be found or parsed is
    None (or an empty string / empty list for description, capabilities
    and sizes) instead of raising.
    """

    def all_span_texts(marker, block):
        # Inner text of every <span> carrying the given marker attribute
        found = re.findall(
            rf'<span[^>]*{marker}[^>]*>(.*?)</span>',
            block,
            flags=re.DOTALL
        )
        return [text.strip() for text in found]

    def first_span_text(marker, block):
        # Inner text of the first <span> carrying the marker, None if absent
        m = re.search(rf'<span[^>]*{marker}[^>]*>(.*?)</span>', block)
        return m.group(1).strip() if m else None

    models = []

    # --- Extract each <li x-test-model>...</li> block ---
    li_blocks = re.findall(
        r'<li[^>]*x-test-model[^>]*>(.*?)</li>',
        html,
        flags=re.DOTALL
    )

    for block in li_blocks:

        # name from <a href="/library/...">
        name = None
        m = re.search(r'href="/library/([^"]+)"', block)
        if m:
            name = m.group(1)

        # description from the <p> whose attributes contain "text-neutral-800"
        description = ""
        m = re.search(
            r'<p[^>]*text-neutral-800[^>]*>(.*?)</p>',
            block,
            flags=re.DOTALL
        )
        if m:
            # drop nested tags, then decode HTML entities
            description = re.sub(r'<.*?>', '', m.group(1)).strip()
            description = unescape(description)

        # check for the special 'cloud' span
        cloud = re.search(
            r'<span[^>]*>cloud</span>',
            block,
            flags=re.DOTALL
        ) is not None

        # tag count <span x-test-tag-count>5</span>
        # A missing or non-numeric span yields None, like the other optional fields
        try:
            tag_count = int(first_span_text('x-test-tag-count', block))
        except (TypeError, ValueError):
            tag_count = None

        models.append({
            "name": name,
            "description": description,
            "capabilities": all_span_texts('x-test-capability', block),
            "cloud": cloud,
            "sizes": all_span_texts('x-test-size', block),
            "pulls": pulls_to_int(first_span_text('x-test-pull-count', block)),
            "tag_count": tag_count,
            "updated_months": updated_to_months(first_span_text('x-test-updated', block)),
            "url": f"https://ollama.com/library/{name}" if name else None
        })

    return models

def list_models(contains=None):
    """
    Extract model names and properties from https://ollama.com/library

    Parameters:
        contains: optional case-insensitive substring; when given, only the
            models whose name contains it are returned, sorted by name.
            Without a filter the models are returned in page order.

    Returns:
        A list of model dicts as produced by parse_model_list_regex.
    """

    html = httpx.get("https://ollama.com/library").text
    models = parse_model_list_regex(html)

    if contains:
        needle = contains.lower()
        # Entries whose name could not be parsed (None) can never match the filter
        models = sorted(
            (m for m in models if m["name"] and needle in m["name"].lower()),
            key=lambda m: m["name"]
        )

    return models

def list_recent_models_from_family(familyfilter):
    """
    Build one summary line per model matching `familyfilter` that was updated
    less than 12 months ago.

    Each line is the model name followed by its capabilities list, its sizes
    list (each left blank when empty) and a ' [cloud]' marker when applicable.
    """
    summaries = []
    for entry in list_models(familyfilter):
        age = entry["updated_months"]
        # skip models of unknown age and models a year old or more
        if age is None or not age < 12:
            continue

        capabilities = entry["capabilities"] if len(entry["capabilities"]) > 0 else ''
        sizes = entry["sizes"] if len(entry["sizes"]) > 0 else ''
        cloud_marker = ' [cloud]' if entry["cloud"] else ''

        summaries.append(f"{entry['name']} {capabilities} {sizes}{cloud_marker}")
    return summaries

def list_tags(model):
    """
    Extract valid quantized tags only, without HTML noise,
    and apply the same exclusions as original greps.

    Parameters:
        model: model name as used in https://ollama.com/library/<model>

    Returns:
        A set of "model:tag" strings. Only tags containing a "q" are captured,
        and tags matching text|base|fp|q4_[01]|q5_[01] are excluded.
    """
    html = httpx.get(f"https://ollama.com/library/{model}/tags").text

    # Capture ONLY the tag part after model:..., e.g. 3b-instruct-q4_K_M
    raw_tags = re.findall(
        rf'{re.escape(model)}:([A-Za-z0-9._-]*q[A-Za-z0-9._-]*)',
        html
    )

    # Exclude text|base|fp|q4_[01]|q5_[01]
    # The test runs on the tag part only: testing the full "model:tag" string
    # would reject every tag of a model whose NAME contains one of these
    # words (e.g. "nomic-embed-text")
    excluded = re.compile(r'(text|base|fp|q[45]_[01])')

    # Re-add full prefix model:<tag>; the set deduplicates
    return {f"{model}:{tag}" for tag in raw_tags if not excluded.search(tag)}

In [None]:
list_models()[:5]

[{'name': 'gpt-oss',
  'description': 'OpenAI’s open-weight models designed for powerful reasoning, agentic tasks, and versatile developer use cases.',
  'capabilities': ['tools', 'thinking'],
  'cloud': True,
  'sizes': ['20b', '120b'],
  'pulls': 5000000,
  'tag_count': 5,
  'updated_months': 1,
  'url': 'https://ollama.com/library/gpt-oss'},
 {'name': 'qwen3-vl',
  'description': 'The most powerful vision-language model in the Qwen model family to date.',
  'capabilities': ['vision', 'tools'],
  'cloud': True,
  'sizes': ['2b', '4b', '8b', '30b', '32b', '235b'],
  'pulls': 656300,
  'tag_count': 59,
  'updated_months': 1,
  'url': 'https://ollama.com/library/qwen3-vl'},
 {'name': 'ministral-3',
  'description': 'The Ministral 3 family is designed for edge deployment, capable of running on a wide range of hardware.',
  'capabilities': ['vision', 'tools'],
  'cloud': True,
  'sizes': ['3b', '8b', '14b'],
  'pulls': 49100,
  'tag_count': 16,
  'updated_months': 0,
  'url': 'https://oll

In [None]:
list_recent_models_from_family("qwen")

["qwen2.5-coder ['tools'] ['0.5b', '1.5b', '3b', '7b', '14b', '32b']",
 "qwen2.5vl ['vision'] ['3b', '7b', '32b', '72b']",
 "qwen3 ['tools', 'thinking'] ['0.6b', '1.7b', '4b', '8b', '14b', '30b', '32b', '235b']",
 "qwen3-coder ['tools'] ['30b', '480b'] [cloud]",
 "qwen3-embedding ['embedding'] ['0.6b', '4b', '8b']",
 "qwen3-vl ['vision', 'tools'] ['2b', '4b', '8b', '30b', '32b', '235b'] [cloud]"]

In [None]:
list_recent_models_from_family("gemma")

["embeddinggemma ['embedding'] ['300m']",
 "gemma3 ['vision'] ['270m', '1b', '4b', '12b', '27b'] [cloud]",
 "gemma3n  ['e2b', 'e4b']"]

In [None]:
list_recent_models_from_family("stral")

["devstral ['tools'] ['24b']",
 "magistral ['tools', 'thinking'] ['24b']",
 "ministral-3 ['vision', 'tools'] ['3b', '8b', '14b'] [cloud]",
 "mistral ['tools'] ['7b']",
 'mistral-large-3   [cloud]',
 "mistral-nemo ['tools'] ['12b']",
 "mistral-small ['tools'] ['22b', '24b']",
 "mistral-small3.1 ['vision', 'tools'] ['24b']",
 "mistral-small3.2 ['vision', 'tools'] ['24b']"]

In [None]:
list_recent_models_from_family("gpt")

["gpt-oss ['tools', 'thinking'] ['20b', '120b'] [cloud]",
 "gpt-oss-safeguard ['tools', 'thinking'] ['20b', '120b']"]

In [None]:
list_recent_models_from_family("deepseek")

["deepseek-ocr ['vision'] ['3b']",
 "deepseek-r1 ['tools', 'thinking'] ['1.5b', '7b', '8b', '14b', '32b', '70b', '671b']",
 "deepseek-v3  ['671b']",
 "deepseek-v3.1 ['tools', 'thinking'] ['671b'] [cloud]"]

In [None]:
list_recent_models_from_family("glm")

['glm-4.6   [cloud]']

In [None]:
list_recent_models_from_family("granite")

["granite-embedding ['embedding'] ['30m', '278m']",
 "granite3.1-dense ['tools'] ['2b', '8b']",
 "granite3.1-moe ['tools'] ['1b', '3b']",
 "granite3.2 ['tools'] ['2b', '8b']",
 "granite3.2-vision ['vision', 'tools'] ['2b']",
 "granite3.3 ['tools'] ['2b', '8b']",
 "granite4 ['tools'] ['350m', '1b', '3b']"]

In [None]:
list_recent_models_from_family("llama")

["llama3.2-vision ['vision'] ['11b', '90b']",
 "llama4 ['vision', 'tools'] ['16x17b', '128x17b']"]

In [None]:
list_recent_models_from_family("phi")

["dolphin-mixtral  ['8x7b', '8x22b']",
 "dolphin3  ['8b']",
 "phi4  ['14b']",
 "phi4-mini ['tools'] ['3.8b']",
 "phi4-mini-reasoning  ['3.8b']",
 "phi4-reasoning  ['14b']"]

In [None]:
list_recent_models_from_family("hermes")

["hermes3 ['tools'] ['3b', '8b', '70b', '405b']",
 "nous-hermes2-mixtral  ['8x7b']"]

In [None]:
list_recent_models_from_family("olmo")

["olmo2  ['7b', '13b']"]

In [None]:
list_recent_models_from_family("embed")

["embeddinggemma ['embedding'] ['300m']",
 "granite-embedding ['embedding'] ['30m', '278m']",
 "qwen3-embedding ['embedding'] ['0.6b', '4b', '8b']"]

In [None]:
list_tags("ministral-3")

{'ministral-3:14b-instruct-2512-q4_K_M',
 'ministral-3:14b-instruct-2512-q8_0',
 'ministral-3:3b-instruct-2512-q4_K_M',
 'ministral-3:3b-instruct-2512-q8_0',
 'ministral-3:8b-instruct-2512-q4_K_M',
 'ministral-3:8b-instruct-2512-q8_0'}

In [None]:
list_tags("mistral-small3.2")

{'mistral-small3.2:24b-instruct-2506-q4_K_M',
 'mistral-small3.2:24b-instruct-2506-q8_0'}

In [None]:
list_tags("qwen3-vl")

{'qwen3-vl:235b-a22b-instruct-q4_K_M',
 'qwen3-vl:235b-a22b-instruct-q8_0',
 'qwen3-vl:235b-a22b-thinking-q4_K_M',
 'qwen3-vl:235b-a22b-thinking-q8_0',
 'qwen3-vl:2b-instruct-q4_K_M',
 'qwen3-vl:2b-instruct-q8_0',
 'qwen3-vl:2b-thinking-q4_K_M',
 'qwen3-vl:2b-thinking-q8_0',
 'qwen3-vl:30b-a3b-instruct-q4_K_M',
 'qwen3-vl:30b-a3b-instruct-q8_0',
 'qwen3-vl:30b-a3b-thinking-q4_K_M',
 'qwen3-vl:30b-a3b-thinking-q8_0',
 'qwen3-vl:32b-instruct-q4_K_M',
 'qwen3-vl:32b-instruct-q8_0',
 'qwen3-vl:32b-thinking-q4_K_M',
 'qwen3-vl:32b-thinking-q8_0',
 'qwen3-vl:4b-instruct-q4_K_M',
 'qwen3-vl:4b-instruct-q8_0',
 'qwen3-vl:4b-thinking-q4_K_M',
 'qwen3-vl:4b-thinking-q8_0',
 'qwen3-vl:8b-instruct-q4_K_M',
 'qwen3-vl:8b-instruct-q8_0',
 'qwen3-vl:8b-thinking-q4_K_M',
 'qwen3-vl:8b-thinking-q8_0'}

https://github.com/ollama/ollama/blob/main/docs/api.md#list-local-models

ollama.list().models -> list(ollama._types.ListResponse.Model)

```yaml
ollama._types.ListResponse.Model
- model: str 'qwen3:4b'
- modified_at: datetime.datetime datetime(2025, 11, 22, 18, 53, 11)
- digest: str '359d7dd4bcdab3d86b87d73ac27966f4dbb9f5efdfcc75d34a8764a09474fae7'
- size: pydantic.types.ByteSize 2497293931
- details: ollama._types.ModelDetails
  - parent_model: str ''
  - format: str 'gguf'
  - family: str 'qwen3'
  - families: Sequence[str] ['qwen3']
  - parameter_size: str '4.0B'
  - quantization_level: str 'Q4_K_M'
```

In [None]:
ollama.list().models[0]

Model(model='qwen3:4b', modified_at=datetime.datetime(2025, 11, 22, 18, 53, 11, 586211, tzinfo=TzInfo(3600)), digest='359d7dd4bcdab3d86b87d73ac27966f4dbb9f5efdfcc75d34a8764a09474fae7', size=2497293931, details=ModelDetails(parent_model='', format='gguf', family='qwen3', families=['qwen3'], parameter_size='4.0B', quantization_level='Q4_K_M'))

https://github.com/ollama/ollama/blob/main/docs/api.md#show-model-information

```yaml
ollama._types.ShowResponse
- modified_at: datetime.datetime datetime.datetime(2025, 11, 22, 18, 53, 11)
- template: str '{{- $lastUserIdx := -1 -}}...\n{{- end }}'
- modelfile: str '...'
- license: str '...'
- details: ollama._types.ModelDetails -> see above
- model_info: Mapping[str, Any]
  -'general.architecture': 'qwen3'
  -'general.basename': 'Qwen3' 
  -'general.file_type': 15
  -'general.finetune': 'Thinking' 
  -'general.license': 'apache-2.0'
  -'general.license.link': 'https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507/blob/main/LICENSE'
  -'general.parameter_count': 4022468096
  -'general.quantization_version': 2, 
  -'general.size_label': '4B'
  -'general.tags': None
  -'general.type': 'model'
  -'general.version': '2507'
  -'qwen3.attention.head_count': 32
  -'qwen3.attention.head_count_kv': 8
  -'qwen3.attention.key_length': 128
  -'qwen3.attention.layer_norm_rms_epsilon': 1e-06
  -'qwen3.attention.value_length': 128
  -'qwen3.block_count': 36
  -'qwen3.context_length': 262144
  -'qwen3.embedding_length': 2560
  -'qwen3.feed_forward_length': 9728
  -'qwen3.rope.freq_base': 5000000
  -'tokenizer.ggml.add_bos_token': False
  -'tokenizer.ggml.bos_token_id': 151643
  -'tokenizer.ggml.eos_token_id': 151645
  -'tokenizer.ggml.merges': None
  -'tokenizer.ggml.model': 'gpt2'
  -'tokenizer.ggml.padding_token_id': 151643
  -'tokenizer.ggml.pre': 'qwen2'
  -'tokenizer.ggml.token_type': None
  -'tokenizer.ggml.tokens': None
- parameters: str 'top_p 0.95\n repeat_penalty 1\n stop "<|im_start|>"\n stop "<|im_end|>"\n temperature 0.6\n top_k 20'
- capabilities: List[str] ['completion', 'tools', 'thinking']
```

In [None]:
ollama.show('gemma3:4b').capabilities, ollama.show('gemma3:4b').modelinfo

(['completion', 'vision'],
 {'gemma3.attention.head_count': 8,
  'gemma3.attention.head_count_kv': 4,
  'gemma3.attention.key_length': 256,
  'gemma3.attention.sliding_window': 1024,
  'gemma3.attention.value_length': 256,
  'gemma3.block_count': 34,
  'gemma3.context_length': 131072,
  'gemma3.embedding_length': 2560,
  'gemma3.feed_forward_length': 10240,
  'gemma3.mm.tokens_per_image': 256,
  'gemma3.vision.attention.head_count': 16,
  'gemma3.vision.attention.layer_norm_epsilon': 1e-06,
  'gemma3.vision.block_count': 27,
  'gemma3.vision.embedding_length': 1152,
  'gemma3.vision.feed_forward_length': 4304,
  'gemma3.vision.image_size': 896,
  'gemma3.vision.num_channels': 3,
  'gemma3.vision.patch_size': 14,
  'general.architecture': 'gemma3',
  'general.file_type': 15,
  'general.parameter_count': 4299915632,
  'general.quantization_version': 2,
  'tokenizer.ggml.add_bos_token': True,
  'tokenizer.ggml.add_eos_token': False,
  'tokenizer.ggml.add_padding_token': False,
  'tokenize

In [None]:
ollama.pull??

[31mSignature:[39m ollama.pull(model: str, *, insecure: bool = [38;5;28;01mFalse[39;00m, stream: bool = [38;5;28;01mFalse[39;00m) -> Union[ollama._types.ProgressResponse, collections.abc.Iterator[ollama._types.ProgressResponse]]
[31mSource:[39m   
  [38;5;28;01mdef[39;00m pull(
    self,
    model: str,
    *,
    insecure: bool = [38;5;28;01mFalse[39;00m,
    stream: bool = [38;5;28;01mFalse[39;00m,
  ) -> Union[ProgressResponse, Iterator[ProgressResponse]]:
    [33m"""[39m
[33m    Raises `ResponseError` if the request could not be fulfilled.[39m

[33m    Returns `ProgressResponse` if `stream` is `False`, otherwise returns a `ProgressResponse` generator.[39m
[33m    """[39m
    [38;5;28;01mreturn[39;00m self._request(
      ProgressResponse,
      [33m'POST'[39m,
      [33m'/api/pull'[39m,
      json=PullRequest(
        model=model,
        insecure=insecure,
        stream=stream,
      ).model_dump(exclude_none=[38;5;28;01mTrue[39;00m),
      stream=st

In [None]:
ollama.delete??

[31mSignature:[39m ollama.delete(model: str) -> ollama._types.StatusResponse
[31mDocstring:[39m <no docstring>
[31mSource:[39m   
  [38;5;28;01mdef[39;00m delete(self, model: str) -> StatusResponse:
    r = self._request_raw(
      [33m'DELETE'[39m,
      [33m'/api/delete'[39m,
      json=DeleteRequest(
        model=model,
      ).model_dump(exclude_none=[38;5;28;01mTrue[39;00m),
    )
    [38;5;28;01mreturn[39;00m StatusResponse(
      status=[33m'success'[39m [38;5;28;01mif[39;00m r.status_code == [32m200[39m [38;5;28;01melse[39;00m [33m'error'[39m,
    )
[31mFile:[39m      /home/workspace/wordslab-notebooks-lib/.venv/lib/python3.12/site-packages/ollama/_client.py
[31mType:[39m      method

**Streaming responses**

Certain endpoints stream responses as JSON objects. Streaming can be disabled by providing {"stream": false} for these endpoints.

**Structured outputs**

Structured outputs are supported by providing a JSON schema in the format parameter. The model will generate a response that matches the schema. See the structured outputs example below.

**JSON mode**

Enable JSON mode by setting the format parameter to json. This will structure the response as a valid JSON object. 

https://github.com/ollama/ollama/blob/main/docs/api.md#generate-a-completion

Parameters
- model: (required) the model name
- prompt: the prompt to generate a response for
- suffix: the text after the model response
- images: (optional) a list of base64-encoded images (for multimodal models such as llava)
- think: (for thinking models) should the model think before responding?

Advanced parameters (optional):
- format: the format to return a response in. Format can be json or a JSON schema
- options: additional model parameters listed in the documentation for the Modelfile such as temperature
- system: system message to use (overrides what is defined in the Modelfile)
- template: the prompt template to use (overrides what is defined in the Modelfile)
- stream: if false the response will be returned as a single response object, rather than a stream of objects
- raw: if true no formatting will be applied to the prompt. You may choose to use the raw parameter if you are specifying a full templated prompt in your request to the API
- keep_alive: controls how long the model will stay loaded into memory following the request (default: 5m)

Response

A stream of JSON objects is returned:

{
  "model": "llama3.2",
  "created_at": "2023-08-04T08:52:19.385406455-07:00",
  "response": "The",
  "done": false
}

The final response in the stream also includes additional data about the generation:
- total_duration: time spent generating the response
- load_duration: time spent in nanoseconds loading the model
- prompt_eval_count: number of tokens in the prompt
- prompt_eval_duration: time spent in nanoseconds evaluating the prompt
- eval_count: number of tokens in the response
- eval_duration: time in nanoseconds spent generating the response
- response: empty if the response was streamed, if not streamed, this will contain the full response

A response can be received in one reply when streaming is off.

To calculate how fast the response is generated in tokens per second (token/s), divide eval_count / eval_duration * 10^9.

**Images**

To submit images to multimodal models, provide a list of base64-encoded images:

- "images": ["iVBORw0KGgoAAAANSUhEUgAAAG0AAABmCAYAAADBPx+VAAAACXBI..."]


In [None]:
ollama.generate(model='gemma3', prompt='Why is the sky blue?')

In [None]:
ollama.chat(model='gemma3', messages=[{'role': 'user', 'content': 'Why is the sky blue?'}])

In [None]:
ollama.embed(model='gemma3', input='The sky is blue because of rayleigh scattering')

In [None]:
ollama.embed(model='gemma3', input=['The sky is blue because of rayleigh scattering', 'Grass is green because of chlorophyll'])

In [None]:
ollama.ps()

ProcessResponse(models=[])

In [None]:
ollama.web_search??

[31mSignature:[39m ollama.web_search(query: str, max_results: int = [32m3[39m) -> ollama._types.WebSearchResponse
[31mSource:[39m   
  [38;5;28;01mdef[39;00m web_search(self, query: str, max_results: int = [32m3[39m) -> WebSearchResponse:
    [33m"""[39m
[33m    Performs a web search[39m

[33m    Args:[39m
[33m      query: The query to search for[39m
[33m      max_results: The maximum number of results to return (default: 3)[39m

[33m    Returns:[39m
[33m      WebSearchResponse with the search results[39m
[33m    Raises:[39m
[33m      ValueError: If OLLAMA_API_KEY environment variable is not set[39m
[33m    """[39m
    [38;5;28;01mif[39;00m [38;5;28;01mnot[39;00m self._client.headers.get([33m'authorization'[39m, [33m''[39m).startswith([33m'Bearer '[39m):
      [38;5;28;01mraise[39;00m ValueError([33m'Authorization header with Bearer token is required for web search'[39m)

    [38;5;28;01mreturn[39;00m self._request(
      WebSearchResponse,

In [None]:
ollama.web_fetch??

[31mSignature:[39m ollama.web_fetch(url: str) -> ollama._types.WebFetchResponse
[31mSource:[39m   
  [38;5;28;01mdef[39;00m web_fetch(self, url: str) -> WebFetchResponse:
    [33m"""[39m
[33m    Fetches the content of a web page for the provided URL.[39m

[33m    Args:[39m
[33m      url: The URL to fetch[39m

[33m    Returns:[39m
[33m      WebFetchResponse with the fetched result[39m
[33m    """[39m
    [38;5;28;01mif[39;00m [38;5;28;01mnot[39;00m self._client.headers.get([33m'authorization'[39m, [33m''[39m).startswith([33m'Bearer '[39m):
      [38;5;28;01mraise[39;00m ValueError([33m'Authorization header with Bearer token is required for web fetch'[39m)

    [38;5;28;01mreturn[39;00m self._request(
      WebFetchResponse,
      [33m'POST'[39m,
      [33m'https://ollama.com/api/web_fetch'[39m,
      json=WebFetchRequest(
        url=url,
      ).model_dump(exclude_none=[38;5;28;01mTrue[39;00m),
    )
[31mFile:[39m      /home/workspace/wordsla