# Orchestrating Agent 

Accessing models via:
- Ollama servers
- Google BigQuery AI / Vertex AI

Resources
- [BigQuery ML Methods with Gemini](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.llm)

**GGUF Instruct Models**

GGUF models are quantized (compressed) variants of the original models, intended for inference in workspaces with limited compute. The default server is Ollama; to pull and run a model from the CLI, prefix its model ID with `ollama run`, e.g. `ollama run <model_id>`.

```yaml
- model_name: base
  model_id:
    dev: hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q3_K_L
    prod: hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S
  url: https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF
- model_name: sql_coder
  model_id:
    dev: hf.co/TheBloke/sqlcoder-GGUF:Q3_K_S
    prod: hf.co/TheBloke/sqlcoder-GGUF:Q4_K_S
  url: https://huggingface.co/TheBloke/sqlcoder-GGUF
- model_name: python_coder
  model_id:
    dev: hf.co/TheBloke/CodeLlama-7B-Python-GGUF:Q3_K_M
    prod: hf.co/TheBloke/CodeLlama-7B-Python-GGUF:Q4_K_M
  url: https://huggingface.co/TheBloke/CodeLlama-7B-Python-GGUF
```
Models in testing:
- gpt-oss-20b (OpenAI's open-weight GPT model, GGUF build): https://huggingface.co/unsloth/gpt-oss-20b-GGUF

In [45]:
import os
import sys
from pathlib import Path

def setup_dev_workspace(root_folder_name: str = 'gaby'):
    """Make the project root directory the current working directory.

    Call this from files / notebooks that may be started from a
    sub-directory of the workspace. If the current directory is already the
    root, nothing changes; otherwise the closest ancestor directory whose
    name equals ``root_folder_name`` becomes the working directory.

    Args:
        root_folder_name: Exact name of the directory to treat as the root.

    Raises:
        FileNotFoundError: If neither the current directory nor any of its
            parents is named ``root_folder_name``.
    """
    # Compare the full directory name, not the stem: ``Path('gaby.bak').stem``
    # is 'gaby', so a stem comparison would wrongly accept look-alike folders
    # and would disagree with the ``.name`` comparison used in the search below.
    if Path.cwd().name == root_folder_name:
        print(f'Path already set to default root directory: {Path.cwd()}')
        return

    print('Initialized workspace currently at directory:', Path.cwd())

    current = Path().resolve()
    for candidate in (current, *current.parents):
        if candidate.name == root_folder_name:
            os.chdir(candidate)  # change working directory
            print(f"📂 Working directory set to: {candidate}")
            return

    raise FileNotFoundError(f"Root folder '{root_folder_name}' not found.")

setup_dev_workspace()

Path already set to default root directory: /Users/mimiphan/mimeus-app/backend/gaby


In [46]:
import os
from dotenv import load_dotenv

# Load environment variables from the local env file.
# NOTE(review): this is an absolute, machine-specific path — confirm whether it
# should be resolved relative to the project root instead.
load_dotenv('/Users/mimiphan/mimeus-app/backend/gaby/.env.local')

# Base URL of the remote Ollama server; falls back to an empty string when the
# variable is not set.
LIGHTNING_OLLAMA_HOST_URL = os.getenv('LIGHTNING_OLLAMA_HOST_URL', '')

# Model IDs used by this notebook.
# NOTE(review): the model table at the top of the notebook lists Q4_K_S as the
# sql_coder prod quantization and Q3_K_M as the python_coder *dev* quantization;
# confirm that the tags below are the intended ones.
MODEL_STACK = [
    "hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S", # BASE INSTRUCT HELPER TOOL
    "hf.co/TheBloke/sqlcoder-GGUF:Q4_K_M", # SQL CODER
    "hf.co/TheBloke/CodeLlama-7B-Python-GGUF:Q3_K_M"  # PYTHON CODER
]


'https://11434-01jxw838wanez4bbetq84ep1qv.cloudspaces.litng.ai/'

## Ollama Client 

In [None]:
# pip install ollama
import ollama
import json, time, hashlib, threading
from collections import OrderedDict
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

# What a decorated function may return: a prompt string, a dict carrying
# 'prompt' or 'messages', or a list of chat messages.
PromptLike = Union[str, Dict[str, Any], List[Dict[str, str]]]


class OllamaCall:
    """
    Decorator class to wrap a function that returns a prompt/messages payload.
    - Your function should return either:
        1) str -> treated as `prompt` (for /api/generate)
        2) list[{"role","content"}] -> treated as chat messages (for /api/chat)
        3) dict -> passed through; must include 'prompt' (generate) or 'messages' (chat)

    Calling the decorated function sends the payload to Ollama and returns the
    response text (str). Identical requests are answered from the cache.

    Features:
    - mode='chat' or 'generate'
    - streaming optional (stores concatenated text)
    - LRU cache with JSONL persistence
    - thread-safe cache access (guarded by a lock)
    """

    def __init__(
        self,
        model: str,
        mode: str = "chat",            # "chat" | "generate"
        stream: bool = False,
        max_history: int = 1000,
        save_path: Optional[Union[str, Path]] = None,
        key_fn: Optional[Callable[[Dict[str, Any]], str]] = None,
        extra_params: Optional[Dict[str, Any]] = None,  # e.g., {"temperature": 0.2}
    ):
        """Configure the decorator and warm-load any existing JSONL log.

        Args:
            model: Ollama model ID passed to every call.
            mode: "chat" (uses ``ollama.chat``) or "generate" (``ollama.generate``).
            stream: If True, request a streamed response and join the chunks.
            max_history: Maximum number of records kept in the in-memory LRU.
            save_path: Optional JSONL file; each new record is appended to it
                and existing records are loaded at construction time.
            key_fn: Optional function mapping the request dict to a cache key;
                defaults to a SHA-256 of model, mode, request and extra params.
            extra_params: Extra keyword arguments forwarded to the Ollama call.
        """
        assert mode in {"chat", "generate"}, "mode must be 'chat' or 'generate'"
        self.model = model
        self.mode = mode
        self.stream = stream
        self.max_history = max_history
        self.save_path = Path(save_path) if save_path else None
        self.key_fn = key_fn
        self.extra_params = extra_params or {}

        self._lock = threading.Lock()
        self._lru: "OrderedDict[str, Dict[str, Any]]" = OrderedDict()

        # Warm-load existing JSONL (optional)
        self._warm_load()

    def _trim(self) -> None:
        """Evict least-recently-used records until at most max_history remain."""
        # The emptiness guard keeps a negative max_history from popping an empty dict.
        while self._lru and len(self._lru) > self.max_history:
            self._lru.popitem(last=False)

    def _warm_load(self) -> None:
        """Best-effort load of previously persisted records into the LRU.

        Unusable lines (blank, invalid JSON, non-dict, missing 'key' or
        'response') are skipped one by one, so a single corrupt line no longer
        discards every record after it. The size limit is enforced even when
        reading stops early.
        """
        if not (self.save_path and self.save_path.exists()):
            return
        try:
            with self.save_path.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        rec = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # partial/corrupt line: skip it, keep reading
                    if not isinstance(rec, dict) or "response" not in rec:
                        continue
                    k = rec.get("key")
                    if not (isinstance(k, str) and k):
                        continue
                    # Re-insert so the latest record for a key is also the most recent.
                    self._lru.pop(k, None)
                    self._lru[k] = rec
        except (OSError, UnicodeDecodeError):
            # don't blow up on unreadable logs; keep whatever was loaded so far
            pass
        # keep only most recent max_history
        self._trim()

    def _norm_payload(self, payload: PromptLike) -> Dict[str, Any]:
        """Normalize user return into {'messages': ...} or {'prompt': ...}."""
        if isinstance(payload, str):
            return {"prompt": payload}
        if isinstance(payload, list):
            # assume chat messages schema
            return {"messages": payload}
        if isinstance(payload, dict):
            if "prompt" in payload or "messages" in payload:
                return payload
        raise TypeError("Return a str, a messages list, or a dict with 'prompt' or 'messages'.")

    def _default_key(self, request: Dict[str, Any]) -> str:
        """Return a stable SHA-256 key from model + mode + request + extra_params."""
        blob = json.dumps(
            {"model": self.model, "mode": self.mode, "req": request, "extra": self.extra_params},
            sort_keys=True,
            ensure_ascii=False,
        )
        return hashlib.sha256(blob.encode("utf-8")).hexdigest()

    def _record(self, key: str, request: Dict[str, Any], response_text: str) -> Dict[str, Any]:
        """Build the cache/log record for one completed call."""
        return {
            "key": key,
            "ts": time.time(),
            "model": self.model,
            "mode": self.mode,
            "request": request,
            "response": response_text,
            "params": self.extra_params,
        }

    def _persist(self, rec: Dict[str, Any]):
        """Append one record to the JSONL log (no-op when save_path is unset)."""
        if not self.save_path:
            return
        self.save_path.parent.mkdir(parents=True, exist_ok=True)
        with self.save_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

    def history(self) -> List[Dict[str, Any]]:
        """Return recent conversation records (most-recent last)."""
        with self._lock:
            return list(self._lru.values())

    def get(self, key: str) -> Optional[Dict[str, Any]]:
        """Return the cached record for ``key`` or None; does not change recency."""
        with self._lock:
            return self._lru.get(key)

    def _build_request(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Turn a normalized payload into the request dict for the configured mode."""
        if self.mode == "chat":
            if "messages" in payload:
                messages = payload["messages"]
            else:
                # if only 'prompt' given, wrap as one user message
                messages = [{"role": "user", "content": payload["prompt"]}]
            return {"model": self.model, "messages": messages, **self.extra_params}

        if "prompt" in payload:
            prompt = payload["prompt"]
        else:
            # if only 'messages' given, squeeze into a single prompt
            prompt = "\n".join(m.get("content", "") for m in payload["messages"])
        return {"model": self.model, "prompt": prompt, **self.extra_params}

    def _invoke(self, request: Dict[str, Any]) -> str:
        """Call Ollama for ``request`` and return the (concatenated) response text."""
        if self.mode == "chat":
            if self.stream:
                chunks = ollama.chat(
                    model=self.model, messages=request["messages"], stream=True, **self.extra_params
                )
                parts = (chunk.get("message", {}).get("content", "") for chunk in chunks)
                return "".join(part for part in parts if part)
            resp = ollama.chat(model=self.model, messages=request["messages"], **self.extra_params)
            return resp["message"]["content"]

        if self.stream:
            chunks = ollama.generate(
                model=self.model, prompt=request["prompt"], stream=True, **self.extra_params
            )
            parts = (chunk.get("response", "") for chunk in chunks)
            return "".join(part for part in parts if part)
        resp = ollama.generate(model=self.model, prompt=request["prompt"], **self.extra_params)
        return resp["response"]

    def __call__(self, fn: Callable[..., PromptLike]):
        """Decorate ``fn`` so that calling it returns the model's response text."""
        import functools  # local import: the surrounding cell does not import functools

        @functools.wraps(fn)  # keep fn's name/docstring on the wrapper
        def wrapper(*args, **kwargs) -> str:
            request = self._build_request(self._norm_payload(fn(*args, **kwargs)))

            # Cache key
            key = self.key_fn(request) if self.key_fn else self._default_key(request)

            # LRU check
            with self._lock:
                if key in self._lru:
                    self._lru.move_to_end(key)   # mark as recently used
                    return self._lru[key]["response"]

            # Call Ollama outside the lock so slow requests don't block cache readers.
            out = self._invoke(request)
            rec = self._record(key, request, out)

            # Update LRU + persist
            with self._lock:
                self._lru[key] = rec
                self._trim()
            self._persist(rec)

            return out

        return wrapper

In [47]:
import ollama

# Client bound to the Ollama host URL read from the environment earlier in the notebook.
client = ollama.Client(LIGHTNING_OLLAMA_HOST_URL)

In [52]:
# Show the models the connected Ollama server currently has available.
client.list().models

[Model(model='hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S', modified_at=datetime.datetime(2025, 9, 27, 7, 43, 31, 225350, tzinfo=TzInfo(UTC)), digest='2ea8ba65d59e6cfee05595e37890bc6a68d91aac807d9d12474b74cddb48128c', size=2269514322, details=ModelDetails(parent_model='', format='gguf', family='llama', families=['llama'], parameter_size='3.21B', quantization_level='unknown'))]

### Detailed Config of the model

In [64]:
from ollama import ShowResponse

In [65]:
# Fetch the server's metadata for the Q5_K_S base model; `m` is inspected in the cells below.
m: ShowResponse = client.show(model="hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S")

In [69]:
# Ask for the Q3_K_L variant. In the recorded run it is not on the server, so
# show() raises ResponseError (404) and `m` keeps the value assigned previously.
try:
    m: ShowResponse = client.show(model="hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q3_K_L")
except ollama.ResponseError as e:
    print('Ollama Client Error', e)

Ollama Client Error model 'hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q3_K_L' not found (status code: 404)


In [72]:
# Raw Modelfile text reported by the server for the model held in `m`.
m.modelfile

'# Modelfile generated by "ollama show"\n# To build a new Modelfile based on this, replace FROM with:\n# FROM hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S\n\nFROM /root/.ollama/models/blobs/sha256-fc58b1880ce451d10f9aa11dd1b566a50cd41b1f72981c5f8a35313e397358f7\nTEMPLATE """{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>\n{{- if .System }}\n\n{{ .System }}\n{{- end }}\n{{- if .Tools }}\n\nCutting Knowledge Date: December 2023\n\nWhen you receive a tool call response, use the output to format an answer to the orginal user question.\n\nYou are a helpful assistant with tool calling capabilities.\n{{- end }}<|eot_id|>\n{{- end }}\n{{- range $i, $_ := .Messages }}\n{{- $last := eq (len (slice $.Messages $i)) 1 }}\n{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>\n{{- if and $.Tools $last }}\n\nGiven the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.\n\nRespond in th

In [75]:
# Print the model's prompt template with real line breaks for readability.
print(m.template)

{{- if or .System .Tools }}<|start_header_id|>system<|end_header_id|>
{{- if .System }}

{{ .System }}
{{- end }}
{{- if .Tools }}

Cutting Knowledge Date: December 2023

When you receive a tool call response, use the output to format an answer to the orginal user question.

You are a helpful assistant with tool calling capabilities.
{{- end }}<|eot_id|>
{{- end }}
{{- range $i, $_ := .Messages }}
{{- $last := eq (len (slice $.Messages $i)) 1 }}
{{- if eq .Role "user" }}<|start_header_id|>user<|end_header_id|>
{{- if and $.Tools $last }}

Given the following functions, please respond with a JSON for a function call with its proper arguments that best answers the given prompt.

Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. Do not use variables.

{{ range $.Tools }}
{{- . }}
{{ end }}
Question: {{ .Content }}<|eot_id|>
{{- else }}

{{ .Content }}<|eot_id|>
{{- end }}{{ if $last }}<|start_header_id|>assistant<|end_header_id|>

{{ e

In [71]:
# List every attribute exposed by the ShowResponse object.
dir(m)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_computed_fields__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializ

## Missing Dataset Case

In [None]:
from ollama import Options

# Default Ollama runtime/sampling options for this notebook; passed as the
# `options` entry of CHAT_CONFIG further down.
DEFAULT_OPTIONS = Options(
    num_ctx=1024,         # shorter context → less overhead
    temperature=0.3,      # still stable, but a touch livelier
    top_p=0.9,            # good nucleus sampling
    top_k=40,             # typical safe default
    repeat_penalty=1.05,  # lighter repetition check → less compute
    num_predict=128,      # cap on tokens (speeds up response)
    num_thread=6,         # match physical CPU cores (adjust to your machine)
    num_gpu=1,            # offload to GPU if you have one
    low_vram=False,       # only True if you’re memory-starved
    f16_kv=True,          # faster key/value cache
    use_mmap=True,        # mmap the model for faster loading
    use_mlock=False,      # set True if you want to lock into RAM
    seed=None             # nondeterministic, so cache doesn’t collide
)

### Prompts

- OrchestrateBook = high-level workflow (like a book outline: Chapters).
- OrchestrateChapters = sub-stages within a given Story step (like the Scenes in a Chapter).

In [None]:
from ollama import ChatResponse
from dataclasses import dataclass, field
from src.gaby_agent.core.agent._core import Instructor, GabyBasement

from src.gaby_agent.core.config import LocalConfig

In [172]:
# Project-local configuration object; its `lightning_ollama` attribute is used
# as the Ollama host when the client is re-created in the next cell.
config = LocalConfig()
# Model ID of the reasoning agent pulled onto the server below.
# NOTE(review): the "in testing" note at the top of the notebook links the
# unsloth build of gpt-oss-20b, while this uses the ggml-org one — confirm.
REASONING_AGENT_ID = "hf.co/ggml-org/gpt-oss-20b-GGUF"

In [146]:
# Re-create the client, this time taking the host from the project config.
client = ollama.Client(config.lightning_ollama)

In [173]:
# Pull the reasoning model onto the server (the recorded run reports status='success').
client.pull(REASONING_AGENT_ID)

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [175]:
# List the server's models again to confirm the pulled model is now present.
client.list().models

[Model(model='hf.co/ggml-org/gpt-oss-20b-GGUF:latest', modified_at=datetime.datetime(2025, 9, 27, 17, 9, 25, 749660, tzinfo=TzInfo(UTC)), digest='eb9eabc5cfb912fb3e99d1c3ea9b648fac7d808369e9f70b484ce9054ee2aed9', size=12109567696, details=ModelDetails(parent_model='', format='gguf', family='gpt-oss', families=['gpt-oss'], parameter_size='20.9B', quantization_level='unknown')),
 Model(model='hf.co/bartowski/Llama-3.2-3B-Instruct-GGUF:Q5_K_S', modified_at=datetime.datetime(2025, 9, 27, 16, 6, 2, 145118, tzinfo=TzInfo(UTC)), digest='2ea8ba65d59e6cfee05595e37890bc6a68d91aac807d9d12474b74cddb48128c', size=2269514322, details=ModelDetails(parent_model='', format='gguf', family='llama', families=['llama'], parameter_size='3.21B', quantization_level='unknown'))]

In [182]:
# Narrator
import inspect

# Interval passed to Summarizer as `narrate_interval` in this cell.
K_INTERVAL = 5

class Summarizer(
    GabyBasement,
    prompt=Instructor(
        prompt="You are a project summarizing assistant that summarizes a data team's progression in data cleaning workflow. Given the user's input project workflow description, history actions and evaluation, your task is to summarize the events in no more than 5 sentences.",
        input_template="""
        Data Cleaning Project Workflow Steps
        {project_meta}

        Previous Actions
        {action_history}

        Current Stage
        {most_recent_stage}
        """
    )
):
    """Narrator agent that periodically summarizes the workflow's progress.

    The system prompt and the input template are supplied to the base class as
    the ``prompt`` class keyword. ``step`` fills the template placeholders
    (``project_meta``, ``action_history``, ``most_recent_stage``) and hands
    them to ``self.run`` (presumably provided by GabyBasement — confirm).
    """

    def __init__(self, project_stages: str, narrate_interval: int, **kwargs):
        """Store the stage description and the summarizing interval.

        Args:
            project_stages: Text describing the sub-stages of one data
                processing workflow (e.g. data cleaning - missing data handler).
            narrate_interval: Interval ``k`` checked by ``step``.
            **kwargs: Forwarded unchanged to the base class initializer.
        """
        super().__init__(**kwargs)
        self.k = narrate_interval
        self.history = []          # results of previous run() calls, oldest first
        self.action_history = []   # entries recorded through add_action()
        self.project_meta = project_stages # should be the substages of one of the data processing workflow e.g. data cleaning - missing dataset handler etc.

    def post_process(self, response: ChatResponse):
        """Return the response timestamp followed by the stripped message text.

        A response whose message has no content (missing or None) yields just
        the timestamp line rather than raising AttributeError on ``.strip()``.
        """
        text = response.message.get('content', None) or ''
        return f"{response.created_at}\n" + text.strip()

    def step(self):
        """Run one summarization when the history length is a multiple of ``k``.

        NOTE(review): ``self.history`` only grows inside this method, so after
        the first summary its length is 1 and, for k > 1, the condition stays
        false unless other code appends to ``history``. The template's
        ``action_history`` is also filled from ``self.history`` rather than
        ``self.action_history``. Confirm the intended counter and source.
        """
        if len(self.history) % self.k != 0:
            print(f"Not summarizing actions, current counter: {len(self.history)}.")
            return

        print("Summarizing previous actions now.")

        # Empty history yields empty strings for both history placeholders.
        kwargs = {
            "action_history": "".join(self.history),
            "most_recent_stage": self.history[-1] if self.history else "",
            "project_meta": self.project_meta,
        }

        response = self.run(**kwargs)
        self.history.append(response)

    def add_action(self, cls):
        """
        Class decorator that records a decorated class as an action.

        Appends "Class <name>: <docstring>" to ``action_history`` and returns
        the class unchanged.

        Example:
            @summary.add_action
            class MyAgent: ...
        """
        doc = inspect.getdoc(cls) or "No description available."
        entry = f"Class {cls.__name__}: {doc}\n"
        self.action_history.append(entry)
        return cls

# Instantiate the narrator for the missing-data stage.
# NOTE(review): in the recorded run this call raised
# "TypeError: object.__new__() takes exactly one argument"; presumably a base
# class forwards constructor arguments to object.__new__ — confirm in GabyBasement.
writer = Summarizer(project_stages="Missing Data Values", narrate_interval=K_INTERVAL)

TypeError: object.__new__() takes exactly one argument (the type to instantiate)

In [None]:
# OVERALL PROMPTS NEWLY ADDED
# Identifiers of the top-level data tasks.
DATA_TASK_IDS = (
    "data_cleaning",
    "data_analytics",
    "data_business_insights",
    "data_model_cycle"
)

# DATA CLEANING PARENT & CHILD STAGES
# Each entry is a (stage_id, description) pair.
# NOTE(review): "anomality_detection" / "anomality" look like misspellings of
# "anomaly"; the id may be referenced elsewhere, so confirm before renaming.
_DATA_CLEANING_META = (
    ("data_brief", "Summarize, document, identify and understand the dataset and data columns."),
    ("missing_data", "Identify and process the missing Data Values existing per data field"),
    ("anomality_detection", "Identify and process anomality"),
    ("data_transform", "Transforming data columns, for example, encoding binary labels.")
)
# Child stages of the "missing_data" stage, again as (stage_id, description) pairs.
_MISSING_DATA_META = (
   ("define_missing_type", "Distinguish type of missing data values: MCAR, MNAR or MAR."),
   ("missing_report", "Summarize Missing Reports for each Data Column")
)

# ====================================================
# Missing Data Stage Config
# ====================================================
# One entry per hypothesis test stage. Each entry holds:
#   "prompt"         - template with {dataset_preview}, {target_col} and {tools} placeholders
#   "input_template" - user-message template ({target_col}, plus {features}/{group_col}
#                      in the first stage)
#   "tools"          - names of the tools the model may answer with
# The placeholders are presumably filled with str.format by the caller — confirm.
MISSING_DATA_CONFIG = {
    # Stage 1: is the missingness completely at random, or tied to observed variables?
    "distinguish_mcar_mar": {
        "prompt": (
            "Stage: Distinguish MCAR vs MAR\n\n"
            "Hypotheses:\n"
            "- H0: Missingness is completely at random (MCAR).\n"
            "- H1: Missingness depends on observed variables (MAR).\n\n"
            "Dataset Preview:\n{dataset_preview}\n\n"
            "Instruction:\n"
            "Test whether missingness in '{target_col}' is independent of observed variables (MCAR) "
            "or dependent on observed variables (MAR).\n\n"
            "Respond only with the most appropriate tool name from the following list:\n"
            "{tools}"
        ),
        "input_template": (
            "Please analyze missingness for '{target_col}' against '{features}' or '{group_col}' "
            "using chi-square, logistic regression, or random forest feature importance."
        ),
        "tools": [
            "littles_mcar_test",
            "chi_square_missingness",
            "test_uniform_missing_multilabel",
            "logistic_regression_missingness",
            "random_forest_importance"
        ]
    },

    # Stage 2: is the missingness explained by observed variables, or by the missing value itself?
    "distinguish_mar_mnar": {
        "prompt": (
            "Stage: Distinguish MAR vs MNAR\n\n"
            "Hypotheses:\n"
            "- H0: Missingness is at random (MAR).\n"
            "- H1: Missingness depends on the unobserved/missing value itself (MNAR).\n\n"
            "Dataset Preview:\n{dataset_preview}\n\n"
            "Instruction:\n"
            "Test whether missingness in '{target_col}' is fully explained by observed variables (MAR) "
            "or depends on unobserved values (MNAR).\n\n"
            "Respond only with the most appropriate tool name from the following list:\n"
            "{tools}"
        ),
        "input_template": (
            "Please analyze missingness for '{target_col}' using Heckman selection models "
            "or sensitivity analysis (extremes/bounds)."
        ),
        "tools": [
            "heckman_selection",
            "sensitivity_analysis"
        ]
    }
}

#### Mock Dataset

In [None]:

# Fully written-out example prompt for the tool-selection agent: it embeds a
# sample field summary, the target column 'age' and the tool descriptions, and
# contains no placeholders.
MOCK_PROMPT = """You are a reasoning agent for missing data classification.
Task:
Given the dataset field summary (table of data columns, their data types, and missing value ratios) and the available diagnostic tools, your job is to choose the most appropriate action (tool) to test whether the missingness of a given target column is best explained as:

- MCAR (Missing Completely At Random)
- MAR (Missing At Random)
- MNAR (Missing Not At Random)

Definitions:
- MCAR: Missingness is completely random, unrelated to observed or unobserved variables.
- MAR: Missingness depends only on observed variables (e.g., age, gender).
- MNAR: Missingness depends on the missing/unobserved value itself (e.g., high income not reported).

Dataset Field Summary (example):
| Column Name | Data Type     | Missing Ratio |
|-------------|---------------|---------------|
| age         | numeric       | 0.02          |
| income      | numeric       | 0.15          |
| gender      | categorical   | 0.01          |
| region      | categorical   | 0.00          |

Target Column:
'age'

Available Tools:
- littles_mcar_test: Correlation among missingness indicators, proxy for Little’s MCAR test.
- chi_square_missingness: Test missingness in target_col against a group_col using chi-square.
- test_uniform_missing_multilabel: Goodness-of-fit for uniform missing across labels.
- logistic_regression_missingness: Logistic regression of missingness ~ observed covariates.
- random_forest_importance: Predict missingness using observed covariates with feature importances.
- heckman_selection: Selection model to test dependence on unobserved values (MNAR suspicion).
- sensitivity_analysis: Impute with extremes/bounds to test MNAR robustness.

Instruction:
1. Carefully review the dataset field summary, target column, and tool descriptions.
2. Identify whether the missingness for the target column should be tested under MCAR, MAR, or MNAR conditions.
3. Respond **only with the single most appropriate tool name** from the provided list that should be applied first.

Output Format:
Return only the tool name (string), no reasoning or explanation."""

# Fully parameterized version of the tool-selection prompt. Placeholders:
# {input_task_description}, {input_data_field_summary}, {input_target_col},
# {input_tools_list}, {input_instruction} (presumably filled via str.format — confirm).
TOOL_CALLER_TRANSCRIPT = """You are a reasoning agent for missing data classification.
Task:
Given the dataset field summary (table of data columns, their data types, and missing value ratios) and the available diagnostic tools, your job is to choose the most appropriate action (tool) to test whether the missingness of a given target column is best explained as:
{input_task_description}

Dataset Field Summary:
{input_data_field_summary}

Target Column:
{input_target_col}

Available Tools:
{input_tools_list}

Instruction:
{input_instruction}

Output Format:
Return only the tool name (string), no reasoning or explanation."""

# Tool-selection prompt with the task description, definitions and tool list
# fixed; only {input_data_field_summary} and {input_target_col} are placeholders.
# NOTE(review): the heading still reads "Dataset Field Summary (example):" even
# though the summary is injected at runtime — confirm whether "(example)" is intended.
TOOL_CALLER_MISSING_TARGET = """You are a reasoning agent for missing data classification.
Task:
Given the dataset field summary (table of data columns, their data types, and missing value ratios) and the available diagnostic tools, your job is to choose the most appropriate action (tool) to test whether the missingness of a given target column is best explained as:

- MCAR (Missing Completely At Random)
- MAR (Missing At Random)
- MNAR (Missing Not At Random)

Definitions:
- MCAR: Missingness is completely random, unrelated to observed or unobserved variables.
- MAR: Missingness depends only on observed variables (e.g., age, gender).
- MNAR: Missingness depends on the missing/unobserved value itself (e.g., high income not reported).

Dataset Field Summary (example):
{input_data_field_summary}

Target Column:
{input_target_col}

Available Tools:
- littles_mcar_test: Correlation among missingness indicators, proxy for Little’s MCAR test.
- chi_square_missingness: Test missingness in target_col against a group_col using chi-square.
- test_uniform_missing_multilabel: Goodness-of-fit for uniform missing across labels.
- logistic_regression_missingness: Logistic regression of missingness ~ observed covariates.
- random_forest_importance: Predict missingness using observed covariates with feature importances.
- heckman_selection: Selection model to test dependence on unobserved values (MNAR suspicion).
- sensitivity_analysis: Impute with extremes/bounds to test MNAR robustness.

Instruction:
1. Carefully review the dataset field summary, target column, and tool descriptions.
2. Identify whether the missingness for the target column should be tested under MCAR, MAR, or MNAR conditions.
3. Respond **only with the single most appropriate tool name** from the provided list that should be applied first.

Output Format:
Return only the tool name (string), no reasoning or explanation."""


In [None]:
# MISSING DATA STAGE CONFIGURATION & PROMPTS

# Missing Data Classification Agent
# System prompt for the agent that chooses the next workflow step. Placeholders:
# {data_field_summary}, {target_column}, {action_history_summary}.
# NOTE(review): the procedure steps are numbered 1, 2, 3, 5 — either a step 4
# was removed or the last step is misnumbered; confirm before relying on the
# numbering in the model's answers.
MISSING_DATA_CLASSIFIER_PROMPT = """ 
You are a Data Science Project manager for a data cleaning workflow. Given the current project's workflow, task objectives, data field summary defining all data columns in the dataset, your task is to decide on the next stage to action for your team.

# Project Task Description

The project task is to distinguish the missing dataset to one of the following labels:

- MCAR (Missing Completely At Random):
  Missingness occurs entirely by chance and is unrelated to both observed and unobserved variables.
  Example: survey responses lost due to a server glitch, affecting everyone equally.

- MAR (Missing At Random):
  Missingness depends only on observed variables, not on the missing values themselves.
  Example: older participants are less likely to answer a tech question, but age is recorded in the dataset.

- MNAR (Missing Not At Random):
  Missingness depends directly on the unobserved/missing value itself, even after accounting for observed variables.
  Example: people with very high income choose not to report their salary, specifically because of its value.

# Project Procedure Steps

1. Describe the testing method suitable with the datatype of the given data column type.
2. If MCAR is rejected, test for MAR. Ensure the test aligns with the data column and data types.
3. Only classify as MNAR if residual dependence suggests missingness depends on unobserved values.
5. End with the classification label: MCAR, MAR, or MNAR.

# Current Project Step

Data Field Summary:
{data_field_summary}

Target Data Field Name
{target_column}

# Previous History Actions

Previous Actions
{action_history_summary}

Given the information above, respond the next stage to action from one of the listed steps under Project Procedure.
"""


#AVAILABLE_FUNCTIONS = {}

### Workflow

In [None]:
# pip install ollama

# Default keyword arguments for client.chat(); OllamaBasement falls back to
# these when it is constructed without extra keyword arguments.
CHAT_CONFIG = dict(
    stream=False,
    # think='low',
    options=DEFAULT_OPTIONS,
    keep_alive='15m',
    # tools = FUNCTION CALLABLES
)

class OllamaBasement:
    """
    Minimal decorator for wrapping functions that return the user prompt text.
    Only supports chat mode (no generate).

    The decorated function's return value is sent as the user message, preceded
    by this instance's system prompt; the chat response object is returned.
    """

    def __init__(self, model: str, prompt: Instructor, **kwargs: Any):
        """Store the model, the prompt and the chat keyword arguments.

        Args:
            model: Ollama model ID used for every chat call.
            prompt: Instructor whose ``prompt`` attribute is the system prompt;
                any other object is converted with ``str()``.
            **kwargs: Keyword arguments forwarded to ``client.chat``. When none
                are given, a copy of ``CHAT_CONFIG`` is used.
        """
        self.model_id = model
        self.prompt = prompt
        # Copy, so changes to one instance's kwargs never leak into the shared
        # module-level CHAT_CONFIG (or into other instances).
        self.kwargs = dict(kwargs) if kwargs else dict(CHAT_CONFIG)

    @property
    def system_prompt(self) -> List[Dict[str, str]]:
        """Return the system prompt as a one-element chat message list."""
        text = self.prompt.prompt if isinstance(self.prompt, Instructor) else str(self.prompt)
        return [{"role": "system", "content": text}]

    # `Callable[...]` (a bare ellipsis) raises TypeError as soon as the class is
    # defined; typing.Callable needs both the argument spec and the return type.
    def __call__(self, fn: Callable[..., str]):
        """Decorate ``fn`` so that calling it returns the model's chat response."""
        import functools  # local import: the surrounding cell does not import functools

        @functools.wraps(fn)  # keep fn's name/docstring on the wrapper
        def wrapper(*args, **kwargs):
            inputs: str = fn(*args, **kwargs)
            messages = self.system_prompt + [{"role": "user", "content": inputs}]
            client = ollama.Client(LIGHTNING_OLLAMA_HOST_URL)
            # Errors propagate unchanged; the full response object (not just
            # its text) is returned to the caller.
            return client.chat(model=self.model_id, messages=messages, **self.kwargs)

        return wrapper

In [130]:
# Instantiate the narrator for the whole data cleaning pipeline.
# `narrate_interval` has no default in Summarizer.__init__, so it must be passed.
# NOTE(review): the recorded run failed earlier, with
# "TypeError: object.__new__() takes exactly one argument"; presumably a base
# class forwards constructor arguments to object.__new__ — confirm in GabyBasement.
writer = Summarizer(
            project_stages="Data Cleaning Pipeline: 1) Data Loading, 2) Missing Value Detection, 3) Outlier Analysis, 4) Data Validation",
            narrate_interval=K_INTERVAL,
)

TypeError: object.__new__() takes exactly one argument (the type to instantiate)