In [1]:
import httpx
import json
from typing import Optional, Dict, Any, Iterator

In [2]:
import os  # kept inside the config cell so the cell is runnable on its own

# Connection settings for the API under test.
# Each value can be overridden with an environment variable so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the original local-development defaults.
BASE_URL = os.environ.get("LEX_BASE_URL", "http://host.docker.internal:8000")
USERNAME = os.environ.get("LEX_USERNAME", "admin")
PASSWORD = os.environ.get("LEX_PASSWORD", "admin")


In [3]:
# Smoke test: log in, probe the health endpoint, then list the available models.
# The client is created outside the `try` so the `finally` can always close it.
client = httpx.Client(base_url=BASE_URL, timeout=30.0)

try:
    print(f"--- 1. Connecting & Authenticating to {BASE_URL} ---")

    # Login
    login_response = client.post(
        "/api/auth/login",
        json={"username": USERNAME, "password": PASSWORD, "rememberMe": False}
    )
    login_response.raise_for_status()

    # Extract token and set headers for future requests
    login_data = login_response.json()
    token = login_data["token"]
    client.headers.update({"Authorization": f"Bearer {token}"})

    print("‚úÖ Login Successful")
    print(f"User: {login_data.get('user', {}).get('username')}")
    print(f"Token (first 20 chars): {token[:20]}...")

    print("\n--- 2. Testing Connection (Health Check) ---")

    # The health check is best-effort: a failure here must not prevent the
    # model listing below from running.
    try:
        health = client.get("/api/health")
        if health.status_code == 200:
            print("‚úÖ Health Check Passed")
            print(health.json())
        else:
            print(f"‚ö†Ô∏è Health Check returned status: {health.status_code}")
    except (httpx.HTTPError, ValueError):
        # `client.get` never raises HTTPStatusError on its own (only
        # `raise_for_status()` does), so catching just that class was dead
        # code. HTTPError also covers transport failures (timeouts, refused
        # connections); ValueError covers a 200 response whose body is not JSON.
        print("‚ö†Ô∏è Health Check failed or endpoint not found.")

    print("\n--- 3. Fetching Available Models ---")

    # Get Models
    models_response = client.get("/api/models")
    models_response.raise_for_status()
    models = models_response.json()

    print(f"‚úÖ Found {len(models)} models:")
    for m in models:
        # Adjust 'id' or 'name' based on the actual response structure.
        # NOTE(review): the recorded output of this cell shows "Unknown ID" for
        # every entry, i.e. the server did not send an 'id' key on that run.
        print(f" - {m.get('id', 'Unknown ID')}: {m.get('name', 'No Name')}")

except httpx.HTTPStatusError as e:
    # Raised by the raise_for_status() calls above (login / model listing).
    print(f"‚ùå HTTP Error: {e.response.status_code} - {e.response.text}")
except Exception as e:
    print(f"‚ùå Connection Error: {e}")
finally:
    client.close()

--- 1. Connecting & Authenticating to http://host.docker.internal:8000 ---
‚úÖ Login Successful
User: admin
Token (first 20 chars): eyJhbGciOiJIUzI1NiIs...

--- 2. Testing Connection (Health Check) ---
‚úÖ Health Check Passed
{'status': 'healthy'}

--- 3. Fetching Available Models ---
‚úÖ Found 3 models:
 - Unknown ID: deepseek-v3.2:cloud
 - Unknown ID: mistral-large-3:675b-cloud
 - Unknown ID: kimi-k2-thinking:cloud


In [None]:
import httpx
import json
from deepeval.test_case import LLMTestCase

import os  # kept next to the settings so this cell is runnable on its own

# Connection settings and the question used by run_debug_script().
# The endpoint and credentials can be overridden with environment variables so
# real secrets are never committed with the notebook; the fallbacks are the
# original local-development defaults.
BASE_URL = os.environ.get("LEX_BASE_URL", "http://host.docker.internal:8000")
USERNAME = os.environ.get("LEX_USERNAME", "admin")
PASSWORD = os.environ.get("LEX_PASSWORD", "admin")
QUESTION = "How is the chair of Food Standards Scotland appointed"

def run_debug_script():
    """Log in, stream one chat answer and package it as a DeepEval test case.

    Steps: authenticate against ``/api/auth/login``, pick a model from
    ``/api/models`` (the second entry when there is more than one, otherwise
    the first), POST ``QUESTION`` to ``/api/chat`` and read the response as
    ``data: ``-prefixed JSON lines. ``token`` events are concatenated into the
    answer, ``tool_result`` events are collected as retrieval context, and a
    summary is printed along the way.

    Returns:
        LLMTestCase built from ``QUESTION``, the streamed answer and the
        collected tool results, or ``None`` when login fails or the API
        reports no models.

    Raises:
        httpx.HTTPStatusError: if the model listing or the chat request
            returns an error status.
    """
    print(f"Targeting: {BASE_URL}")

    with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:

        # 1. Login & Auth
        print("\nüîê Logging in...")
        try:
            login_resp = client.post("/api/auth/login",
                                     json={"username": USERNAME, "password": PASSWORD})
            login_resp.raise_for_status()
            token = login_resp.json()["token"]
            client.headers.update({"Authorization": f"Bearer {token}"})
            print("‚úÖ Login successful")
        except Exception as e:
            print(f"‚ùå Login failed: {e}")
            return None

        # 2. Get Models
        print("\nü§ñ Fetching models...")
        models_resp = client.get("/api/models")
        models_resp.raise_for_status()
        models = models_resp.json()

        # An empty list used to crash with IndexError on models[0].
        if not models:
            print("No models returned by /api/models; nothing to query.")
            return None

        # Select a model: prefer the second entry when one exists.
        chosen_model = models[1] if len(models) > 1 else models[0]
        model_id = chosen_model.get("id") or chosen_model.get("name")

        print(f"üëâ Using model: {model_id}")

        # 3. Generate Answer AND Capture Retrieval Context (Streaming)
        print(f"\nüí¨ Sending Query: '{QUESTION}'")
        print("‚è≥ Streaming answer...\n")

        chat_payload = {
            "messages": [{"role": "user", "content": QUESTION}],
            "model": model_id,
            "deep_research": True  # Important: enables tool calling!
        }

        actual_output = ""
        retrieval_context = []  # Capture context during streaming

        # Connect to stream
        with client.stream("POST", "/api/chat", json=chat_payload) as response:
            response.raise_for_status()

            # Iterate over SSE lines
            for line in response.iter_lines():
                if not line.startswith("data: "):
                    continue

                json_str = line[6:]
                if not json_str.strip() or json_str == "[DONE]":
                    continue

                try:
                    data = json.loads(json_str)
                except json.JSONDecodeError as e:
                    print(f"\n‚ö†Ô∏è  JSON decode error: {e}")
                    continue

                # Valid JSON that is not an object carries no event; skipping
                # it avoids an AttributeError on `.get` below.
                if not isinstance(data, dict):
                    continue

                event_type = data.get("type")

                # Capture streaming tokens
                if event_type == "token":
                    # `or ""` also covers an explicit null content.
                    chunk = data.get("content") or ""
                    actual_output += chunk
                    print(chunk, end="", flush=True)

                # Capture tool calls
                elif event_type == "tool_call":
                    tool_name = data.get("tool_name", "unknown")
                    arguments = data.get("arguments", {})
                    print(f"\n\nüîß [TOOL CALL: {tool_name}]")
                    print(f"   Arguments: {json.dumps(arguments, indent=2)}")

                # Capture tool results (THIS IS THE RETRIEVAL CONTEXT!)
                elif event_type == "tool_result":
                    tool_name = data.get("tool_name", "unknown")
                    result = data.get("result") or ""
                    # A structured (non-string) result is serialised so that
                    # `.strip()` and slicing below cannot fail.
                    if not isinstance(result, str):
                        result = json.dumps(result, ensure_ascii=False)

                    if result.strip():
                        retrieval_context.append(result)

                        print(f"\n\nüì• [TOOL RESULT: {tool_name}]")
                        print(f"   Retrieved {len(result)} characters")
                        preview = result[:200] + "..." if len(result) > 200 else result
                        print(f"   Preview: {preview}\n")

                # Capture final result (if provided)
                elif event_type == "result":
                    final_message = data.get("message", "")
                    # Only used as a fallback when no tokens were streamed.
                    if final_message and not actual_output:
                        actual_output = final_message
                    print("\n\n‚úÖ Response complete")

                # Handle errors
                elif event_type == "error":
                    error_msg = data.get("error", "Unknown error")
                    print(f"\n\n‚ùå Error: {error_msg}")

        print("\n\n" + "="*80)

        # 4. Verify we got context
        print(f"\nüìä CAPTURE SUMMARY")
        print("="*80)
        print(f"‚úÖ Generated output: {len(actual_output)} characters")
        print(f"‚úÖ Retrieved context: {len(retrieval_context)} documents")

        if not retrieval_context:
            print("\n‚ö†Ô∏è  WARNING: No retrieval context captured!")
            print("   This could mean:")
            print("   - deep_research was not enabled")
            print("   - No tools were called for this query")
            print("   - Tool results were empty")
        else:
            print(f"\nüìö Context Documents:")
            for i, ctx in enumerate(retrieval_context, 1):
                print(f"\n   Document {i}: {len(ctx)} chars")
                print(f"   Preview: {ctx[:150]}...")

        # 5. Prepare DeepEval Test Case
        print("\n" + "="*80)
        print("üß™ DEEPEVAL TEST CASE")
        print("="*80)

        test_case = LLMTestCase(
            input=QUESTION,
            actual_output=actual_output,
            retrieval_context=retrieval_context
        )

        print(f"\nINPUT:")
        print(f"{test_case.input}")

        print(f"\nACTUAL OUTPUT ({len(test_case.actual_output)} chars):")
        print(f"{test_case.actual_output[:300]}...")

        print(f"\nRETRIEVAL CONTEXT ({len(test_case.retrieval_context)} documents):")
        for i, ctx in enumerate(test_case.retrieval_context, 1):
            print(f"\n--- Document {i} ({len(ctx)} chars) ---")
            print(f"{ctx[:200]}...")

        print("\n" + "="*80)
        print("‚úÖ Test case ready for evaluation!")
        print("\nNext steps:")
        print("  1. from deepeval.metrics import FaithfulnessMetric")
        print("  2. metric = FaithfulnessMetric(threshold=0.7, model='gpt-4')")
        print("  3. metric.measure(test_case)")
        print("  4. print(metric.score, metric.reason)")

        return test_case

# Run the debug flow and keep the result in the notebook-level name `test_case`
# so later cells can inspect it. run_debug_script() returns None when login
# fails, so `test_case` may be None.
if __name__ == "__main__":
    test_case = run_debug_script()

In [None]:
import httpx
import json
from deepeval.test_case import LLMTestCase, ToolCall

import os  # kept next to the settings so this cell is runnable on its own

# Endpoint and login payload used by get_authenticated_client().
# Values can be overridden with environment variables so real credentials are
# never committed with the notebook; the fallbacks are the original
# local-development defaults.
BASE_URL = os.environ.get("LEX_BASE_URL", "http://host.docker.internal:8000")
AUTH_DATA = {
    "username": os.environ.get("LEX_USERNAME", "admin"),
    "password": os.environ.get("LEX_PASSWORD", "admin"),
}

def get_authenticated_client():
    """
    Build an httpx client for BASE_URL and log it in with AUTH_DATA.

    The client is deliberately not wrapped in a ``with`` block: ownership is
    handed to the caller, who must close it. On any failure during login the
    client is closed here, the error is printed, and the exception is re-raised.

    Returns:
        httpx.Client whose default headers carry the bearer token.
    """
    # timeout=None turns off httpx's request timeouts for this client.
    session = httpx.Client(base_url=BASE_URL, timeout=None)

    print("üîê Logging in...")
    try:
        login = session.post("/api/auth/login", json=AUTH_DATA)
        login.raise_for_status()

        bearer = login.json()["token"]
        session.headers.update({"Authorization": f"Bearer {bearer}"})
        print("‚úÖ Login successful")
    except Exception as exc:
        # Do not leak the connection pool when authentication fails.
        session.close()
        print(f"‚ùå Authentication failed: {exc}")
        raise

    return session

def capture_legal_test_case(client, question, model_id, deep_research=False):
    """Run one streamed chat query and capture it as a DeepEval test case.

    The request is POSTed to ``/api/system/chat`` and the response is read as
    ``data: ``-prefixed JSON lines until ``[DONE]``. Events are handled by
    their ``type``:

    * ``tool_start``   - the tool name is recorded as a ``ToolCall``.
    * ``token``        - ``content`` is appended to the answer text.
    * ``api_call_end`` - ``response.full_text`` (preferred) or each described
      entry of ``response.results`` is added to the retrieval context.
    * ``tool_result``  - ``result`` is added to the context as an agent summary.

    Malformed or unexpectedly shaped events are skipped rather than aborting
    the capture.

    Args:
        client: Authenticated client exposing an httpx-style ``stream()``.
        question: User question sent as the only chat message.
        model_id: Model identifier forwarded in the payload.
        deep_research: Forwarded as the ``deep_research`` payload flag.

    Returns:
        LLMTestCase with the answer, the tools called and the de-duplicated
        retrieval context (first-seen order preserved).

    Raises:
        httpx.HTTPStatusError: if the chat endpoint returns an error status.
    """
    chat_payload = {
        "messages": [{"role": "user", "content": question}],
        "model": model_id,
        "deep_research": deep_research,
        "stream": True
    }

    actual_output = ""
    retrieval_context = []
    tools_used = []

    print(f"‚è≥ Processing legal research for: '{question}'")

    with client.stream("POST", "/api/system/chat", json=chat_payload) as response:
        response.raise_for_status()

        for line in response.iter_lines():
            if not line.startswith("data: "):
                continue

            json_str = line[6:]
            if json_str == "[DONE]":
                break

            try:
                data = json.loads(json_str)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object has no event type to dispatch on.
            if not isinstance(data, dict):
                continue

            event_type = data.get("type")

            if event_type == "tool_start":
                tool_name = data.get("tool")
                if tool_name:
                    tools_used.append(ToolCall(name=tool_name))

            # Capture LLM tokens (`or ""` also covers an explicit null content)
            elif event_type == "token":
                actual_output += data.get("content") or ""

            elif event_type == "api_call_end":
                resp_body = data.get("response")
                # A null or non-object response used to raise TypeError /
                # AttributeError here and abort the whole capture.
                if not isinstance(resp_body, dict):
                    continue

                # PRIORITY 1: Full legislation text (the most important for Faithfulness)
                if "full_text" in resp_body:
                    legislation = resp_body.get("legislation")
                    if isinstance(legislation, dict):
                        legislation_title = legislation.get("title", "Unknown Act")
                    else:
                        legislation_title = "Unknown Act"
                    content = resp_body["full_text"]
                    retrieval_context.append(f"Source: {legislation_title}\nContent: {content}")

                # PRIORITY 2: Search results (if full text wasn't fetched yet)
                elif "results" in resp_body:
                    for item in resp_body["results"] or []:
                        if not isinstance(item, dict):
                            continue
                        title = item.get("title", "Unknown Source")
                        desc = item.get("description", "")
                        if desc:
                            retrieval_context.append(f"Source: {title}\nSummary: {desc}")

            elif event_type == "tool_result":
                # The 'delegate_research' tool result provides a summary of the agent's findings.
                # This is useful for 'Answer Relevancy' but 'Faithfulness' needs the raw full_text above.
                res_text = data.get("result", "")
                if res_text:
                    retrieval_context.append(f"Agent Summary: {res_text}")

    # dict.fromkeys de-duplicates while keeping first-seen order.
    return LLMTestCase(
        input=question,
        actual_output=actual_output,
        tools_called=tools_used,
        retrieval_context=list(dict.fromkeys(retrieval_context))
    )



In [None]:
from deepeval.test_case import LLMTestCase, ToolCall
import json

def capture_with_audit(client, question, model_id, deep_research=False):
    """Run one streamed chat query and capture answer, context and tool audit.

    The response from ``/api/system/chat`` is read as ``data: ``-prefixed JSON
    lines until ``[DONE]``. A small state machine tracks the tool currently in
    flight:

    * ``tool_start``     - closes any tool still open, then starts a new one.
    * ``api_call_start`` - stores ``payload`` as the tool's input parameters.
    * ``api_call_end``   - adds ``response.full_text`` or the non-empty result
      descriptions to the retrieval context and stores a (truncated) string
      form of the response as the tool's output.
    * ``tool_end``       - emits the ``ToolCall``, falling back to the event's
      ``result`` when no API response was captured.
    * ``token``          - appended to the answer text.

    A tool that is still open when the stream ends is emitted as well.

    Args:
        client: Authenticated client exposing an httpx-style ``stream()``.
        question: User question sent as the only chat message.
        model_id: Model identifier forwarded in the payload.
        deep_research: Forwarded as the ``deep_research`` payload flag.

    Returns:
        LLMTestCase with the answer, de-duplicated retrieval context and the
        captured tool calls.

    Raises:
        httpx.HTTPStatusError: if the chat endpoint returns an error status.
    """
    chat_payload = {
        "messages": [{"role": "user", "content": question}],
        "model": model_id,
        "deep_research": deep_research,
        "stream": True
    }

    print(f"‚è≥ Auditing research for: '{question}'")

    max_output_chars = 1000  # cap on the stored string form of an API response

    actual_output = ""
    retrieval_context = []
    tools_captured = []

    # --- STATE MACHINE VARIABLES ---
    # Holds the tool currently being assembled; an empty dict means "none".
    current_tool = {}

    def record_tool(tool, fallback_output):
        """Append `tool` to tools_captured as a ToolCall, if it has a name."""
        if not tool.get("name"):
            return
        tools_captured.append(ToolCall(
            name=tool["name"],
            input_parameters=tool.get("input_parameters") or {},
            # The "output" key always exists (initialised to None), so the
            # fallback has to be applied with `or`, not as a .get() default.
            output=tool.get("output") or fallback_output
        ))

    with client.stream("POST", "/api/system/chat", json=chat_payload) as response:
        # Fail loudly on 4xx/5xx instead of returning an empty test case.
        response.raise_for_status()

        for line in response.iter_lines():
            if not line.startswith("data: "):
                continue
            json_str = line[6:]
            if json_str == "[DONE]":
                break

            try:
                data = json.loads(json_str)
            except json.JSONDecodeError:
                continue

            # Valid JSON that is not an object has no event type to dispatch on.
            if not isinstance(data, dict):
                continue

            event_type = data.get("type")

            # 1. Capture the answer text
            if event_type == "token":
                actual_output += data.get("content") or ""

            # 2. START A NEW TOOL
            elif event_type == "tool_start":
                # Failsafe: a previous tool that never saw tool_end is saved.
                record_tool(current_tool, "No output captured")
                current_tool = {
                    "name": data.get("tool"),
                    "input_parameters": {},
                    "output": None
                }

            # 3. CAPTURE INPUTS (the API payload)
            elif event_type == "api_call_start":
                if current_tool:
                    current_tool["input_parameters"] = data.get("payload", {})

            # 4. CAPTURE OUTPUTS (the API response)
            elif event_type == "api_call_end":
                resp = data.get("response")
                if not isinstance(resp, dict):
                    resp = {}

                # A: Save for Faithfulness (context)
                if "full_text" in resp:
                    retrieval_context.append(resp["full_text"])
                elif "results" in resp:
                    for r in resp["results"] or []:
                        description = r.get("description", "") if isinstance(r, dict) else ""
                        # Empty descriptions add no context, only noise.
                        if description:
                            retrieval_context.append(description)

                # B: Save for auditing (tool output)
                if current_tool:
                    serialized = str(resp)
                    # Mark truncation only when something was actually cut.
                    if len(serialized) > max_output_chars:
                        serialized = serialized[:max_output_chars] + "..."
                    current_tool["output"] = serialized

            # 5. CLOSE THE TOOL
            elif event_type == "tool_end":
                if current_tool.get("name"):
                    # If api_call_end was missed, use the generic result.
                    record_tool(current_tool, str(data.get("result", "")))
                    current_tool = {}

    # A tool still open when the stream ended would otherwise be lost.
    record_tool(current_tool, "No output captured")

    return LLMTestCase(
        input=question,
        actual_output=actual_output,
        retrieval_context=list(dict.fromkeys(retrieval_context)),
        tools_called=tools_captured
    )

In [None]:
import json

def inspect_tool_outputs(client, question, model_id):
    """
    Diagnostic helper: dump the raw JSON of tool-related stream events.

    Sends `question` to /api/system/chat with deep research switched off and
    reads the response as "data: "-prefixed JSON lines until "[DONE]". Every
    `api_call_end` / `tool_result` event is pretty-printed in full; each
    `token` event prints a single dot as a progress marker. Lines that are not
    valid JSON are ignored. Nothing is returned.
    """
    request_body = {
        "messages": [{"role": "user", "content": question}],
        "model": model_id,
        "deep_research": False,  # web/deep research intentionally off
        "stream": True
    }
    sse_prefix = "data: "

    print(f"\nüîç INSPECTING RAW TOOL RESPONSES FOR: '{question}'")
    print("-" * 60)

    with client.stream("POST", "/api/system/chat", json=request_body) as stream:
        stream.raise_for_status()

        for raw_line in stream.iter_lines():
            if raw_line.startswith(sse_prefix):
                body = raw_line[len(sse_prefix):]
                if body == "[DONE]":
                    break

                try:
                    event = json.loads(body)
                    kind = event.get("type")

                    if kind == "token":
                        # Progress marker only; token text is not shown.
                        print(".", end="", flush=True)
                    elif kind in ("api_call_end", "tool_result"):
                        # These events carry the retrieved data; show all keys.
                        print(f"\nüì¶ EVENT TYPE: {kind}")
                        print(json.dumps(event, indent=4))
                        print("-" * 30)
                except json.JSONDecodeError:
                    pass
    print("\n\n‚úÖ Inspection Complete.")

In [None]:
# Driver cell: authenticate once, run exactly one of the capture/inspection
# helpers against the shared client, and always close the client afterwards.
if __name__ == "__main__":
    # The client is created and authenticated here
    client = get_authenticated_client()
    
    try:
        legal_query = "How is the chair of Food Standards Scotland appointed"
        # Pass the already open client to your capture function
        
        # Alternative runs (uncomment one to build a DeepEval test case instead):
        #test_case = capture_legal_test_case(client, legal_query, "mistral-large-3:675b-cloud")
        #test_case = capture_with_audit(client, legal_query, "mistral-large-3:675b-cloud")
        # inspect_tool_outputs() only prints and has no return statement, so
        # `inspect_case` is None after this call.
        inspect_case = inspect_tool_outputs(client, legal_query, "mistral-large-3:675b-cloud")
    finally:
        # Crucial: Always close the client when finished to free up Docker resources
        client.close()
        print("üîå Client connection closed.")

In [None]:
# NOTE(review): among the visible cells, `test_case` is only assigned by the
# run_debug_script() cell; the capture_* assignments in the driver cell are
# commented out. This display therefore depends on that earlier cell having
# been run in the same kernel session.
test_case