In [1]:
import os
import sys
import asyncio
import sqlite3
import json
import logging
import re
from typing import Dict, Any, List, Optional
from dotenv import load_dotenv

# Make the project's modules under ../src importable. The membership check
# keeps a re-run of this cell from stacking duplicate entries on sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

# Load variables from a .env file into the process environment.
load_dotenv()

# Root logger: INFO and above, each record stamped with time, logger name
# and severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Only let warnings (and worse) through from autogen_core to cut the noise.
logging.getLogger('autogen_core').setLevel(logging.WARNING)

In [2]:
from pathlib import Path
from keyvalue_memory import KeyValueMemory
from task_context_manager import TaskContextManager
from query_tree_manager import QueryTreeManager
from database_schema_manager import DatabaseSchemaManager
from node_history_manager import NodeHistoryManager
from query_analyzer_agent import QueryAnalyzerAgent
from schema_reader import SchemaReader
from schema_linker_agent import SchemaLinkerAgent

# Location of the BIRD dataset. Set the BIRD_DATA_PATH environment variable
# to point at a different checkout; when it is unset the original
# machine-specific path is used, so existing setups keep working.
data_path = os.environ.get(
    "BIRD_DATA_PATH",
    "/home/norman/work/text-to-sql/MAC-SQL/data/bird",
)
# Table metadata file inside the dataset directory.
tables_json_path = Path(data_path) / "dev_tables.json"
# Database this notebook works against.
db_name = "california_schools"

In [3]:
task_id = "experimental-test"

query = "What is the highest eligible free rate for K-12 students in schools in Alameda County?"
node_intent = "Find the maximum eligible free rate for K-12 students in schools located in Alameda County"
memory = KeyValueMemory()
        
# Initialize task
task_manager = TaskContextManager(memory)
await task_manager.initialize(task_id, query, db_name)

# Load schema
schema_manager = DatabaseSchemaManager(memory)
await schema_manager.initialize()

schema_reader = SchemaReader(
    data_path=data_path,
    tables_json_path=str(tables_json_path),
    dataset_name="bird",
    lazy=False
)
await schema_manager.load_from_schema_reader(schema_reader, db_name)

tree_manager = QueryTreeManager(memory)
node_id = await tree_manager.initialize(query)

2025-05-29 17:06:22,814 - TaskContextManager - INFO - Initialized task context for task experimental-test
2025-05-29 17:06:22,815 - DatabaseSchemaManager - INFO - Initialized empty database schema


load json file from /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_tables.json

Loading all database info...
Found 11 databases in bird dataset


2025-05-29 17:06:35,349 - DatabaseSchemaManager - INFO - Initialized empty database schema
2025-05-29 17:06:35,350 - DatabaseSchemaManager - INFO - Added table 'frpm' to schema
2025-05-29 17:06:35,350 - DatabaseSchemaManager - INFO - Added table 'satscores' to schema
2025-05-29 17:06:35,351 - DatabaseSchemaManager - INFO - Added table 'schools' to schema
2025-05-29 17:06:35,351 - DatabaseSchemaManager - INFO - Loaded schema for database 'california_schools' with 3 tables
2025-05-29 17:06:35,351 - QueryTreeManager - INFO - Initialized query tree with root node root


In [5]:
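# Debugging aid, intentionally left disabled: uncomment to inspect the value
# stored under the 'databaseSchema' memory key (presumably the schema loaded
# earlier in this notebook -- verify).
# await memory.get('databaseSchema')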

In [6]:
# Build the schema-linking agent on top of the shared memory. With debug=True
# the agent's DEBUG-level records show up in this cell's output.
agent = SchemaLinkerAgent(
    memory,
    llm_config=dict(model_name="gpt-4o", temperature=0.1, timeout=60),
    debug=True,
)

2025-05-29 17:08:38,606 - SchemaLinkerAgent - DEBUG - Created AssistantAgent: schema_linker
2025-05-29 17:08:38,607 - SchemaLinkerAgent - DEBUG - Created MemoryAgentTool for schema_linker
2025-05-29 17:08:38,607 - SchemaLinkerAgent - INFO - Initialized schema_linker with model gpt-4o


In [7]:
# Run the schema linker on the question. This issues a live chat-completions
# request (see the httpx line in the output); the returned object's
# .messages are printed further down.
result = await agent.run(query)

2025-05-29 17:08:39,622 - SchemaLinkerAgent - INFO - Available tables in schema: ['frpm', 'satscores', 'schools']
2025-05-29 17:08:39,623 - SchemaLinkerAgent - INFO - Schema linking context prepared for node: root
2025-05-29 17:08:39,623 - SchemaLinkerAgent - INFO - Node details: {'nodeId': 'root', 'status': 'created', 'childIds': [], 'intent': 'What is the highest eligible free rate for K-12 students in schools in Alameda County?', 'schema_linking': {}, 'generation': {}, 'evaluation': {}}
2025-05-29 17:08:52,361 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-29 17:08:52,365 - SchemaLinkerAgent - INFO - Raw LLM output: ```xml
<schema_linking>
  <available_schema>
    <tables>
      <table name="frpm">
        <columns>
          <column name="CDSCode" type="text" sample_values="['01', '02', '03']"/>
          <column name="County Name" type="text" sample_values="['Los Angeles', 'San Diego', 'Alameda']"/>
          <column name="

In [8]:
# Dump the conversation: a "[source]:" header (falling back to 'Unknown' when
# a message has no .source), the message body, then a 40-dash divider.
divider = "-" * 40
for message in result.messages:
    print(f"\n[{getattr(message, 'source', 'Unknown')}]:")
    print(message.content, divider, sep="\n")


[user]:
I'm providing you with context from previous interactions:

### Original Query
What is the highest eligible free rate for K-12 students in schools in Alameda County?

### Database Name
california_schools

### Current Node
{
  "nodeId": "root",
  "status": "created",
  "childIds": [],
  "intent": "What is the highest eligible free rate for K-12 students in schools in Alameda County?",
  "schema_linking": {},
  "generation": {},
  "evaluation": {}
}

### Full Schema
<database_schema>
  <total_tables>3</total_tables>
  <tables>
    <table name="frpm">
      <column_count>29</column_count>
      <columns>
        <column name="CDSCode">
          <type>text</type>
          <nullable>True</nullable>
          <primary_key>true</primary_key>
          <foreign_key>
            <references_table>schools</references_table>
            <references_column>CDSCode</references_column>
          </foreign_key>
        </column>
        <column name="Academic Year">
          <type>text</t