# Tools Test Notebook

This notebook tests the functionality of the updated `tools.py` module in the dispatcher package, which now uses the actual SchemaManager and SQLExecutor instead of mock data.

## 1. Setup and Environment

In [1]:
import os
import sys
import json
from typing import Dict, List, Any, Tuple

# Add the project root to the path to import the code
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

# Import the tools module
from dispatcher.tools import (
    read_database_schema_and_records,
    execute_sql_and_return_output,
    bird_schema_manager,
    bird_sql_executor,
    spider_schema_manager,
    spider_sql_executor,
    BIRD_DATA_PATH,
    BIRD_TABLES_JSON
)

# Import core utilities
from core.utils import load_json_file

load json file from /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_tables.json
load json file from /home/norman/work/text-to-sql/MAC-SQL/data/spider/tables.json


## 2. Verify Environment Configuration

In [2]:
# Verify paths and configuration
print("Checking configuration and paths...")
print(f"Bird data path: {BIRD_DATA_PATH}")
print(f"Bird tables JSON: {BIRD_TABLES_JSON}")
print(f"Bird schema manager initialized: {bird_schema_manager is not None}")
print(f"Bird SQL executor initialized: {bird_sql_executor is not None}")
print(f"Spider schema manager initialized: {spider_schema_manager is not None}")
print(f"Spider SQL executor initialized: {spider_sql_executor is not None}")

# Load Bird database IDs
bird_tables_data = load_json_file(BIRD_TABLES_JSON) if os.path.exists(BIRD_TABLES_JSON) else []
bird_db_ids = [item["db_id"] for item in bird_tables_data]
print(f"\nNumber of Bird databases: {len(bird_db_ids)}")
if bird_db_ids:
    print(f"Sample Bird database IDs: {bird_db_ids[:5]}")

Checking configuration and paths...
Bird data path: /home/norman/work/text-to-sql/MAC-SQL/data/bird
Bird tables JSON: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_tables.json
Bird schema manager initialized: True
Bird SQL executor initialized: True
Spider schema manager initialized: True
Spider SQL executor initialized: True
load json file from /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_tables.json

Number of Bird databases: 11
Sample Bird database IDs: ['debit_card_specializing', 'financial', 'formula_1', 'california_schools', 'card_games']


## 3. Test Database Schema Retrieval

Test the `read_database_schema_and_records` function with different parameters.

In [3]:
# Select a test database from BIRD
test_db_id = bird_db_ids[0] if bird_db_ids else None

if test_db_id:
    print(f"Testing with database: {test_db_id}")
    
    # Test 1: Basic schema retrieval (no sample data)
    print("\nTest 1: Basic schema retrieval (no sample data)")
    schema = read_database_schema_and_records(test_db_id)
    # Check if there's an error
    if isinstance(schema, dict) and "error" in schema:
        print(f"Error retrieving schema: {schema['error']}")
    else:
        # Print schema overview
        print(f"Retrieved schema with {len(schema)} tables")
        for table_name, table_info in schema.items():
            columns = table_info.get("columns", {})
            foreign_keys = table_info.get("foreign_keys", [])
            print(f"  - Table: {table_name}")
            print(f"    Columns: {len(columns)}")
            print(f"    Foreign keys: {len(foreign_keys)}")
            
            # Show a few column examples
            if columns:
                print("    Column examples:")
                for col_name, col_type in list(columns.items())[:3]:
                    print(f"      {col_name}: {col_type}")
                
            # Show foreign key examples
            if foreign_keys:
                print("    Foreign key examples:")
                for fk in foreign_keys[:2]:
                    print(f"      {fk[0]} -> {fk[1]}.{fk[2]}")
else:
    print("No test database available")

Testing with database: debit_card_specializing

Test 1: Basic schema retrieval (no sample data)
Generating schema description for database 'debit_card_specializing'...
Retrieved schema with 5 tables
  - Table: customers
    Columns: 3
    Foreign keys: 0
    Column examples:
      CustomerID: CustomerID
      Segment: client segment
      Currency: Currency
  - Table: gasstations
    Columns: 4
    Foreign keys: 0
    Column examples:
      GasStationID: Gas Station ID
      ChainID: Chain ID
      Country: Country
  - Table: products
    Columns: 2
    Foreign keys: 0
    Column examples:
      ProductID: Product ID
      Description: Description
  - Table: transactions_1k
    Columns: 9
    Foreign keys: 0
    Column examples:
      TransactionID: Transaction ID
      Date: Date
      Time: Time
  - Table: yearmonth
    Columns: 3
    Foreign keys: 1
    Column examples:
      CustomerID: Customer ID
      Date: Date
      Consumption: Consumption
    Foreign key examples:
      Cust

In [4]:
# Test 2: Schema retrieval with sample data
if test_db_id:
    print("\nTest 2: Schema retrieval with sample data")
    
    # Get all tables first to select a subset
    all_tables_schema = read_database_schema_and_records(test_db_id)
    
    # Check if there's an error
    if isinstance(all_tables_schema, dict) and "error" in all_tables_schema:
        print(f"Error retrieving schema: {all_tables_schema['error']}")
    else:
        table_names = list(all_tables_schema.keys())
        
        # Select up to 2 tables for sample data retrieval
        sample_tables = table_names[:min(2, len(table_names))]
        
        # Retrieve schema with sample data for specific tables
        schema_with_samples = read_database_schema_and_records(
            test_db_id,
            table_names=sample_tables,
            include_sample_data=True
        )
        
        # Check if there's an error
        if isinstance(schema_with_samples, dict) and "error" in schema_with_samples:
            print(f"Error retrieving schema with samples: {schema_with_samples['error']}")
        else:
            # Print schema and sample data
            print(f"Retrieved schema for {len(schema_with_samples)} tables with sample data")
            for table_name, table_info in schema_with_samples.items():
                columns = table_info.get("columns", {})
                sample_data = table_info.get("sample_data", [])
                
                print(f"\n  Table: {table_name}")
                print(f"  Columns: {len(columns)}")
                print(f"  Sample data rows: {len(sample_data)}")
                
                # Display sample data in a readable format
                if sample_data:
                    print("\n  Sample data:")
                    for i, row in enumerate(sample_data[:3]):
                        print(f"    Row {i+1}: {row}")


Test 2: Schema retrieval with sample data
Generating schema description for database 'debit_card_specializing'...
Generating schema description for database 'debit_card_specializing'...
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
Retrieved schema for 2 tables with sample data

  Table: customers
  Columns:

## 4. Test SQL Query Execution

Test the `execute_sql_and_return_output` function with various queries.

In [5]:
# Test 3: Execute basic SQL query
if test_db_id:
    print("\nTest 3: Execute basic SQL query")
    
    # Get a table name from the schema
    all_tables_schema = read_database_schema_and_records(test_db_id)
    
    # Check if there's an error
    if isinstance(all_tables_schema, dict) and "error" in all_tables_schema:
        print(f"Error retrieving schema: {all_tables_schema['error']}")
    else:
        table_names = list(all_tables_schema.keys())
        
        if table_names:
            test_table = table_names[0]
            
            # Execute a simple SELECT query
            query = f"SELECT * FROM {test_table} LIMIT 5"
            print(f"Executing query: {query}")
            
            result = execute_sql_and_return_output(query, test_db_id)
            
            # Print query result information
            print(f"\nQuery execution successful: {result.get('success', False)}")
            
            if result.get('success', False):
                print(f"Result rows: {result.get('row_count', 0)}")
                print(f"Columns: {result.get('column_names', [])}")
                
                # Print result rows
                rows = result.get('result', [])
                if rows:
                    print("\nSample results:")
                    for i, row in enumerate(rows[:3]):
                        print(f"  Row {i+1}: {row}")


Test 3: Execute basic SQL query
Generating schema description for database 'debit_card_specializing'...
Executing query: SELECT * FROM customers LIMIT 5

🤖 Executing SQL: SELECT * FROM customers LIMIT 5
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
 Using database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite
 Query Result: [{'CustomerID': 3, 'Segment': 'SME', 'Currency': 'EUR'}, {'CustomerID': 5, 'Segment': 'LAM', 'Currency': 'EUR'}, {'CustomerID': 6, 'Segment': 'SME', 'Currency': 'EUR'}, {'CustomerID': 7, 'Segment': 'LAM', 'Currency': 'EUR'}, {'CustomerID': 9, 'Segment': 'SME', 'Currency': '

In [6]:
# Test 4: Execute SQL query with filtering
if test_db_id:
    print("\nTest 4: Execute SQL query with filtering")
    
    # Get tables from schema (do this again to avoid dependency on previous cells)
    all_tables_schema = read_database_schema_and_records(test_db_id)
    
    # Check if there's an error
    if isinstance(all_tables_schema, dict) and "error" in all_tables_schema:
        print(f"Error retrieving schema: {all_tables_schema['error']}")
    else:
        table_names = list(all_tables_schema.keys())
        
        if table_names:
            # First, get column information for the test table
            test_table = table_names[0]
            table_info = all_tables_schema.get(test_table, {})
            columns = list(table_info.get("columns", {}).keys())
            
            if columns:
                # Choose the first column for filtering
                filter_column = columns[0]
                
                # Execute a query to get possible values for the filter
                value_query = f"SELECT DISTINCT {filter_column} FROM {test_table} LIMIT 1"
                value_result = execute_sql_and_return_output(value_query, test_db_id)
                
                if value_result.get('success', False) and value_result.get('result', []):
                    # Get a sample value for filtering
                    filter_value = list(value_result['result'][0].values())[0]
                    
                    # Create a filter query
                    if isinstance(filter_value, str):
                        filter_query = f"SELECT * FROM {test_table} WHERE {filter_column} = '{filter_value}' LIMIT 5"
                    else:
                        filter_query = f"SELECT * FROM {test_table} WHERE {filter_column} = {filter_value} LIMIT 5"
                        
                    print(f"Executing filter query: {filter_query}")
                    
                    # Execute the filter query
                    filter_result = execute_sql_and_return_output(filter_query, test_db_id)
                    
                    # Print filter result information
                    print(f"\nQuery execution successful: {filter_result.get('success', False)}")
                    
                    if filter_result.get('success', False):
                        print(f"Result rows: {filter_result.get('row_count', 0)}")
                        
                        # Print result rows
                        rows = filter_result.get('result', [])
                        if rows:
                            print("\nFiltered results:")
                            for i, row in enumerate(rows[:3]):
                                print(f"  Row {i+1}: {row}")


Test 4: Execute SQL query with filtering
Generating schema description for database 'debit_card_specializing'...

🤖 Executing SQL: SELECT DISTINCT CustomerID FROM customers LIMIT 1
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
 Using database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite
 Query Result: [{'CustomerID': 3}]
 Total Rows: 1
Executing filter query: SELECT * FROM customers WHERE CustomerID = 3 LIMIT 5

🤖 Executing SQL: SELECT * FROM customers WHERE CustomerID = 3 LIMIT 5
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql

In [7]:
# Test 5: Execute SQL query with joins
if test_db_id:
    print("\nTest 5: Execute SQL query with joins")
    
    # Get tables from schema (do this again to avoid dependency on previous cells)
    all_tables_schema = read_database_schema_and_records(test_db_id)
    
    # Check if there's an error
    if isinstance(all_tables_schema, dict) and "error" in all_tables_schema:
        print(f"Error retrieving schema: {all_tables_schema['error']}")
    else:
        table_names = list(all_tables_schema.keys())
        
        if len(table_names) >= 2:
            # Get foreign key information
            foreign_keys = []
            for table_name, table_info in all_tables_schema.items():
                for fk in table_info.get("foreign_keys", []):
                    foreign_keys.append((table_name, fk[0], fk[1], fk[2]))
            
            # Try to find tables with a foreign key relationship
            if foreign_keys:
                # Use the first foreign key for the join
                from_table, from_col, to_table, to_col = foreign_keys[0]
                
                # Create a join query
                join_query = f"""SELECT 
                    t1.*, t2.* 
                FROM 
                    {from_table} t1 
                JOIN 
                    {to_table} t2 
                ON 
                    t1.{from_col} = t2.{to_col} 
                LIMIT 5"""
                    
                print(f"Executing join query:\n{join_query}")
                
                # Execute the join query
                join_result = execute_sql_and_return_output(join_query, test_db_id)
                
                # Print join result information
                print(f"\nQuery execution successful: {join_result.get('success', False)}")
                
                if join_result.get('success', False):
                    print(f"Result rows: {join_result.get('row_count', 0)}")
                    print(f"Columns: {join_result.get('column_names', [])}")
                    
                    # Print result rows
                    rows = join_result.get('result', [])
                    if rows:
                        print("\nJoin results (first 2 rows):")
                        for i, row in enumerate(rows[:2]):
                            print(f"  Row {i+1}:")
                            # For join results, print each column on a new line for readability
                            for col, val in row.items():
                                print(f"    {col}: {val}")
            else:
                print("No foreign key relationships found for join testing")
        else:
            print("Not enough tables for join testing")


Test 5: Execute SQL query with joins
Generating schema description for database 'debit_card_specializing'...
Executing join query:
SELECT 
                    t1.*, t2.* 
                FROM 
                    yearmonth t1 
                JOIN 
                    customers t2 
                ON 
                    t1.CustomerID = t2.CustomerID 
                LIMIT 5

🤖 Executing SQL: SELECT 
                    t1.*, t2.* 
                FROM 
                    yearmonth t1 
                JOIN 
                    customers t2 
                ON 
                    t1.CustomerID = t2.CustomerID 
                LIMIT 5
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, e

## 5. Test Error Handling

Test how the functions handle errors and edge cases.

In [8]:
# Test 6: Invalid database ID
print("\nTest 6: Invalid database ID")

# Try to get schema for a non-existent database
invalid_db_id = "non_existent_database"
invalid_db_result = read_database_schema_and_records(invalid_db_id)

# Check for error
if "error" in invalid_db_result:
    print(f"✅ Correctly handled invalid database ID with error: {invalid_db_result['error']}")
else:
    print(f"❌ Failed to handle invalid database ID properly, got: {invalid_db_result}")


Test 6: Invalid database ID
Database ID 'non_existent_database' not found in schema_manager.db2dbjsons
Available database IDs: ['debit_card_specializing', 'financial', 'formula_1', 'california_schools', 'card_games']...
✅ Correctly handled invalid database ID with error: Database ID 'non_existent_database' not found. Available databases: ['debit_card_specializing', 'financial', 'formula_1', 'california_schools', 'card_games']...


In [9]:
# Test 7: Invalid SQL query
if test_db_id:
    print("\nTest 7: Invalid SQL query")
    
    # Execute a query with syntax error
    invalid_query = "SELECT * FORM invalid_table"
    print(f"Executing invalid query: {invalid_query}")
    
    invalid_query_result = execute_sql_and_return_output(invalid_query, test_db_id)
    
    # Check for error
    if not invalid_query_result.get("success", False) and "error" in invalid_query_result:
        print(f"✅ Correctly handled invalid SQL query with error: {invalid_query_result['error']}")
    else:
        print(f"❌ Failed to handle invalid SQL query properly, got: {invalid_query_result}")


Test 7: Invalid SQL query
Executing invalid query: SELECT * FORM invalid_table

🤖 Executing SQL: SELECT * FORM invalid_table
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
 Using database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite
 Query Execution Error: near "FORM": syntax error
✅ Correctly handled invalid SQL query with error: near "FORM": syntax error


In [10]:
# Test 8: Non-existent table
if test_db_id:
    print("\nTest 8: Non-existent table")
    
    # Execute a query with a non-existent table
    nonexistent_table_query = "SELECT * FROM non_existent_table LIMIT 5"
    print(f"Executing query with non-existent table: {nonexistent_table_query}")
    
    nonexistent_result = execute_sql_and_return_output(nonexistent_table_query, test_db_id)
    
    # Check for error
    if not nonexistent_result.get("success", False) and "error" in nonexistent_result:
        print(f"✅ Correctly handled non-existent table with error: {nonexistent_result['error']}")
    else:
        print(f"❌ Failed to handle non-existent table properly, got: {nonexistent_result}")


Test 8: Non-existent table
Executing query with non-existent table: SELECT * FROM non_existent_table LIMIT 5

🤖 Executing SQL: SELECT * FROM non_existent_table LIMIT 5
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite, exists: True
 Using database path: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases/debit_card_specializing/debit_card_specializing.sqlite
 Query Execution Error: no such table: non_existent_table
✅ Correctly handled non-existent table with error: no such table: non_existent_table


## 6. Spider Dataset Testing (if available)

Test the functions with the Spider dataset if available.

In [11]:
# Test Spider dataset if available
if spider_schema_manager is not None and spider_sql_executor is not None:
    print("\nTesting with Spider dataset")
    
    # Get Spider database IDs (from schema manager's db2dbjsons)
    spider_db_ids = list(spider_schema_manager.db2dbjsons.keys()) if spider_schema_manager else []
    print(f"Number of Spider databases: {len(spider_db_ids)}")
    
    if spider_db_ids:
        # Select a test database
        spider_test_db_id = spider_db_ids[0]
        print(f"Testing with Spider database: {spider_test_db_id}")
        
        # Get schema for the test database
        spider_schema = read_database_schema_and_records(
            spider_test_db_id,
            dataset_name="spider"
        )
        
        # Check if there's an error
        if isinstance(spider_schema, dict) and "error" in spider_schema:
            print(f"Error retrieving Spider schema: {spider_schema['error']}")
        else:
            # Print schema overview
            print(f"Retrieved schema with {len(spider_schema)} tables")
            spider_table_names = list(spider_schema.keys())
            
            if spider_table_names:
                # Execute a simple query
                spider_test_table = spider_table_names[0]
                spider_query = f"SELECT * FROM {spider_test_table} LIMIT 5"
                
                print(f"\nExecuting Spider query: {spider_query}")
                spider_result = execute_sql_and_return_output(
                    spider_query,
                    spider_test_db_id,
                    dataset_name="spider"
                )
                
                # Print query result
                print(f"Query execution successful: {spider_result.get('success', False)}")
                
                if spider_result.get('success', False):
                    print(f"Result rows: {spider_result.get('row_count', 0)}")
                    print(f"Columns: {spider_result.get('column_names', [])}")
                    
                    # Print sample results
                    rows = spider_result.get('result', [])
                    if rows:
                        print("\nSample results:")
                        for i, row in enumerate(rows[:3]):
                            print(f"  Row {i+1}: {row}")
else:
    print("\nSpider dataset is not available for testing")


Testing with Spider dataset
Number of Spider databases: 166
Testing with Spider database: perpetrator
Generating schema description for database 'perpetrator'...
Retrieved schema with 2 tables

Executing Spider query: SELECT * FROM perpetrator LIMIT 5

🤖 Executing SQL: SELECT * FROM perpetrator LIMIT 5
Current directory: /home/norman/work/text-to-sql/MAC-SQL/dispatcher
BIRD_DATA_PATH: /home/norman/work/text-to-sql/MAC-SQL/data/bird
BIRD_DB_DIRECTORY: /home/norman/work/text-to-sql/MAC-SQL/data/bird/dev_databases
Checking database path: /home/norman/work/text-to-sql/MAC-SQL/data/spider/database/perpetrator/perpetrator.sqlite, exists: True
 Using database path: /home/norman/work/text-to-sql/MAC-SQL/data/spider/database/perpetrator/perpetrator.sqlite
 Query Result: [{'Perpetrator_ID': 1, 'People_ID': 1, 'Date': '04.26 April 26/27', 'Year': 1982.0, 'Location': 'Uiryeong', 'Country': 'South Korea', 'Killed': 56, 'Injured': 37}, {'Perpetrator_ID': 2, 'People_ID': 3, 'Date': '11.18 Nov. 18', 