# SchemaReader & DatabaseSchemaManager Demo\n\nThis notebook demonstrates how to use **SchemaReader** and **DatabaseSchemaManager** to load database schema information and visualize what gets stored in memory.\n\n## 🎯 What You'll Learn:\n- How to load real database schemas from the BIRD dataset\n- What schema information gets stored in the `databaseSchema` memory key\n- How to query and explore loaded schema data\n- Memory structure and content organization\n- Schema manager API usage patterns\n\n## 📊 Memory Location:\n- **Key:** `databaseSchema`\n- **Manager:** `DatabaseSchemaManager`\n- **Content:** Tables, columns, foreign keys, sample data, metadata

In [None]:
# Setup imports and path\nimport sys\nimport json\nimport asyncio\nimport os\nfrom pathlib import Path\nfrom pprint import pprint\n\n# Add src to path\nsys.path.append(str(Path('..') / 'src'))\n\nfrom keyvalue_memory import KeyValueMemory\nfrom database_schema_manager import DatabaseSchemaManager\nfrom schema_reader import SchemaReader\nfrom memory_content_types import TableSchema, ColumnInfo\n\nprint(\"✅ All imports successful\")\nprint(f\"📁 Working directory: {os.getcwd()}\")

In [None]:
# Helper functions for memory visualization
def display_schema_memory(schema_data, title="Schema Memory State"):
    """
    Display schema memory content in a detailed, formatted way.

    Prints a banner with the upper-cased title, then the top-level metadata,
    and for every table its columns (with primary/foreign key markers and
    nullability), up to three sample rows, and the table metadata.

    Args:
        schema_data: Mapping with optional 'metadata' and 'tables' entries, as
            read from the "databaseSchema" memory key. A falsy value prints an
            "empty" notice and returns.
        title: Heading shown in the banner.

    Entries that are present but set to None ('metadata', 'tables',
    'columns', 'sampleData', 'dataType') are treated like missing entries
    instead of raising.
    """
    print(f"\n{'='*60}")
    print(f"{title.upper()}")
    print(f"{'='*60}")

    if not schema_data:
        print("📝 Schema memory is empty")
        return

    # Display metadata
    metadata = schema_data.get('metadata') or {}
    if metadata:
        print("\n🏷️ METADATA:")
        for key, value in metadata.items():
            print(f"   {key}: {value}")

    # Display tables
    tables = schema_data.get('tables') or {}
    print(f"\n📊 TABLES ({len(tables)} total):")

    for table_name, table_data in tables.items():
        print(f"\n📁 Table: {table_name}")

        # Columns
        columns = table_data.get('columns') or {}
        print(f"   📋 Columns ({len(columns)}):")
        for col_name, col_info in columns.items():
            # `.get(key, default)` still returns None for an explicit null, and
            # None rejects the '<15' format spec, so normalise to a string here.
            data_type = str(col_info.get('dataType') or 'UNKNOWN')
            nullable = '?' if col_info.get('nullable', True) else '!'
            pk = '🔑' if col_info.get('isPrimaryKey', False) else '  '
            fk = '🔗' if col_info.get('isForeignKey', False) else '  '
            print(f"      {pk}{fk} {col_name:<20} {data_type:<15} {nullable}")

            # Show foreign key reference
            if col_info.get('isForeignKey') and col_info.get('references'):
                ref = col_info['references']
                print(f"         └── References: {ref.get('table')}.{ref.get('column')}")

        # Sample data: at most three rows, each trimmed to its first three fields
        sample_data = table_data.get('sampleData') or []
        if sample_data:
            print(f"   📄 Sample Data ({len(sample_data)} rows):")
            for i, row in enumerate(sample_data[:3]):
                row_display = dict(list(row.items())[:3]) if len(row) > 3 else row
                print(f"      Row {i+1}: {row_display}{'...' if len(row) > 3 else ''}")
            if len(sample_data) > 3:
                print(f"      ... and {len(sample_data)-3} more rows")

        # Metadata
        table_metadata = table_data.get('metadata') or {}
        if table_metadata:
            print(f"   ℹ️ Metadata: {table_metadata}")

    print(f"\n{'='*60}")


async def show_schema_state(memory, title="Current Schema State"):
    """
    Get and display schema memory state.

    Args:
        memory: Object exposing an awaitable ``get(key)``; the value stored
            under "databaseSchema" is read from it.
        title: Heading forwarded to ``display_schema_memory``.

    Returns:
        Whatever ``memory.get("databaseSchema")`` returned.
    """
    schema_data = await memory.get("databaseSchema")
    display_schema_memory(schema_data, title)
    return schema_data


def show_schema_summary(schema_data):
    """
    Show a quick summary of schema data.

    Prints the number of tables, the total number of columns, and how many
    columns are flagged 'isPrimaryKey' / 'isForeignKey'. A falsy
    ``schema_data`` prints a notice and returns.
    """
    if not schema_data:
        print("📝 No schema data available")
        return

    tables = schema_data.get('tables') or {}

    # Single pass over every column instead of one traversal per statistic.
    total_columns = 0
    total_pks = 0
    total_fks = 0
    for table in tables.values():
        for column in (table.get('columns') or {}).values():
            total_columns += 1
            if column.get('isPrimaryKey'):
                total_pks += 1
            if column.get('isForeignKey'):
                total_fks += 1

    print("\n📊 SCHEMA SUMMARY:")
    print(f"   📁 Tables: {len(tables)}")
    print(f"   📋 Total Columns: {total_columns}")
    print(f"   🔑 Primary Keys: {total_pks}")
    print(f"   🔗 Foreign Keys: {total_fks}")


print("✅ Helper functions defined")

## 1. Initialize Memory and Managers\n\nSet up the shared memory and initialize the DatabaseSchemaManager.

In [None]:
# Initialize shared memory and schema manager\nmemory = KeyValueMemory()\nschema_manager = DatabaseSchemaManager(memory)\n\nprint(\"🏗️ Initializing DatabaseSchemaManager...\")\nawait schema_manager.initialize()\n\n# Show initial empty state\ninitial_data = await show_schema_state(memory, \"Initial Empty Schema Memory\")\n\nprint(\"\\n✅ DatabaseSchemaManager initialized and ready\")

## 2. Manual Schema Creation (Simple Example)\n\nFirst, let's manually create a simple schema to understand the data structure.

In [None]:
print(\"📝 Creating a simple manual schema first...\")\n\n# Create a simple customers table\ncustomers_table = TableSchema(\n    name=\"customers\",\n    columns={\n        \"id\": ColumnInfo(\n            dataType=\"INTEGER\",\n            nullable=False,\n            isPrimaryKey=True,\n            isForeignKey=False,\n            typicalValues=[1, 2, 3, 100, 999]\n        ),\n        \"name\": ColumnInfo(\n            dataType=\"VARCHAR(100)\",\n            nullable=False,\n            isPrimaryKey=False,\n            isForeignKey=False,\n            typicalValues=[\"John Doe\", \"Jane Smith\", \"Bob Johnson\"]\n        ),\n        \"email\": ColumnInfo(\n            dataType=\"VARCHAR(100)\",\n            nullable=True,\n            isPrimaryKey=False,\n            isForeignKey=False,\n            typicalValues=[\"john@example.com\", \"jane@company.com\", None]\n        )\n    },\n    sampleData=[\n        {\"id\": 1, \"name\": \"John Doe\", \"email\": \"john@example.com\"},\n        {\"id\": 2, \"name\": \"Jane Smith\", \"email\": \"jane@company.com\"},\n        {\"id\": 3, \"name\": \"Bob Johnson\", \"email\": None}\n    ],\n    metadata={\"rowCount\": 1000, \"lastUpdated\": \"2024-01-01\", \"tablespace\": \"main\"}\n)\n\nawait schema_manager.add_table(customers_table)\nprint(\"✅ Added customers table\")\n\nawait show_schema_state(memory, \"After Adding Customers Table\")

In [None]:
print(\"📝 Adding orders table with foreign key relationship...\")\n\n# Create orders table with foreign key\norders_table = TableSchema(\n    name=\"orders\",\n    columns={\n        \"id\": ColumnInfo(\n            dataType=\"INTEGER\",\n            nullable=False,\n            isPrimaryKey=True,\n            isForeignKey=False,\n            typicalValues=[1001, 1002, 1003]\n        ),\n        \"customer_id\": ColumnInfo(\n            dataType=\"INTEGER\",\n            nullable=False,\n            isPrimaryKey=False,\n            isForeignKey=True,\n            references={\"table\": \"customers\", \"column\": \"id\"},\n            typicalValues=[1, 2, 3]\n        ),\n        \"order_date\": ColumnInfo(\n            dataType=\"DATE\",\n            nullable=False,\n            isPrimaryKey=False,\n            isForeignKey=False,\n            typicalValues=[\"2024-01-01\", \"2024-01-02\", \"2024-01-03\"]\n        ),\n        \"total\": ColumnInfo(\n            dataType=\"DECIMAL(10,2)\",\n            nullable=False,\n            isPrimaryKey=False,\n            isForeignKey=False,\n            typicalValues=[99.99, 149.50, 75.25]\n        )\n    },\n    sampleData=[\n        {\"id\": 1001, \"customer_id\": 1, \"order_date\": \"2024-01-01\", \"total\": 99.99},\n        {\"id\": 1002, \"customer_id\": 2, \"order_date\": \"2024-01-02\", \"total\": 149.50},\n        {\"id\": 1003, \"customer_id\": 1, \"order_date\": \"2024-01-03\", \"total\": 75.25}\n    ],\n    metadata={\"rowCount\": 5000, \"lastUpdated\": \"2024-01-03\"}\n)\n\nawait schema_manager.add_table(orders_table)\nprint(\"✅ Added orders table with foreign key\")\n\nawait show_schema_state(memory, \"After Adding Orders Table with Foreign Key\")

## 3. Query Schema Information\n\nDemonstrate how to query the loaded schema using DatabaseSchemaManager methods.

In [None]:
print(\"🔍 Querying Schema Information\")\nprint(\"=\" * 40)\n\n# Get basic schema information\ntable_names = await schema_manager.get_table_names()\nprint(f\"\\n📁 Table Names: {table_names}\")\n\n# Get columns for a specific table\ncustomer_columns = await schema_manager.get_columns(\"customers\")\nprint(f\"\\n📋 Customers Table Columns: {list(customer_columns.keys())}\")\n\n# Get primary keys\ncustomers_pks = await schema_manager.get_primary_keys(\"customers\")\norders_pks = await schema_manager.get_primary_keys(\"orders\")\nprint(f\"\\n🔑 Primary Keys:\")\nprint(f\"   customers: {customers_pks}\")\nprint(f\"   orders: {orders_pks}\")\n\n# Get foreign keys\norders_fks = await schema_manager.get_foreign_keys(\"orders\")\nprint(f\"\\n🔗 Foreign Keys in orders table: {orders_fks}\")\n\n# Search columns by type\ninteger_columns = await schema_manager.search_columns_by_type(\"INTEGER\")\nprint(f\"\\n🔢 INTEGER columns: {integer_columns}\")\n\n# Find relationships\nrelationships = await schema_manager.find_relationships(\"orders\", \"customers\")\nprint(f\"\\n🔗 Relationships between orders and customers: {relationships}\")\n\n# Get schema summary\nschema_summary = await schema_manager.get_schema_summary()\nprint(f\"\\n📊 Schema Summary: {schema_summary}\")

## 4. Load Real Schema from BIRD Dataset\n\nNow let's load a real database schema from the BIRD dataset using SchemaReader.

In [None]:
# Check if BIRD dataset is available.
# The dataset location can be overridden with the BIRD_DATA_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original default location is used.
data_path = os.environ.get(
    "BIRD_DATA_PATH", "/home/norman/work/text-to-sql/MAC-SQL/data/bird"
)
tables_json_path = str(Path(data_path) / "dev_tables.json")

# Reset on every run so later cells never see a reader left over from an
# earlier execution (or an undefined name) when the dataset is unavailable.
schema_reader = None

print("🔍 Checking for BIRD dataset...")
print(f"📁 Data path: {data_path}")
print(f"📄 Tables JSON: {tables_json_path}")

if os.path.exists(data_path) and os.path.exists(tables_json_path):
    print("✅ BIRD dataset found!")
    bird_available = True
else:
    print("❌ BIRD dataset not found")
    print("📝 Will continue with manual schema examples")
    bird_available = False

# List available databases if BIRD is available
if bird_available:
    try:
        schema_reader = SchemaReader(
            data_path=data_path,
            tables_json_path=tables_json_path,
            dataset_name="bird",
            lazy=True
        )

        # Get list of available databases from db2dbjsons
        databases = list(schema_reader.db2dbjsons.keys())
        print(f"\n📊 Available databases ({len(databases)}):")
        for i, db in enumerate(databases[:10]):
            print(f"   {i+1:2d}. {db}")
        if len(databases) > 10:
            print(f"   ... and {len(databases)-10} more")

    except Exception as e:
        # Deliberate best-effort: any failure while reading the dataset makes
        # the notebook fall back to the manual schema instead of stopping.
        print(f"❌ Error reading BIRD dataset: {e}")
        bird_available = False
        schema_reader = None

In [None]:
if bird_available:\n    print(\"🏫 Loading california_schools database schema...\")\n    \n    # Clear previous manual schema\n    await memory.clear()\n    schema_manager = DatabaseSchemaManager(memory)\n    await schema_manager.initialize()\n    \n    # Load california_schools database\n    db_id = \"california_schools\"\n    await schema_manager.load_from_schema_reader(schema_reader, db_id)\n    \n    print(f\"✅ Loaded {db_id} schema\")\n    \n    # Show the loaded schema\n    schema_data = await show_schema_state(memory, f\"BIRD Dataset - {db_id} Schema\")\n    show_schema_summary(schema_data)\n    \nelse:\n    print(\"📝 Continuing with manual schema (BIRD dataset not available)\")\n    schema_data = await memory.get(\"databaseSchema\")\n    show_schema_summary(schema_data)

## 5. Cleanup and Summary\n\nClean up the shared memory and summarize what we learned.

In [None]:
print(\"📋 Demo Summary and Cleanup\")\nprint(\"=\" * 35)\n\n# Final statistics\nfinal_stats = await schema_manager.get_schema_summary()\ntable_names = await schema_manager.get_table_names()\n\nprint(f\"\\n📊 Final Schema Statistics:\")\nprint(f\"   📁 Tables loaded: {len(table_names)}\")\nprint(f\"   📋 Total columns: {final_stats.get('total_columns', 0)}\")\nprint(f\"   🔑 Primary keys: {final_stats.get('total_primary_keys', 0)}\")\nprint(f\"   🔗 Foreign keys: {final_stats.get('total_foreign_keys', 0)}\")\n\n# Show what's in memory one final time\nprint(f\"\\n💾 Final Memory State:\")\nschema_data = await memory.get(\"databaseSchema\")\nif schema_data:\n    print(f\"   ✅ databaseSchema key contains: {type(schema_data).__name__}\")\n    print(f\"   📊 Memory size: {len(str(schema_data)):,} characters\")\n    tables = schema_data.get('tables', {})\n    print(f\"   📁 Tables in memory: {len(tables)}\")\nelse:\n    print(f\"   ❌ No schema data in memory\")\n\n# Cleanup\nprint(f\"\\n🧹 Cleaning up...\")\nawait memory.clear()\nprint(f\"✅ Memory cleared\")\n\n# Verify cleanup\ncleaned_data = await memory.get(\"databaseSchema\")\nprint(f\"📝 After cleanup: {cleaned_data}\")\n\nprint(f\"\\n🎉 Demo Summary:\")\nprint(f\"   ✅ Learned how to use SchemaReader to load real database schemas\")\nprint(f\"   ✅ Understood DatabaseSchemaManager API and capabilities\")\nprint(f\"   ✅ Explored memory structure at 'databaseSchema' key\")\nprint(f\"   ✅ Analyzed schema data including tables, columns, relationships\")\nprint(f\"   ✅ Demonstrated querying and search capabilities\")\n\nprint(f\"\\n🎯 Key Takeaways:\")\nprint(f\"   🔑 Schema data is stored at 'databaseSchema' memory key\")\nprint(f\"   🏗️ DatabaseSchemaManager provides rich querying capabilities\")\nprint(f\"   📊 Schema includes tables, columns, relationships, sample data\")\nprint(f\"   🔍 Supports complex searches by type, pattern, relationships\")\nprint(f\"   ⚡ Fast in-memory operations for real-time querying\")\nprint(f\"   🔗 
SchemaReader integrates seamlessly with real datasets\")\n\nprint(f\"\\n🎉 Database Schema Demo completed successfully!\")