# Simple Parish Extraction Demo

This notebook demonstrates the complete parish extraction pipeline in just a few cells.

**Prerequisites**: 
1. Run `00_Colab_Setup.ipynb` first
2. Run `01_Build_Dioceses_Database.ipynb` to populate dioceses

**What this does**:
- Finds parish directory pages for dioceses
- Extracts parish data using AI-powered analysis
- Saves results to your Supabase database

In [None]:
# Cell 1: Setup and Configuration
from config.settings import get_config, set_config
from src.pipeline import ParishExtractionPipeline

# Load the configuration that the setup notebook is expected to have created.
# A RuntimeError raised while loading or reporting it is treated as "setup was
# not run": a hint is printed and the error is re-raised so the cell fails.
try:
    config = get_config()
    print("✅ Configuration loaded")
    database_status = "Connected" if config.supabase else "Not connected"
    print(f"   Database: {database_status}")
    ai_status = "Enabled" if config.genai_enabled else "Mock mode"
    print(f"   AI: {ai_status}")
    print(f"   Max dioceses: {config.max_dioceses}")
except RuntimeError:
    print("❌ Configuration not found. Please run 00_Colab_Setup.ipynb first")
    raise

# Cap the number of dioceses for the demo run (edit this value to process more).
DEMO_MAX_DIOCESES = 3
config.max_dioceses = DEMO_MAX_DIOCESES
print(f"\n🎯 Demo will process {config.max_dioceses} dioceses")

In [None]:
# Cell 2: Run the Complete Pipeline
# Requires `config` and `ParishExtractionPipeline` from Cell 1, so run that cell first.
print("🚀 Starting parish extraction pipeline...\n")

# Create and run the pipeline
# `pipeline` is reused in Cell 4 to write the results file and `results` is read
# by Cells 3-5, so both names are kept at notebook scope.
pipeline = ParishExtractionPipeline(config)
results = pipeline.run_full_extraction()

# Printed unconditionally once run_full_extraction() returns; `results` is not inspected here.
print("\n✅ Pipeline completed!")

In [None]:
# Cell 3: Analyze Results
# Reports on the `results` produced by Cell 2: overall totals, a tally of
# site types, one status line per diocese, and a short sample of parishes.
if not results:
    print("❌ No results to analyze")
else:
    print("📊 DETAILED ANALYSIS\n")

    # Overall totals across every diocese result.
    total_parishes = sum(len(outcome.parishes) for outcome in results)
    successful_results = [outcome for outcome in results if outcome.success]
    successful_extractions = len(successful_results)
    total_saved = sum(outcome.saved_count for outcome in results)

    summary_lines = [
        "Summary Statistics:",
        f"  📋 Dioceses processed: {len(results)}",
        f"  ✅ Successful extractions: {successful_extractions}",
        f"  🏛️ Total parishes found: {total_parishes}",
        f"  💾 Total parishes saved: {total_saved}",
    ]
    if successful_extractions > 0:
        # NOTE(review): the numerator counts parishes from all results while the
        # denominator counts only successful ones - confirm this is intended.
        avg_parishes = total_parishes / successful_extractions
        summary_lines.append(f"  📈 Average parishes per diocese: {avg_parishes:.1f}")
    print("\n".join(summary_lines))

    # Tally how many successful results reported each site type value.
    site_types = {}
    for outcome in successful_results:
        kind = outcome.site_type.value
        site_types[kind] = 1 + site_types.get(kind, 0)

    if site_types:
        print("\n🔍 Website Types Detected:")
        for kind, occurrences in site_types.items():
            label = kind.replace("_", " ").title()
            print(f"  {label}: {occurrences} dioceses")

    # One status line per diocese, followed by any errors it recorded.
    print("\n📋 Individual Diocese Results:")
    for outcome in results:
        if outcome.success:
            status, parishes_info = "✅", f"{len(outcome.parishes)} parishes"
        else:
            status, parishes_info = "❌", "Failed"

        saved_info = ""
        if outcome.saved_count > 0:
            saved_info = f" ({outcome.saved_count} saved)"
        print(f"  {status} {outcome.diocese_name}: {parishes_info}{saved_info}")

        # A falsy `errors` value (empty or None) simply prints nothing.
        for problem in outcome.errors or ():
            print(f"      Error: {problem}")

    # Flatten every result's parishes into one list and show the first few.
    all_parishes = [parish for outcome in results for parish in outcome.parishes]

    if all_parishes:
        sample_size = 5
        print("\n🏛️ Sample Parishes Extracted:")
        for position, parish in enumerate(all_parishes[:sample_size], 1):
            print(f"  {position}. {parish.name}")
            contact_details = (
                ("📍", parish.city),
                ("📞", parish.phone),
                ("🌐", parish.website),
            )
            # Only fields with a truthy value are shown.
            for icon, value in contact_details:
                if value:
                    print(f"     {icon} {value}")

        remaining = len(all_parishes) - sample_size
        if remaining > 0:
            print(f"     ... and {remaining} more parishes")

In [None]:
# Cell 4: Save Detailed Results (Optional)
# Writes `results` to a JSON file through the pipeline object from Cell 2 and,
# when the google.colab package is importable, offers the file as a download.
if not results:
    print("❌ No results to save")
else:
    results_path = 'demo_extraction_results.json'
    print("💾 Saving detailed results...")

    try:
        pipeline.save_results_to_file(results, results_path)

        # google.colab is only importable inside Colab; an ImportError anywhere
        # in this inner block is reported as a plain local save.
        try:
            from google.colab import files
            files.download(results_path)
            print("⬇️ Results file downloaded")
        except ImportError:
            print("📁 Results saved locally")
    except Exception as save_error:
        # Any other failure from saving or downloading is reported, not raised.
        print(f"❌ Error saving results: {save_error}")

In [None]:
# Cell 5: Next Steps and Tips
# Prints closing guidance. The wording depends on whether `config.supabase` is
# truthy and on whether any entry in `results` reports success.
print("🎉 Demo Complete!\n")

if not config.supabase:
    print("⚠️ Database not configured - data only exists in this session")
else:
    database_pointers = (
        "✅ Check your Supabase database for the extracted parish data:",
        "   - Table: Parishes (main parish data)",
        "   - Table: DiocesesParishDirectory (directory URLs found)",
    )
    print(*database_pointers, sep="\n")

customization_tips = (
    "\n🔧 To customize the extraction:",
    "   - Increase config.max_dioceses to process more dioceses",
    "   - Modify config.ai_confidence_threshold to change AI sensitivity",
    "   - Check individual notebook files for specific extraction tasks",
)
print(*customization_tips, sep="\n")

notebook_index = (
    "\n📚 Available notebooks:",
    "   - 01_Build_Dioceses_Database.ipynb: Extract diocese info from USCCB",
    "   - 02_Find_Parish_Directories.ipynb: Find parish directory URLs",
    "   - 03_Extract_Parish_Data.ipynb: Extract detailed parish information",
    "   - 99_Simple_Demo.ipynb: This complete pipeline demo",
)
print(*notebook_index, sep="\n")

# The truthiness check on `results` comes first so an empty or missing result
# set never reaches the iteration.
extraction_succeeded = bool(results) and any(outcome.success for outcome in results)
if extraction_succeeded:
    closing_lines = (
        "\n🎯 Success! The system successfully found and extracted parish data.",
        "You can now scale this up to process more dioceses.",
    )
else:
    closing_lines = (
        "\n🔍 If you didn't get good results, try:",
        "   - Checking your API keys and database connection",
        "   - Running the individual notebooks to debug specific steps",
        "   - Increasing the number of dioceses processed",
    )
print(*closing_lines, sep="\n")