# Extract Parish Data

This notebook extracts detailed parish information from discovered parish directory pages.

**What this does**:
- Sets up the complete environment (no separate setup notebook needed)
- Detects website patterns and selects optimal extraction strategies
- Extracts comprehensive parish data including addresses, contacts, and schedules
- Handles multiple website platforms (eCatholic, Squarespace, WordPress, etc.)
- Saves the extracted parish data to the Supabase database

**Prerequisites**: 
- Run `01_Build_Dioceses_Database.ipynb` to populate dioceses
- Run `02_Find_Parish_Directories.ipynb` to discover directory URLs

In [None]:
# Cell 1: Complete Environment Setup
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("🚀 Setting up USCCB Parish Extraction Environment...\n")

# Step 1: Clone repository if needed
repo_path = '/content/usccb-parish-extraction'
if not os.path.exists(repo_path):
    print("📁 Cloning repository...")
    !git clone https://github.com/tomknightatl/usccb-parish-extraction.git
    print("✅ Repository cloned")
else:
    print("✅ Repository already exists")
    os.chdir(repo_path)
    !git pull --quiet
    print("✅ Repository updated")

# Step 2: Set working directory and Python path
os.chdir(repo_path)
if repo_path not in sys.path:
    sys.path.insert(0, repo_path)
print(f"📂 Working directory: {os.getcwd()}")

# Step 3: Install required packages
print("\n📦 Installing packages...")
!pip install --quiet selenium==4.15.0 webdriver-manager==4.0.1
!pip install --quiet beautifulsoup4==4.12.2 lxml
!pip install --quiet google-generativeai==0.3.0 tenacity==8.2.3
!pip install --quiet "supabase>=2.15.0"
print("✅ Packages installed")

# Step 4: Import required modules
print("\n🧪 Testing imports...")
try:
    import time
    from datetime import datetime
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import json
    print("✅ External packages imported")
    
    from config.settings import setup_environment, set_config, get_config
    from src.models import Diocese, Parish, ExtractionResult, SiteType
    from src.utils.webdriver import setup_driver
    from src.extractors import get_extractor
    from src.utils.ai_analysis import detect_site_type
    print("✅ Project modules imported")
    
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("\n🔧 Try restarting runtime and running this cell again")
    raise

# Step 5: Configure APIs
print("\n🔑 Configuring APIs...")
from google.colab import userdata

try:
    supabase_url = userdata.get('SUPABASE_URL')
    supabase_key = userdata.get('SUPABASE_KEY')
    genai_key = userdata.get('GENAI_API_KEY_USCCB')
    
    config = setup_environment(
        supabase_url=supabase_url,
        supabase_key=supabase_key,
        genai_api_key=genai_key,
        max_dioceses=3  # Process 3 dioceses for testing
    )
    set_config(config)
    
    print("✅ Configuration complete")
    print(f"   📊 Database: {'Connected' if config.supabase else 'Not connected'}")
    print(f"   🤖 AI: {'Enabled' if config.genai_enabled else 'Mock mode'}")
    
except Exception as e:
    print(f"❌ Configuration error: {e}")
    print("\n🔧 Make sure to add your API keys to Colab Secrets:")
    print("   • SUPABASE_URL")
    print("   • SUPABASE_KEY")
    print("   • GENAI_API_KEY_USCCB")
    config = None

print("\n🎉 Environment setup complete!")

In [None]:
# Cell 2: Parish Extraction Functions

def get_dioceses_with_directories(limit=None):
    """Return the dioceses that have a parish directory URL recorded.

    Reads every row of the ``DiocesesParishDirectory`` table whose
    ``parish_directory_url`` is neither NULL nor the empty string, then looks
    up each diocese's name in the ``Dioceses`` table by matching
    ``Dioceses.Website`` against the row's ``diocese_url``.

    Args:
        limit: Optional maximum number of dioceses to return. When more rows
            are available than ``limit``, a random sample of that size is
            returned. ``None`` (or 0) returns every row.

    Returns:
        A list of dicts with the keys ``name``, ``url`` and
        ``parish_directory_url``. ``name`` is ``'Unknown Diocese'`` when no
        matching ``Dioceses`` row exists. An empty list is returned when there
        is no database connection, when a query fails, or when no rows match.
    """
    if not config or not config.supabase:
        print("❌ No database connection")
        return []

    try:
        # Get dioceses with parish directory URLs
        try:
            response = config.supabase.table('DiocesesParishDirectory').select(
                'diocese_url, parish_directory_url'
            ).not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()
        except Exception as e:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still propagate, and show the real
            # cause: a missing table is only one of several possible failures
            # (network, credentials, malformed filter, ...).
            print(f"❌ Could not read DiocesesParishDirectory table: {e}")
            print("\n🔧 Run 02_Find_Parish_Directories.ipynb first")
            return []

        diocese_directories = response.data or []
        if not diocese_directories:
            return []

        # Get diocese names
        diocese_urls = [item['diocese_url'] for item in diocese_directories]

        names_response = config.supabase.table('Dioceses').select(
            'Website, Name'
        ).in_('Website', diocese_urls).execute()

        url_to_name = {item['Website']: item['Name'] for item in (names_response.data or [])}

        # Combine data
        dioceses_to_process = [
            {
                'name': url_to_name.get(item['diocese_url'], 'Unknown Diocese'),
                'url': item['diocese_url'],
                'parish_directory_url': item['parish_directory_url'],
            }
            for item in diocese_directories
        ]

        if limit and len(dioceses_to_process) > limit:
            # Sample rather than slice so repeated test runs cover different dioceses.
            import random
            dioceses_to_process = random.sample(dioceses_to_process, limit)

        return dioceses_to_process

    except Exception as e:
        print(f"❌ Error fetching dioceses with directories: {e}")
        return []

def save_parishes_to_database(parishes, diocese_url, directory_url, extraction_method):
    """Insert extracted parishes into the ``Parishes`` table in batches.

    Args:
        parishes: Sequence of parish objects exposing the attributes copied
            into each row (name, address, city, ..., confidence).
        diocese_url: Stored in every row's ``diocese_url`` column.
        directory_url: Stored in every row's ``directory_url`` column.
        extraction_method: Stored in every row's ``extraction_method`` column.

    Returns:
        The number of rows whose batch insert completed without raising. When
        there is no database connection nothing is written and
        ``len(parishes)`` is returned.
    """
    if not (config and config.supabase):
        print("  📝 Would save parishes to database (no connection)")
        return len(parishes)

    def build_row(entry):
        # Map one parish object onto the column names used by the table.
        return {
            'name': entry.name,
            'address': entry.address,
            'city': entry.city,
            'state': entry.state,
            'zip_code': entry.zip_code,
            'phone': entry.phone,
            'email': entry.email,
            'website': entry.website,
            'pastor': entry.pastor,
            'mass_times': entry.mass_times,
            'latitude': entry.latitude,
            'longitude': entry.longitude,
            'diocese_url': diocese_url,
            'directory_url': directory_url,
            'extraction_method': extraction_method,
            'confidence_score': entry.confidence,
            'extracted_at': datetime.now().isoformat()
        }

    # Small batches keep each insert request short enough to avoid timeouts.
    chunk_len = 10
    n_saved = 0
    n_failed = 0

    for start in range(0, len(parishes), chunk_len):
        rows = []
        for entry in parishes[start:start + chunk_len]:
            try:
                rows.append(build_row(entry))
            except Exception as e:
                # A parish that cannot be converted is skipped, not fatal.
                print(f"    ⚠️ Error preparing parish data: {e}")
                n_failed += 1

        if not rows:
            continue

        try:
            config.supabase.table('Parishes').insert(rows).execute()
        except Exception as e:
            # The whole batch is counted as failed; later batches still run.
            print(f"    ❌ Error saving batch to database: {e}")
            n_failed += len(rows)
        else:
            n_saved += len(rows)

    if n_failed > 0:
        print(f"    ⚠️ {n_failed} parishes failed to save")

    return n_saved

def extract_parishes_from_directory(diocese_info, driver):
    """Load one diocese's parish directory page and extract its parishes.

    Args:
        diocese_info: Dict with the keys ``name``, ``url`` and
            ``parish_directory_url``.
        driver: WebDriver used to load the page; it is also handed to the
            extractor.

    Returns:
        An ``ExtractionResult``. On success it carries the extracted parishes
        and the detected site type, and ``saved_count`` is set when at least
        one parish was extracted. If any step raises, a failed result with
        ``SiteType.GENERIC``, no parishes and the (truncated) error message is
        returned instead.
    """
    name = diocese_info['name']
    home_url = diocese_info['url']
    listing_url = diocese_info['parish_directory_url']

    print(f"\n🏛️ Extracting parishes from: {name}")
    print(f"  📍 Diocese URL: {home_url}")
    print(f"  📂 Directory URL: {listing_url}")

    try:
        print("  📥 Loading directory page...")
        driver.get(listing_url)
        # Fixed pause so client-side JavaScript can finish rendering.
        time.sleep(3)

        page = BeautifulSoup(driver.page_source, 'html.parser')

        print("  🔍 Detecting website pattern...")
        detected = detect_site_type(page, listing_url)
        print(f"    📊 Detected type: {detected.value}")

        # The extractor is selected by the detected type's string value.
        extractor = get_extractor(detected.value)
        print(f"    🔧 Using extractor: {extractor.name}")

        print("  ⚙️ Extracting parish data...")
        found = extractor.extract(page, listing_url, driver)
        print(f"  ✅ Extracted {len(found)} parishes")

        outcome = ExtractionResult(
            diocese_name=name,
            diocese_url=home_url,
            directory_url=listing_url,
            parishes=found,
            site_type=detected,
            success=len(found) > 0
        )

        # Nothing to persist: hand back the (unsuccessful) result as is.
        if not found:
            return outcome

        print("  💾 Saving parishes to database...")
        saved = save_parishes_to_database(found, home_url, listing_url, detected.value)
        outcome.saved_count = saved
        print(f"    📊 Saved {saved} parishes")
        return outcome

    except Exception as e:
        # Keep the message short so one bad site does not flood the output.
        short_msg = str(e)[:100]
        print(f"  ❌ Error extracting from {name}: {short_msg}")

        return ExtractionResult(
            diocese_name=name,
            diocese_url=home_url,
            directory_url=listing_url,
            parishes=[],
            site_type=SiteType.GENERIC,
            success=False,
            errors=[short_msg]
        )

print("✅ Parish extraction functions loaded")

In [None]:
# Cell 3: Main Extraction Process
#
# Drives the whole run: picks the dioceses to process, extracts parishes from
# each directory page with one shared WebDriver, prints a summary, and writes
# the results to a timestamped JSON file.

# Set processing limit (you can change this)
MAX_DIOCESES_TO_PROCESS = 3  # Process 3 dioceses as a test

print(f"🚀 Starting parish data extraction...")
print(f"📊 Will process up to {MAX_DIOCESES_TO_PROCESS} dioceses")

# Get dioceses with directory URLs
dioceses_to_process = get_dioceses_with_directories(limit=MAX_DIOCESES_TO_PROCESS)

if not dioceses_to_process:
    print("❌ No dioceses with parish directory URLs found")
    print("\n🔧 Make sure you've run 02_Find_Parish_Directories.ipynb first")
else:
    print(f"📋 Found {len(dioceses_to_process)} dioceses with directory URLs")
    
    # Show what we'll process
    print(f"\n📋 Dioceses to process:")
    for i, diocese in enumerate(dioceses_to_process, 1):
        print(f"  {i}. {diocese['name']}")
        print(f"     Directory: {diocese['parish_directory_url']}")
    
    # Setup WebDriver
    driver = setup_driver()
    
    if not driver:
        print("❌ Failed to setup WebDriver")
    else:
        # One ExtractionResult per processed diocese, in processing order.
        results = []
        
        try:
            for i, diocese_info in enumerate(dioceses_to_process, 1):
                print(f"\n{'='*70}")
                print(f"Processing diocese {i}/{len(dioceses_to_process)}")
                
                result = extract_parishes_from_directory(diocese_info, driver)
                results.append(result)
                
                # Be respectful - pause between requests
                # (skipped after the last diocese, where there is no next request)
                if i < len(dioceses_to_process):
                    print(f"  ⏱️ Waiting {config.request_delay} seconds...")
                    time.sleep(config.request_delay)
        
        finally:
            # Always release the browser, even if the loop above raised.
            driver.quit()
            print("\n🧹 WebDriver closed")
        
        # Print comprehensive summary
        print(f"\n{'='*70}")
        print(f"📊 EXTRACTION SUMMARY")
        print(f"{'='*70}")
        
        total_parishes = sum(len(r.parishes) for r in results)
        successful_extractions = sum(1 for r in results if r.success)
        # saved_count is only assigned when parishes were saved, so fall back
        # to 0 for results that never got the attribute.
        total_saved = sum(getattr(r, 'saved_count', 0) for r in results)
        
        print(f"Total dioceses processed: {len(results)}")
        print(f"Successful extractions: {successful_extractions}")
        print(f"Total parishes found: {total_parishes}")
        print(f"Total parishes saved: {total_saved}")
        
        # Guarded so the average never divides by zero; note the success rate
        # line is also omitted when nothing succeeded.
        if successful_extractions > 0:
            print(f"Average parishes per diocese: {total_parishes/successful_extractions:.1f}")
            print(f"Success rate: {successful_extractions/len(results)*100:.1f}%")
        
        # Show site types detected
        # (counted over successful extractions only)
        site_types = {}
        for result in results:
            if result.success:
                site_type = result.site_type.value
                site_types[site_type] = site_types.get(site_type, 0) + 1
        
        if site_types:
            print(f"\n🔍 Website Types Detected:")
            for site_type, count in site_types.items():
                print(f"  {site_type.replace('_', ' ').title()}: {count} dioceses")
        
        # Show detailed results
        print(f"\n📋 Detailed Results:")
        for result in results:
            status = "✅" if result.success else "❌"
            parishes_info = f"{len(result.parishes)} parishes" if result.success else "Failed"
            # The "(N saved)" suffix is shown only when at least one row was saved.
            saved_info = f" ({getattr(result, 'saved_count', 0)} saved)" if getattr(result, 'saved_count', 0) > 0 else ""
            
            print(f"  {status} {result.diocese_name}: {parishes_info}{saved_info}")
            print(f"      Site Type: {result.site_type.value}")
            print(f"      Directory: {result.directory_url}")
            
            if result.errors:
                for error in result.errors:
                    print(f"      Error: {error}")
            
        # Show sample parishes
        # Flatten the parishes of every result into one list, in result order.
        all_parishes = []
        for result in results:
            all_parishes.extend(result.parishes)
        
        if all_parishes:
            print(f"\n🏛️ Sample Parishes Extracted:")
            # Only the first 10 are printed; optional fields are shown when set.
            for i, parish in enumerate(all_parishes[:10], 1):
                print(f"  {i}. {parish.name}")
                if parish.city:
                    print(f"     📍 {parish.city}")
                if parish.phone:
                    print(f"     📞 {parish.phone}")
                if parish.website:
                    print(f"     🌐 {parish.website}")
            
            if len(all_parishes) > 10:
                print(f"     ... and {len(all_parishes) - 10} more parishes")
        
        # Save detailed results to file
        try:
            # Timestamped name so repeated runs do not overwrite each other.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f'parish_extraction_results_{timestamp}.json'
            
            # Convert results to serializable format
            # (only a subset of each parish's fields is written to the file)
            serializable_results = []
            for result in results:
                serializable_results.append({
                    'diocese_name': result.diocese_name,
                    'diocese_url': result.diocese_url,
                    'directory_url': result.directory_url,
                    'parish_count': len(result.parishes),
                    'site_type': result.site_type.value,
                    'success': result.success,
                    'saved_count': getattr(result, 'saved_count', 0),
                    'errors': result.errors,
                    'parishes': [
                        {
                            'name': p.name,
                            'city': p.city,
                            'address': p.address,
                            'phone': p.phone,
                            'website': p.website,
                            'confidence': p.confidence
                        }
                        for p in result.parishes
                    ]
                })
            
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(serializable_results, f, indent=2, ensure_ascii=False)
            
            print(f"💾 Detailed results saved to: {filename}")
            
            # Download file in Colab
            try:
                from google.colab import files
                files.download(filename)
                print(f"⬇️ Results file downloaded")
            except ImportError:
                # google.colab is not importable here: the file stays on disk.
                print(f"📁 Results saved locally")
                
        except Exception as e:
            # Any other failure (serialization, file write, or the download
            # call itself) is reported here without aborting the cell.
            print(f"❌ Error saving results: {e}")
        
        print("\n🎉 Parish data extraction complete!")
        print("\n📊 Check your Supabase database for the extracted parish data!")