<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Enhanced_Pattern_Based_Parish_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
# =============================================================================
# CELL 1: Install and Import Dependencies (FIXED)
# =============================================================================

# Install additional dependencies for the enhanced system
!pip install supabase dataclasses-json beautifulsoup4 selenium webdriver-manager tenacity

# Complete imports including missing ones
import json
import os
import random
import re  # FIXED: Added for pattern detection
import shutil
import sqlite3
import subprocess  # FIXED: Added for Chrome installation
import time
from dataclasses import dataclass, asdict
from datetime import datetime
from enum import Enum
from typing import List, Dict, Optional, Any
from urllib.parse import urljoin, urlparse

import pandas as pd

# Web scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Your existing Supabase and AI imports
from google.colab import userdata
from supabase import create_client, Client

print("✅ All dependencies installed and imported successfully!")

✅ All dependencies installed and imported successfully!


In [56]:
# =============================================================================
# CELL 2: Configuration (Reuse your existing setup)
# =============================================================================

# Reuse your existing configuration logic
print("=== ENHANCED PARISH EXTRACTOR CONFIGURATION ===")


def _get_colab_secret(name: str) -> Optional[str]:
    """Return the Colab secret `name`, or None when it cannot be read.

    `userdata.get` raises instead of returning None when the secret does not
    exist or the notebook has not been granted access to it, so the lookup is
    wrapped here to let the configuration cell degrade gracefully.

    Args:
        name: Key of the secret in the Colab "Secrets" panel.

    Returns:
        The secret value, or None if it is missing or access was not granted.
    """
    try:
        return userdata.get(name)
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError) as secret_error:
        # Report only the error type; never echo secret material.
        print(f"⚠️ Colab secret '{name}' unavailable: {type(secret_error).__name__}")
        return None


# GitHub and database setup (copy from your existing notebooks)
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = _get_colab_secret('GitHubUserforUSCCB')
GITHUB_PAT = _get_colab_secret('GitHubPATforUSCCB')

# Supabase configuration (copy from your existing setup)
SUPABASE_URL = _get_colab_secret('SUPABASE_URL')
SUPABASE_KEY = _get_colab_secret('SUPABASE_KEY')

# `supabase` stays None when credentials are unavailable; check for None before using it.
supabase: Optional[Client] = None
if SUPABASE_URL and SUPABASE_KEY:
    supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
    print("✅ Supabase client initialized")
else:
    print("❌ Supabase credentials not found")

# Processing configuration
MAX_DIOCESES_TO_PROCESS = 1  # Start small for testing
ENABLE_PATTERN_DETECTION = True
SAVE_DETAILED_LOGS = True

# Reflect the actual flag instead of always claiming pattern detection is on.
_detection_note = 'with' if ENABLE_PATTERN_DETECTION else 'without'
print(f"📊 Will process {MAX_DIOCESES_TO_PROCESS} dioceses {_detection_note} pattern detection")

=== ENHANCED PARISH EXTRACTOR CONFIGURATION ===
✅ Supabase client initialized
📊 Will process 1 dioceses with pattern detection


In [57]:
# =============================================================================
# CELL 2.5: Chrome Installation for Google Colab (FIXED)
# =============================================================================

import subprocess  # This was missing!
import os

def ensure_chrome_installed():
    """Ensure the `google-chrome` binary is available, installing it via apt if needed.

    Looks for `google-chrome` on PATH first. When it is missing, runs the
    apt-based install sequence for `google-chrome-stable` and then verifies the
    result with `google-chrome --version`.

    Install steps are best-effort: a non-zero exit status is reported but does
    not abort the sequence, because the final verification decides the outcome.

    Returns:
        bool: True when Chrome is available, False otherwise. Unexpected errors
        are printed and reported as False rather than raised.
    """
    try:
        # In-process PATH lookup; avoids depending on an external `which` binary.
        if shutil.which('google-chrome'):
            print("✅ Chrome is already installed and available.")
            return True

        print("🔧 Chrome not found. Installing Chrome for Selenium...")

        install_commands = [
            'apt-get update > /dev/null 2>&1',
            'wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null 2>&1',
            'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list',
            'apt-get update > /dev/null 2>&1',
            'apt-get install -y google-chrome-stable > /dev/null 2>&1',
        ]
        for command in install_commands:
            # shell=True is required for the pipes/redirections; commands are constants.
            exit_status = subprocess.run(command, shell=True).returncode
            if exit_status != 0:
                print(f"⚠️ Install step exited with status {exit_status}: {command}")

        # Check PATH before invoking the binary so a failed install yields the
        # dedicated message instead of a FileNotFoundError.
        if shutil.which('google-chrome') is None:
            print("❌ Chrome installation may have failed.")
            return False

        result = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
        if result.returncode == 0:
            print(f"✅ Chrome installed successfully: {result.stdout.strip()}")
            return True

        print("❌ Chrome installation may have failed.")
        return False

    except Exception as e:
        print(f"❌ Error during Chrome installation: {e}")
        return False

# Kick off the Chrome availability check (installs Chrome when it is missing).
print("🔧 Checking Chrome installation...")
chrome_ready = ensure_chrome_installed()

if not chrome_ready:
    print("⚠️ You may need to restart the runtime if Chrome installation failed.")
else:
    print("🚀 Ready to proceed with Selenium operations!")
    # Smoke-test the binary; any failure here downgrades the status to not ready.
    try:
        result = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
        print(f"📋 Chrome version: {result.stdout.strip()}")
    except Exception as probe_error:
        print(f"⚠️ Chrome test failed: {probe_error}")
        chrome_ready = False

# Single summary line so the cell output ends with an unambiguous status.
print(f"Final Chrome status: {'✅ Ready' if chrome_ready else '❌ Not Ready'}")

🔧 Checking Chrome installation...
✅ Chrome is already installed and available.
🚀 Ready to proceed with Selenium operations!
📋 Chrome version: Google Chrome 137.0.7151.55
Final Chrome status: ✅ Ready


In [58]:
# =============================================================================
# CELL 2.6: Driver Setup Function (MISSING - NOW ADDED)
# =============================================================================

def setup_enhanced_driver():
    """Create a headless Chrome WebDriver configured for parish extraction.

    The driver runs headless with sandboxing disabled (as needed in container
    environments such as Colab), a fixed 1920x1080 window, a desktop user-agent
    string, and the automation switches/extension turned off.

    Returns:
        selenium.webdriver.Chrome: Ready-to-use driver with a 10 s implicit wait
        and a 30 s page-load timeout. The caller is responsible for `quit()`.

    Raises:
        Exception: Any error raised while creating or configuring the driver is
        printed and re-raised. A partially created driver is shut down first so
        no Chrome process is left behind.
    """
    driver = None
    try:
        print("🔧 Setting up Chrome driver...")

        # Chrome options
        chrome_options = Options()
        chrome_arguments = (
            '--headless',  # Run in background
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            '--disable-blink-features=AutomationControlled',
        )
        for argument in chrome_arguments:
            chrome_options.add_argument(argument)
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Setup driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Hide navigator.webdriver on every document. A one-off execute_script
        # only patches the current page and is lost on the first navigation,
        # so the override is registered to run before each new document loads.
        driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {'source': "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )

        # Set timeouts
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(30)

        print("✅ Chrome driver setup complete")
        return driver

    except Exception as e:
        print(f"❌ Failed to setup driver: {e}")
        if driver is not None:
            # Best-effort cleanup; the original error is the one worth surfacing.
            try:
                driver.quit()
            except Exception as cleanup_error:
                print(f"⚠️ Could not shut down partially created driver: {cleanup_error}")
        raise

print("✅ Driver setup function added")

✅ Driver setup function added


In [59]:
# =============================================================================
# CELL 3: Enhanced Pattern Detection Classes (UPDATED for detailed extraction)
# =============================================================================

import re  # This was also missing!

class DiocesePlatform(Enum):
    """CMS / hosting platform that a diocese website appears to be built on.

    Values are the lower-case string identifiers of each platform.
    In the visible notebook, PatternDetector._detect_platform returns every
    member except STATIC_HTML and UNKNOWN, and uses CUSTOM_CMS as its fallback.
    """
    SQUARESPACE = "squarespace"
    WORDPRESS = "wordpress"
    DRUPAL = "drupal"
    CUSTOM_CMS = "custom"
    STATIC_HTML = "static"
    ECATHOLIC = "ecatholic"  # eCatholic-hosted sites (original note: sites like Tulsa)
    DIOCESAN_CUSTOM = "diocesan_custom"  # Bespoke diocese sites (original note: like Salt Lake City)
    UNKNOWN = "unknown"

class ParishListingType(Enum):
    """How a diocese website presents its list of parishes.

    Values are lower-case string identifiers. In the visible notebook,
    PatternDetector._detect_listing_type falls back to SIMPLE_LIST and never
    returns SEARCHABLE_DIRECTORY, PDF_DIRECTORY or UNKNOWN.
    """
    INTERACTIVE_MAP = "interactive_map"
    STATIC_TABLE = "static_table"
    CARD_GRID = "card_grid"
    SIMPLE_LIST = "simple_list"
    PAGINATED_LIST = "paginated_list"
    SEARCHABLE_DIRECTORY = "searchable_directory"
    PARISH_FINDER = "parish_finder"  # Interactive parish-finder widgets
    DIOCESE_CARD_LAYOUT = "diocese_card_layout"  # Card layout (original note: Salt Lake City style)
    PDF_DIRECTORY = "pdf_directory"
    UNKNOWN = "unknown"

@dataclass
class ParishData:
    """One extracted parish plus metadata about how it was obtained.

    Only `name` is required. All other content fields default to None, and the
    metadata fields default to neutral values (confidence 0.5, method "unknown",
    detail extraction not successful).

    NOTE(review): `address`/`pastor`/`mass_times` overlap in meaning with
    `full_address`/`clergy_info`/`service_times`; EnhancedDiocesesCardExtractor
    only fills the latter group. Confirm which set downstream code reads.
    """
    # Basic listing information
    name: str
    address: Optional[str] = None
    city: Optional[str] = None
    state: Optional[str] = None
    zip_code: Optional[str] = None
    phone: Optional[str] = None
    website: Optional[str] = None
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    pastor: Optional[str] = None
    mass_times: Optional[str] = None
    # Fields filled from a parish's own detail page
    street_address: Optional[str] = None
    full_address: Optional[str] = None
    parish_detail_url: Optional[str] = None
    clergy_info: Optional[str] = None
    service_times: Optional[str] = None
    # Extraction metadata
    confidence_score: float = 0.5
    extraction_method: str = "unknown"
    diocese_url: Optional[str] = None
    parish_directory_url: Optional[str] = None
    # Whether the detail-page step succeeded, and the error message if it did not
    detail_extraction_success: bool = False
    detail_extraction_error: Optional[str] = None

@dataclass
class DioceseSitePattern:
    """Result of analysing a diocese page: what it is and how to extract from it.

    Instances are built by PatternDetector.detect_pattern.
    """
    platform: DiocesePlatform  # Detected CMS / hosting platform
    listing_type: ParishListingType  # Detected presentation of the parish list
    confidence_score: float  # Confidence assigned to the chosen strategy (0.4-0.95 in PatternDetector)
    extraction_method: str  # Strategy identifier, e.g. "table_extraction"
    specific_selectors: Dict[str, str]  # Logical role -> CSS selector string for that strategy
    javascript_required: bool  # True when the page shows JavaScript-framework/map markers
    # NOTE(review): not set by PatternDetector in the visible code — confirm who populates it.
    pagination_pattern: Optional[str] = None
    notes: str = ""  # Human-readable explanation of the chosen strategy

class PatternDetector:
    """Detects patterns in diocese websites for targeted extraction.

    Every check is a substring or regex heuristic over the lower-cased HTML and
    the page URL. URL checks are case-insensitive.
    """

    def detect_pattern(self, html_content: str, url: str) -> DioceseSitePattern:
        """Analyze website content and detect the best extraction pattern.

        Args:
            html_content: Raw HTML of the parish directory page. None or empty
                is treated as an empty document.
            url: URL the HTML was loaded from. None is treated as "".

        Returns:
            DioceseSitePattern describing platform, listing type, JavaScript
            requirement, and the chosen extraction strategy with its selectors.
        """
        # Treat missing input as empty so the heuristics fall through to defaults.
        html_content = html_content or ""
        url = url or ""

        soup = BeautifulSoup(html_content, 'html.parser')
        html_lower = html_content.lower()

        # Platform detection
        platform = self._detect_platform(html_lower, url)

        # Listing type detection
        listing_type = self._detect_listing_type(html_lower, soup, url)

        # JavaScript requirement
        js_required = self._requires_javascript(html_lower)

        # Determine extraction method and confidence
        extraction_method, confidence, selectors, notes = self._determine_extraction_strategy(
            platform, listing_type, soup, html_lower, url
        )

        return DioceseSitePattern(
            platform=platform,
            listing_type=listing_type,
            confidence_score=confidence,
            extraction_method=extraction_method,
            specific_selectors=selectors,
            javascript_required=js_required,
            notes=notes
        )

    def _detect_platform(self, html_lower: str, url: str) -> DiocesePlatform:
        """Detect CMS/platform from marker strings.

        Checks run in priority order (eCatholic, Squarespace, WordPress, Drupal,
        known custom diocese domains) and fall back to CUSTOM_CMS.
        """
        url_lower = url.lower()

        if 'ecatholic.com' in url_lower or 'ecatholic' in html_lower:
            return DiocesePlatform.ECATHOLIC
        elif 'squarespace' in html_lower:
            return DiocesePlatform.SQUARESPACE
        elif 'wp-content' in html_lower or 'wordpress' in html_lower:
            return DiocesePlatform.WORDPRESS
        elif 'drupal' in html_lower:
            return DiocesePlatform.DRUPAL
        elif 'dioslc.org' in url_lower or 'utahcatholicdiocese.org' in url_lower:
            return DiocesePlatform.DIOCESAN_CUSTOM
        else:
            return DiocesePlatform.CUSTOM_CMS

    def _detect_listing_type(self, html_lower: str, soup: BeautifulSoup, url: str) -> ParishListingType:
        """Detect how parishes are listed.

        Checks run from most to least specific; the first match wins and
        SIMPLE_LIST is the fallback.
        """
        url_lower = url.lower()

        # Check for Salt Lake City style card layout
        if ('col-lg location' in html_lower and 'card-title' in html_lower and
            'dioslc.org' in url_lower):
            return ParishListingType.DIOCESE_CARD_LAYOUT

        # Check for eCatholic parish finder pattern (like Tulsa).
        # Markers must be lower-case because they are compared against html_lower.
        finder_in_url = 'parishfinder' in url_lower or 'parish-finder' in url_lower
        finder_markers = ('findercore', 'finder.js', 'parish finder')
        if finder_in_url or any(marker in html_lower for marker in finder_markers):
            return ParishListingType.PARISH_FINDER

        # Interactive map indicators
        map_indicators = ['leaflet', 'google.maps', 'mapbox', 'parish-map', 'interactive']
        if any(indicator in html_lower for indicator in map_indicators):
            return ParishListingType.INTERACTIVE_MAP

        # Table indicators
        if soup.find('table') and ('parish' in html_lower or 'church' in html_lower):
            return ParishListingType.STATIC_TABLE

        # Card/grid layout (generic); one matching element is enough.
        if soup.find(class_=re.compile(r'(card|grid|parish-item)', re.I)):
            return ParishListingType.CARD_GRID

        # Pagination
        if any(word in html_lower for word in ['pagination', 'page-numbers', 'next-page']):
            return ParishListingType.PAGINATED_LIST

        return ParishListingType.SIMPLE_LIST

    def _requires_javascript(self, html_lower: str) -> bool:
        """Check if JavaScript is required.

        Returns True when any known framework/map/AJAX marker appears anywhere
        in the lower-cased HTML (plain substring match).
        """
        js_indicators = ['react', 'angular', 'vue', 'leaflet', 'google.maps', 'ajax', 'finder.js']
        return any(indicator in html_lower for indicator in js_indicators)

    def _determine_extraction_strategy(self, platform, listing_type, soup, html_lower, url):
        """Determine the best extraction strategy.

        Returns:
            Tuple of (extraction_method, confidence, selectors, notes), where
            `selectors` maps a logical role to a CSS selector string. Listing
            type takes precedence over platform; anything unrecognised falls
            back to "generic_extraction" with confidence 0.4.
        """

        if listing_type == ParishListingType.DIOCESE_CARD_LAYOUT:
            return (
                "diocese_card_extraction_with_details",
                0.95,
                {
                    "parish_cards": ".col-lg.location",
                    "parish_name": ".card-title",
                    "parish_city": ".card-body",
                    "parish_link": "a.card"
                },
                "Diocese card layout detected - specialized extraction for Salt Lake City style with detail page navigation"
            )

        elif listing_type == ParishListingType.PARISH_FINDER:
            return (
                "parish_finder_extraction",
                0.95,
                {
                    "parish_list": ".site, li.site",
                    "parish_name": ".name",
                    "parish_city": ".city",
                    "parish_info": ".siteInfo",
                    "parish_details": ".details"
                },
                "Parish finder interface detected - specialized extraction for interactive directory"
            )

        elif listing_type == ParishListingType.INTERACTIVE_MAP:
            return (
                "interactive_map_extraction",
                0.9,
                {"map_container": "#map, .map-container, .parish-map"},
                "Interactive map detected - will extract from JS data and markers"
            )

        elif listing_type == ParishListingType.STATIC_TABLE:
            return (
                "table_extraction",
                0.95,
                {"table": "table", "rows": "tr:not(:first-child)"},
                "HTML table detected - most reliable extraction method"
            )

        elif platform == DiocesePlatform.SQUARESPACE:
            return (
                "squarespace_extraction",
                0.8,
                {"items": ".summary-item, .parish-item", "title": ".summary-title"},
                "SquareSpace platform - using platform-specific selectors"
            )

        else:
            return (
                "generic_extraction",
                0.4,
                {"containers": "[class*='parish'], [class*='church']"},
                "Using generic extraction patterns"
            )

print("✅ Enhanced pattern detection classes loaded with detailed extraction support")

✅ Enhanced pattern detection classes loaded with detailed extraction support


In [60]:
# =============================================================================
# CELL 4: Enhanced DiocesesCardExtractor with Detail Page Navigation (UPDATED)
# =============================================================================

class EnhancedDiocesesCardExtractor(BaseExtractor):
    """Enhanced extractor that visits each parish card's detail page for more data.

    Basic fields (name, city, state, detail URL) come from the directory card;
    address, phone, website, clergy and service times come from the detail page
    loaded through the Selenium driver.

    NOTE(review): depends on `BaseExtractor` (constructor taking the pattern,
    and a `clean_text` method), which is not defined above this cell in the
    visible notebook. Confirm an earlier cell defines it so that
    Restart & Run All works.
    """

    # Seconds to pause after loading a detail page before reading its source.
    DETAIL_PAGE_WAIT_SECONDS = 2

    # Lower-case substrings marking card titles that are not parishes.
    _SKIP_TERMS = (
        'no parish registration', 'contact', 'chancery', 'pastoral center',
        'tv mass', 'directory', 'finder', 'diocese', 'bishop', 'office',
    )

    # href prefixes that do not lead to a navigable detail page.
    _NON_NAVIGABLE_PREFIXES = ('#', 'javascript:', 'mailto:', 'tel:')

    # A number followed later by a street-type word. The word boundaries stop
    # "st"/"dr"/"rd" inside ordinary words from making any numbered line
    # (e.g. a phone entry) look like an address.
    _STREET_ADDRESS_RE = re.compile(
        r'\d+.*\b(?:street|st|avenue|ave|road|rd|drive|dr|way|lane|ln|boulevard|blvd|'
        r'parkway|pkwy|highway|hwy|broadway|court|ct|circle|cir|place|pl)\b',
        re.I,
    )

    # US ZIP code, optionally ZIP+4.
    _ZIP_CODE_RE = re.compile(r'\b\d{5}(?:-\d{4})?\b')

    def __init__(self, pattern: DioceseSitePattern):
        """Store the site pattern and reset the detail-extraction counters."""
        super().__init__(pattern)
        self.detail_extraction_count = 0
        self.detail_extraction_errors = 0

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Extract every parish card found in `soup`, enriching each from its detail page.

        Args:
            driver: Selenium WebDriver used to load detail pages. It is left on
                the last detail page visited.
            soup: Parsed HTML of the directory page.
            url: URL of the directory page, used to resolve relative links.

        Returns:
            List of ParishData. Errors are printed and never raised; cards that
            cannot be parsed are skipped.
        """
        parishes = []

        try:
            print("    📍 Enhanced diocese card layout detected - extracting with detail pages")

            # A CSS selector matches the two classes regardless of their order or
            # of extra classes; find_all(class_='col-lg location') would only
            # match that exact attribute string.
            parish_cards = soup.select('div.col-lg.location')
            print(f"    📊 Found {len(parish_cards)} parish cards")

            for i, card in enumerate(parish_cards, 1):
                try:
                    print(f"    🔄 Processing parish {i}/{len(parish_cards)}")
                    parish_data = self._extract_parish_from_card_with_details(card, url, driver, i)
                    if parish_data:
                        parishes.append(parish_data)
                        if parish_data.detail_extraction_success:
                            self.detail_extraction_count += 1
                        else:
                            self.detail_extraction_errors += 1
                except Exception as e:
                    print(f"    ⚠️ Error extracting from card {i}: {str(e)[:100]}...")
                    self.detail_extraction_errors += 1
                    continue

            print(f"    📊 Summary: {self.detail_extraction_count} detailed extractions successful, {self.detail_extraction_errors} failed")

        except Exception as e:
            print(f"    ⚠️ Enhanced diocese card extraction error: {str(e)[:100]}...")

        return parishes

    def _extract_parish_from_card_with_details(self, card, base_url: str, driver, card_number: int) -> Optional[ParishData]:
        """Extract parish data from a single card and navigate to its detail page.

        Returns:
            ParishData for the card, or None when the card has no link/title,
            the name is too short, the name matches a skip term, or parsing fails.
        """
        try:
            # Step 1: Extract basic information from the card
            card_link = card.find('a', class_='card')
            if not card_link:
                return None

            # Extract parish name from card title
            title_elem = card_link.find('h4', class_='card-title')
            if not title_elem:
                return None

            name = self.clean_text(title_elem.get_text())
            if not name or len(name) < 3:
                return None

            # Skip non-parish entries
            name_lower = name.lower()
            if any(term in name_lower for term in self._SKIP_TERMS):
                return None

            # Extract city from card body
            city = None
            state = None
            card_body = card_link.find('div', class_='card-body')
            if card_body:
                lines = [line.strip() for line in card_body.get_text().split('\n') if line.strip()]

                # The city is usually the second line (after the parish name)
                if len(lines) >= 2:
                    city_line = lines[1]
                    if city_line and not city_line.startswith('Learn More'):
                        city = self.clean_text(city_line)

            # Extract parish detail URL. urljoin resolves every relative form
            # (root-relative, path-relative) and leaves absolute URLs untouched.
            parish_detail_url = None
            href = (card_link.get('href') or '').strip()
            if href and not href.lower().startswith(self._NON_NAVIGABLE_PREFIXES):
                parish_detail_url = urljoin(base_url, href)

            # Extract state from city if present (format: "City, ST")
            if city and ', ' in city:
                city_parts = city.split(', ')
                if len(city_parts) == 2:
                    city = city_parts[0].strip()
                    state = city_parts[1].strip()

            # Step 2: Navigate to detail page and extract additional information
            detailed_info = self._extract_details_from_parish_page(driver, parish_detail_url, name)

            # Step 3: Create comprehensive parish data object
            parish_data = ParishData(
                name=name,
                city=city,
                state=state,
                parish_detail_url=parish_detail_url,
                confidence_score=0.9,
                extraction_method="enhanced_diocese_card_extraction"
            )

            # Add detailed information if extraction was successful
            if detailed_info['success']:
                parish_data.street_address = detailed_info.get('street_address')
                parish_data.full_address = detailed_info.get('full_address')
                parish_data.zip_code = detailed_info.get('zip_code')
                parish_data.phone = detailed_info.get('phone')
                parish_data.website = detailed_info.get('website')
                parish_data.clergy_info = detailed_info.get('clergy_info')
                parish_data.service_times = detailed_info.get('service_times')
                parish_data.detail_extraction_success = True
                parish_data.confidence_score = 0.95  # Higher confidence with detailed info
                print(f"      ✅ {name}: Complete details extracted")
            else:
                parish_data.detail_extraction_success = False
                parish_data.detail_extraction_error = detailed_info.get('error')
                print(f"      ⚠️ {name}: Basic info only - {detailed_info.get('error', 'Unknown error')}")

            return parish_data

        except Exception as e:
            print(f"    ⚠️ Error parsing card {card_number}: {str(e)[:50]}...")
            return None

    def _extract_details_from_parish_page(self, driver, parish_url: str, parish_name: str) -> Dict:
        """Navigate to parish detail page and extract detailed information.

        Returns:
            Dict with 'success' True plus the detail fields (each possibly None),
            or {'success': False, 'error': <message>} when there is no URL or
            loading/parsing the page raised.
        """

        if not parish_url:
            return {'success': False, 'error': 'No detail URL available'}

        try:
            print(f"      🔗 Navigating to: {parish_url}")

            # Navigate to the parish detail page
            driver.get(parish_url)
            time.sleep(self.DETAIL_PAGE_WAIT_SECONDS)  # Wait for page to load

            # Get the page source and parse it
            detail_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Initialize result dictionary
            result = {
                'success': True,
                'street_address': None,
                'full_address': None,
                'zip_code': None,
                'phone': None,
                'website': None,
                'clergy_info': None,
                'service_times': None
            }

            # Each helper fills its own keys in `result` and swallows its own errors.
            self._extract_contact_info(detail_soup, result)
            self._extract_service_times(detail_soup, result)
            self._extract_clergy_info(detail_soup, result)

            return result

        except Exception as e:
            error_msg = f"Failed to extract details: {str(e)[:100]}"
            print(f"      ❌ {parish_name}: {error_msg}")
            return {'success': False, 'error': error_msg}

    def _extract_contact_info(self, soup: BeautifulSoup, result: Dict):
        """Fill phone, website and address keys of `result` from contact sections.

        Sections are elements whose class contains "contact" plus `ul.fa-ul`
        lists. For each field, the first section that yields a value wins.
        """
        try:
            # Look for contact info section
            contact_sections = soup.find_all(['div', 'section'], class_=re.compile(r'contact', re.I))

            # Also look for FA icons which often indicate contact info
            fa_ul_sections = soup.find_all('ul', class_='fa-ul')

            for section in contact_sections + fa_ul_sections:
                text_content = section.get_text()

                # Extract phone number
                if not result['phone']:
                    phone_links = section.find_all('a', href=re.compile(r'^tel:'))
                    if phone_links:
                        phone = phone_links[0].get_text().strip()
                        result['phone'] = self.clean_text(phone)
                    else:
                        # Look for phone patterns in text
                        phone_match = re.search(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text_content)
                        if phone_match:
                            result['phone'] = phone_match.group()

                # Extract website
                if not result['website']:
                    for link in section.find_all('a', href=re.compile(r'^http')):
                        href = link.get('href', '')
                        # Skip social media and diocese links
                        if not any(skip in href.lower() for skip in ['facebook', 'twitter', 'instagram', 'dioslc.org']):
                            result['website'] = href
                            break

                # Extract address
                if not result['full_address']:
                    address_lines = []

                    for li in section.find_all('li'):
                        li_text = li.get_text().strip()
                        # Either a number followed by a street-type word, or a
                        # "number word ..., ..." shape such as
                        # "331 East South Temple Street, Salt Lake City, UT 84111"
                        looks_like_street = self._STREET_ADDRESS_RE.search(li_text)
                        looks_like_full = re.search(r'\d+\s+[A-Za-z]', li_text) and ',' in li_text
                        if looks_like_street or looks_like_full:
                            address_lines.append(li_text)

                    if address_lines:
                        full_address = address_lines[0]
                        result['full_address'] = full_address

                        # Try to parse street address and zip code
                        self._parse_address_components(full_address, result)

        except Exception as e:
            print(f"        ⚠️ Error extracting contact info: {str(e)[:50]}")

    def _parse_address_components(self, full_address: str, result: Dict):
        """Parse full address into street address and zip code (stored in `result`)."""
        try:
            # The ZIP code sits at the end of an address, so take the last
            # 5-digit match; the first one may be a 5-digit house number.
            zip_matches = self._ZIP_CODE_RE.findall(full_address)
            if zip_matches:
                result['zip_code'] = zip_matches[-1]

            # Street address is everything before the first comma, provided it
            # contains a number.
            potential_street = full_address.split(',')[0].strip()
            if re.search(r'\d+', potential_street):
                result['street_address'] = potential_street

        except Exception as e:
            print(f"        ⚠️ Error parsing address: {str(e)[:30]}")

    def _extract_service_times(self, soup: BeautifulSoup, result: Dict):
        """Fill `result['service_times']` with up to five schedule-looking lines.

        Candidates are div/section elements whose own text matches a
        schedule heading, followed by the ul/div sibling after each matching
        h3/h4 header. The first candidate that yields schedule lines wins.
        """
        try:
            heading_pattern = re.compile(r'service.*times|mass.*times|masses|schedule', re.I)

            # Look for service times section
            service_sections = soup.find_all(['div', 'section'], string=heading_pattern)

            # Also look for h3 headers that might indicate service times
            service_headers = soup.find_all(['h3', 'h4'], string=heading_pattern)

            # Look for lists that might contain service information
            service_lists = []
            for header in service_headers:
                next_sibling = header.find_next_sibling(['ul', 'div'])
                if next_sibling:
                    service_lists.append(next_sibling)

            schedule_keywords = ['sunday', 'saturday', 'daily', 'mass', 'service', 'am', 'pm']

            for section in service_sections + service_lists:
                if section:
                    service_text = section.get_text()
                    # Extract meaningful service time information
                    lines = [line.strip() for line in service_text.split('\n') if line.strip()]
                    # Filter out very short lines and focus on schedule information
                    schedule_lines = [line for line in lines if len(line) > 10 and
                                      any(keyword in line.lower() for keyword in schedule_keywords)]

                    if schedule_lines:
                        result['service_times'] = '; '.join(schedule_lines[:5])  # Limit to first 5 lines
                        break

        except Exception as e:
            print(f"        ⚠️ Error extracting service times: {str(e)[:50]}")

    def _extract_clergy_info(self, soup: BeautifulSoup, result: Dict):
        """Fill `result['clergy_info']` with up to three distinct clergy entries.

        Scans h4/h5 elements (class containing "title" or "name") inside
        clergy/pastor/priest sections and directory/card containers, keeping
        those whose text contains a clergy title word.
        """
        try:
            # Look for clergy section
            clergy_sections = soup.find_all(['div', 'section'], class_=re.compile(r'clergy|pastor|priest', re.I))

            # Also look for directory cards that might contain clergy info
            directory_cards = soup.find_all(['div'], class_=re.compile(r'directory|card', re.I))

            clergy_words = ['reverend', 'father', 'pastor', 'deacon', 'rev.', 'fr.', 'dcn.']

            clergy_info = []
            for section in clergy_sections + directory_cards:
                # Look for names and titles
                for title in section.find_all(['h4', 'h5'], class_=re.compile(r'title|name', re.I)):
                    title_text = title.get_text().strip()
                    # Check if this looks like a clergy title
                    if not any(clergy_word in title_text.lower() for clergy_word in clergy_words):
                        continue

                    # Get associated role/description
                    role_elem = title.find_next_sibling(['p', 'div'])
                    role_text = role_elem.get_text().strip() if role_elem else ""

                    entry = f"{title_text}: {role_text}" if role_text else title_text

                    # Nested matching containers return the same heading more
                    # than once; keep each entry a single time so duplicates do
                    # not use up the three-entry limit.
                    if entry not in clergy_info:
                        clergy_info.append(entry)

            if clergy_info:
                result['clergy_info'] = '; '.join(clergy_info[:3])  # Limit to first 3 clergy members

        except Exception as e:
            print(f"        ⚠️ Error extracting clergy info: {str(e)[:50]}")

# Rebind the DiocesesCardExtractor name to the enhanced class, so any code that
# looks up DiocesesCardExtractor after this cell has run gets the version that
# navigates to detail pages.
DiocesesCardExtractor = EnhancedDiocesesCardExtractor

print("✅ Enhanced DiocesesCardExtractor loaded with detail page navigation")

✅ Enhanced DiocesesCardExtractor loaded with detail page navigation


In [61]:
# =============================================================================
# CELL 4.5: Improved Map Extractor (INSERT AFTER CELL 4)
# =============================================================================

class ImprovedInteractiveMapExtractor(BaseExtractor):
    """Extract parish records from pages that present parishes on a JavaScript map.

    Strategies are tried in order, and each later one runs only while nothing
    has been found yet:

    1. read well-known global JavaScript variables from the live page,
    2. parse JSON arrays embedded in inline ``<script>`` tags,
    3. click map markers and parse the popup text (only when a map container
       was located).
    """

    # Cap on the number of markers clicked by _extract_from_markers. Every
    # click sleeps for about 1.5 s, so an unbounded loop could stall on maps
    # with many markers.
    MAX_MARKERS_TO_CLICK = 5

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Run the extraction strategies against the page held by ``driver``.

        Args:
            driver: Selenium WebDriver; it is queried as-is (this method never
                navigates), so it is expected to already be on the target page.
            soup: Parsed HTML used for the script-tag strategy.
            url: Accepted for interface compatibility; not used by this method.

        Returns:
            The parishes found by the first strategy that yields any; possibly
            empty. Errors are logged and swallowed so a list is always returned.
        """
        parishes = []

        try:
            # Try to find map containers with more flexible selectors
            map_selectors = [
                "#map", ".map", ".parish-map", ".church-map",
                "[id*='map']", "[class*='map']",
                "#parish-finder", ".parish-finder"
            ]

            map_found = False
            for selector in map_selectors:
                try:
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                    )
                except Exception:
                    # Timeout or lookup failure for this selector: try the next.
                    # (Exception, not a bare except, so Ctrl-C still interrupts.)
                    continue
                map_found = True
                print(f"    📍 Found map container: {selector}")
                break

            if not map_found:
                print(f"    ℹ️ No map container found, trying direct JS extraction...")

            # Method 1: Extract from JavaScript variables (works even without visible map)
            parishes.extend(self._extract_from_js_variables(driver))

            # Method 2: Look for parish data in script tags
            if not parishes:
                parishes.extend(self._extract_from_script_tags(soup))

            # Method 3: Extract from map markers (only if map found)
            if not parishes and map_found:
                parishes.extend(self._extract_from_markers(driver))

        except Exception as e:
            print(f"    ℹ️ Map extraction completed with info: {str(e)[:100]}...")

        return parishes

    def _extract_from_script_tags(self, soup: BeautifulSoup) -> List[ParishData]:
        """Extract parish data from JSON arrays embedded in inline script tags.

        A script is inspected only if it mentions one of a few keywords. Inside
        it, text such as ``parishes = [`` or ``markers: [`` marks the start of
        a candidate array, which is decoded with ``JSONDecoder.raw_decode`` so
        that arrays containing nested arrays are read completely. Candidates
        that are not valid JSON are skipped. Scanning stops at the first
        pattern, and the first script, that produces at least one parish.
        """
        parishes = []

        # Prefixes that commonly precede an inline array; the lookahead leaves
        # the match ending exactly at the opening bracket.
        array_prefixes = [
            r'parishes\s*[:=]\s*(?=\[)',
            r'locations\s*[:=]\s*(?=\[)',
            r'markers\s*[:=]\s*(?=\[)',
            r'churches\s*[:=]\s*(?=\[)'
        ]
        decoder = json.JSONDecoder()

        try:
            for script in soup.find_all('script'):
                script_content = script.string
                if not script_content:
                    continue

                # Cheap keyword filter before running the regexes
                lowered = script_content.lower()
                if not any(keyword in lowered for keyword in
                           ['parish', 'church', 'location', 'marker']):
                    continue

                for prefix in array_prefixes:
                    for match in re.finditer(prefix, script_content):
                        try:
                            # raw_decode stops at the end of the first JSON
                            # value and ignores whatever follows (e.g. ";").
                            data, _ = decoder.raw_decode(script_content[match.end():])
                        except (ValueError, RecursionError):
                            # Not JSON (e.g. a JS literal with unquoted keys)
                            continue

                        if not isinstance(data, list):
                            continue

                        for item in data:
                            parish = self._parse_js_parish_object(item)
                            if parish:
                                parishes.append(parish)

                    if parishes:
                        break

                if parishes:
                    break

        except Exception as e:
            print(f"    ℹ️ Script tag extraction info: {str(e)[:50]}...")

        return parishes

    def _extract_from_js_variables(self, driver) -> List[ParishData]:
        """Extract parishes from commonly used global JavaScript variables.

        Each candidate name is read from ``window``; the first one holding a
        non-empty list from which at least one parish can be parsed wins.
        Failures for an individual name are ignored so the rest are still tried.
        """
        parishes = []

        # Expanded list of common variable names
        js_vars = [
            "parishes", "parishData", "locations", "markers", "churchData",
            "parishList", "churches", "mapData", "data", "items",
            "parishInfo", "churchInfo", "mapMarkers", "points"
        ]

        for var_name in js_vars:
            try:
                # The name is passed as a script argument instead of being
                # interpolated into the JavaScript source.
                js_data = driver.execute_script("return window[arguments[0]];", var_name)

                if not isinstance(js_data, list) or not js_data:
                    continue

                print(f"    📊 Found data in window.{var_name}: {len(js_data)} items")

                for item in js_data:
                    parish = self._parse_js_parish_object(item)
                    if parish:
                        parishes.append(parish)

                if parishes:
                    break

            except Exception:
                continue

        return parishes

    @staticmethod
    def _first_text(data: Dict, fields) -> Optional[str]:
        """Return the stripped ``str()`` of the first truthy value among ``fields``, or None."""
        for field in fields:
            value = data.get(field)
            if value:
                return str(value).strip()
        return None

    @staticmethod
    def _first_coordinate(data: Dict, fields) -> Optional[float]:
        """Return the first value among ``fields`` that converts to float, or None.

        ``0`` is a valid coordinate and is kept; ``None``, empty strings and
        values that cannot be converted are skipped instead of raising.
        """
        for field in fields:
            raw = data.get(field)
            if raw is None or raw == "":
                continue
            try:
                return float(raw)
            except (TypeError, ValueError):
                continue
        return None

    def _parse_js_parish_object(self, data: Dict) -> Optional[ParishData]:
        """Build a ParishData from one JavaScript/JSON object.

        Args:
            data: A decoded object; anything that is not a dict is rejected.

        Returns:
            A ParishData with confidence 0.8, or None when no usable name
            (at least 3 characters) is present or the name looks like a UI
            element (contains 'finder', 'directory', 'map', 'search', 'filter').
        """
        if not isinstance(data, dict):
            return None

        name = self._first_text(data, ['name', 'title', 'parishName', 'churchName', 'parish_name',
                                       'church_name', 'label', 'text', 'Name', 'Title'])

        if not name or len(name) < 3:
            return None

        # Skip non-parish entries
        lowered_name = name.lower()
        if any(skip_word in lowered_name for skip_word in
               ['finder', 'directory', 'map', 'search', 'filter']):
            return None

        return ParishData(
            name=name,
            address=self._first_text(data, ['address', 'location', 'fullAddress', 'street', 'addr']),
            phone=self._first_text(data, ['phone', 'telephone', 'phoneNumber', 'tel', 'Phone']),
            website=self._first_text(data, ['website', 'url', 'link', 'web', 'Website', 'URL']),
            latitude=self._first_coordinate(data, ['lat', 'latitude', 'Lat']),
            longitude=self._first_coordinate(data, ['lng', 'longitude', 'lon', 'Lng']),
            confidence_score=0.8,
            extraction_method="improved_js_extraction"
        )

    def _extract_from_markers(self, driver) -> List[ParishData]:
        """Extract parishes by clicking map markers and reading their popups.

        Only the first ``MAX_MARKERS_TO_CLICK`` markers of the first selector
        that matches anything are clicked. A popup whose text was already seen
        is ignored, so a popup left open from a previous click is not recorded
        twice. Per-marker failures are skipped.
        """
        parishes = []

        try:
            # More flexible marker selectors
            marker_selectors = [
                ".marker", ".leaflet-marker", ".map-marker",
                "[class*='marker']", ".gm-style-iw", ".mapboxgl-marker"
            ]

            markers = []
            for selector in marker_selectors:
                try:
                    found_markers = driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception:
                    continue
                if found_markers:
                    markers = found_markers
                    print(f"    📍 Found {len(markers)} markers using {selector}")
                    break

            if not markers:
                print(f"    ℹ️ No clickable markers found")
                return parishes

            # Look for popup content with multiple selectors
            popup_selectors = [
                ".popup", ".info-window", ".mapboxgl-popup",
                ".leaflet-popup", ".gm-style-iw-d"
            ]

            seen_popup_texts = set()

            # Limit markers to avoid timeout
            for marker in markers[:self.MAX_MARKERS_TO_CLICK]:
                try:
                    # Scroll marker into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", marker)
                    time.sleep(0.5)

                    # Click via JavaScript
                    driver.execute_script("arguments[0].click();", marker)
                    time.sleep(1)

                    popup_text = None
                    for popup_selector in popup_selectors:
                        try:
                            popup_text = driver.find_element(By.CSS_SELECTOR, popup_selector).text
                            break
                        except Exception:
                            continue

                    if not popup_text or len(popup_text) <= 10:
                        continue
                    if popup_text in seen_popup_texts:
                        continue
                    seen_popup_texts.add(popup_text)

                    parish_data = self._parse_popup_content(popup_text)
                    if parish_data:
                        parishes.append(parish_data)

                except Exception:
                    continue

        except Exception as e:
            print(f"    ℹ️ Marker extraction completed: {str(e)[:50]}...")

        return parishes

    def _parse_popup_content(self, popup_text: str) -> Optional[ParishData]:
        """Parse parish information from the text of a marker popup.

        The first non-blank line is taken as the name and must contain a
        parish-like indicator word. Each remaining line is treated as a phone
        number if ``self.extract_phone`` returns something for it, otherwise as
        an address if it looks like a street address; later lines overwrite
        earlier ones.

        Returns:
            A ParishData with confidence 0.6, or None if the text is empty or
            the first line does not look like a parish name.
        """
        lines = [line.strip() for line in popup_text.split('\n') if line.strip()]

        if not lines:
            return None

        name = lines[0]  # First line is usually the name

        # Skip if it doesn't look like a parish name
        lowered_name = name.lower()
        if not any(indicator in lowered_name for indicator in
                  ['parish', 'church', 'st.', 'saint', 'our lady', 'holy', 'cathedral']):
            return None

        address = None
        phone = None

        # Look for address and phone in remaining lines
        for line in lines[1:]:
            # extract_phone is not defined in this class (presumably provided
            # by BaseExtractor); call it once per line and reuse the result.
            found_phone = self.extract_phone(line)
            if found_phone:
                phone = found_phone
            elif re.search(r'\d+.*(?:street|st|avenue|ave|road|rd|drive|dr)', line, re.I):
                address = line

        return ParishData(
            name=name,
            address=address,
            phone=phone,
            confidence_score=0.6,
            extraction_method="marker_popup_extraction"
        )

# ImprovedInteractiveMapExtractor is meant to be used in place of
# InteractiveMapExtractor in the main processing function.
# NOTE(review): nothing in this cell rebinds the InteractiveMapExtractor name, so
# the replacement only takes effect where callers reference the new class
# directly — confirm against the processing cell.
print("✅ Improved map extractor loaded with better error handling")

✅ Improved map extractor loaded with better error handling


In [62]:
# =============================================================================
# CELL 5: Enhanced Database Integration Functions (COMPLETE - UPDATED for detailed parish data)
# =============================================================================

def prepare_parish_for_supabase(parish_data: ParishData, diocese_name: str, diocese_url: str, parish_directory_url: str) -> Dict:
    """Build the row dict for one parish in the layout used by the Supabase schema.

    Args:
        parish_data: Extracted parish record.
        diocese_name: Accepted for signature compatibility; not written to the row.
        diocese_url: Stored under ``diocese_url``.
        parish_directory_url: Stored under ``parish_directory_url``.

    Returns:
        A dict with the core columns ('Name', 'Status', 'Street Address', ...),
        the source URLs, and extraction metadata, stamped with the current
        local time under ``extracted_at``. Missing values are left as None.
    """
    # Address preference: street address, then full address, then the generic
    # address field (whose value is used even if it is empty).
    best_address = parish_data.street_address
    if not best_address:
        best_address = parish_data.full_address
    if not best_address:
        best_address = parish_data.address

    row = {}

    # Core parish columns
    row['Name'] = parish_data.name
    row['Status'] = 'Parish'  # every record defaults to this status
    row['Deanery'] = None  # placeholder; nothing here populates it
    row['Street Address'] = best_address
    row['City'] = parish_data.city
    row['State'] = parish_data.state
    row['Zip Code'] = parish_data.zip_code
    row['Phone Number'] = parish_data.phone
    row['Web'] = parish_data.website

    # Where the record came from
    row.update({
        'diocese_url': diocese_url,
        'parish_directory_url': parish_directory_url,
        'parish_detail_url': parish_data.parish_detail_url,
        'extraction_method': parish_data.extraction_method,
    })

    # Extraction metadata and detail-page fields
    for column in ('confidence_score', 'detail_extraction_success',
                   'detail_extraction_error', 'clergy_info',
                   'service_times', 'full_address'):
        row[column] = getattr(parish_data, column)

    row['extracted_at'] = datetime.now().isoformat()
    return row

def enhanced_safe_upsert_to_supabase(parishes: List[ParishData], diocese_name: str, diocese_url: str, parish_directory_url: str):
    """Save extracted parishes to the Supabase ``Parishes`` table, one row each.

    Entries without a usable name (missing, not a string, or shorter than 3
    characters after stripping) and entries whose name marks them as a
    non-parish page element are skipped. Remaining entries are converted with
    ``prepare_parish_for_supabase``, stripped of None/empty values and written.
    A failure for one parish is logged and does not stop the others.

    NOTE: despite the function name, rows are written with ``insert``, so
    running the same extraction twice adds the rows twice.

    Args:
        parishes: Extracted parish records.
        diocese_name: Forwarded to ``prepare_parish_for_supabase``.
        diocese_url: Forwarded to ``prepare_parish_for_supabase``.
        parish_directory_url: Forwarded to ``prepare_parish_for_supabase``.

    Returns:
        True if at least one parish was saved; False otherwise, including when
        the module-level ``supabase`` client is unavailable.
    """

    if not supabase:
        print("  ❌ Supabase not available")
        return False

    # Names containing any of these belong to page furniture, not parishes
    # (like "Parish Finder", "Contact Info", etc.)
    skip_words = ['finder', 'contact', 'chancery', 'pastoral center', 'tv mass', 'directory']

    success_count = 0
    detail_success_count = 0

    for parish in parishes:
        # Validate the name before anything calls string methods on it, so a
        # None or non-string name is reported as invalid rather than surfacing
        # as an AttributeError "save error".
        raw_name = getattr(parish, 'name', None)
        name = raw_name.strip() if isinstance(raw_name, str) else ''
        if len(name) < 3:
            print("    ⏭️ Skipped: Invalid name for parish")
            continue

        try:
            lowered_name = name.lower()
            if any(skip_word in lowered_name for skip_word in skip_words):
                print(f"    ⏭️ Skipped: {raw_name} (not a parish)")
                continue

            # Convert to your existing schema format
            supabase_data = prepare_parish_for_supabase(parish, diocese_name, diocese_url, parish_directory_url)

            # Remove None values and empty strings
            clean_data = {k: v for k, v in supabase_data.items()
                         if v is not None and v != ""}

            # The converted row must still carry a name to proceed
            if not clean_data.get('Name') or len(clean_data.get('Name', '')) < 3:
                print("    ⏭️ Skipped: Invalid name for parish")
                continue

            response = supabase.table('Parishes').insert(clean_data).execute()

            if hasattr(response, 'error') and response.error:
                print(f"    ❌ Database error for {raw_name}: {response.error}")
                continue

            success_count += 1
            if parish.detail_extraction_success:
                detail_success_count += 1
                detail_indicator = "📍" # Pin icon for detailed info
            else:
                detail_indicator = "📌" # Basic pin for basic info

            print(f"    ✅ {detail_indicator} Saved: {raw_name} (confidence: {parish.confidence_score:.2f})")

            # Show what detailed fields were captured
            if parish.detail_extraction_success:
                captured = [
                    ('address', parish.street_address),
                    ('phone', parish.phone),
                    ('zip', parish.zip_code),
                    ('website', parish.website),
                    ('clergy', parish.clergy_info),
                    ('schedule', parish.service_times),
                ]
                details = [label for label, value in captured if value]

                if details:
                    print(f"        📋 Details: {', '.join(details)}")

        except Exception as e:
            print(f"    ❌ Error saving {raw_name}: {e}")

    print(f"  📊 Successfully saved {success_count}/{len(parishes)} parishes")
    print(f"  📍 Detailed information captured for {detail_success_count}/{success_count} parishes")
    return success_count > 0

# Enhanced metadata tracking with detailed extraction statistics
def create_enhanced_extraction_metadata_record(diocese_name: str, diocese_url: str, parishes_found: List[ParishData], pattern_info: Dict):
    """Insert one summary row about an extraction run into ``extraction_metadata``.

    The row records how many parishes were found, how many had a successful
    detail extraction (count and percentage), the detected pattern taken from
    ``pattern_info`` and, per detail field, how many parishes carry a value.

    Returns:
        True when the insert reported no error; False when the module-level
        ``supabase`` client is unavailable or anything raised along the way.
    """

    if not supabase:
        return False

    def tally(has_value):
        """Count the parishes for which ``has_value`` is truthy."""
        return sum(1 for parish in parishes_found if has_value(parish))

    try:
        parish_total = len(parishes_found)
        detail_hits = tally(lambda p: p.detail_extraction_success)

        # How many parishes carry each detail field
        per_field_counts = {
            'addresses_extracted': tally(lambda p: p.street_address or p.full_address),
            'phones_extracted': tally(lambda p: p.phone),
            'websites_extracted': tally(lambda p: p.website),
            'zip_codes_extracted': tally(lambda p: p.zip_code),
            'clergy_info_extracted': tally(lambda p: p.clergy_info),
            'service_times_extracted': tally(lambda p: p.service_times)
        }

        # Guard the percentage against an empty run
        if parish_total > 0:
            detail_rate = detail_hits / parish_total * 100
        else:
            detail_rate = 0

        record = {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'parishes_count': parish_total,
            'detail_extraction_success_count': detail_hits,
            'detail_extraction_success_rate': detail_rate,
            'extraction_timestamp': datetime.now().isoformat(),
            'pattern_platform': pattern_info.get('platform'),
            'pattern_listing_type': pattern_info.get('listing_type'),
            'pattern_confidence': pattern_info.get('confidence'),
            'extraction_methods': ', '.join(pattern_info.get('methods_used', [])),
            'success': parish_total > 0,
        }
        # The per-field counters use the same names as their columns
        record.update(per_field_counts)

        # The target table has to exist in Supabase, e.g.:
        # CREATE TABLE extraction_metadata (
        #   id SERIAL PRIMARY KEY,
        #   diocese_name TEXT,
        #   diocese_url TEXT,
        #   parishes_count INTEGER,
        #   detail_extraction_success_count INTEGER,
        #   detail_extraction_success_rate FLOAT,
        #   extraction_timestamp TIMESTAMP,
        #   pattern_platform TEXT,
        #   pattern_listing_type TEXT,
        #   pattern_confidence FLOAT,
        #   extraction_methods TEXT,
        #   success BOOLEAN,
        #   addresses_extracted INTEGER,
        #   phones_extracted INTEGER,
        #   websites_extracted INTEGER,
        #   zip_codes_extracted INTEGER,
        #   clergy_info_extracted INTEGER,
        #   service_times_extracted INTEGER
        # );

        insert_result = supabase.table('extraction_metadata').insert(record).execute()
        insert_failed = hasattr(insert_result, 'error') and insert_result.error
        return not insert_failed

    except Exception as e:
        print(f"    ⚠️ Could not save extraction metadata: {e}")
        return False

# Alternative: Create a separate table for detailed parish information if needed
def create_detailed_parish_table_record(parish_data: ParishData, diocese_name: str):
    """Insert the detail fields of one parish into ``parish_detailed_info`` (optional).

    Fields whose value is None or an empty string are left out of the row;
    other falsy values such as False or 0 are kept.

    Returns:
        True when the insert reported no error; False when the module-level
        ``supabase`` client is unavailable or anything raised along the way.
    """

    if not supabase:
        return False

    try:
        # (column, value) pairs for the separate detail table, in column order
        column_values = [
            ('parish_name', parish_data.name),
            ('diocese_name', diocese_name),
            ('parish_detail_url', parish_data.parish_detail_url),
            ('full_address', parish_data.full_address),
            ('street_address', parish_data.street_address),
            ('clergy_info', parish_data.clergy_info),
            ('service_times', parish_data.service_times),
            ('extraction_timestamp', datetime.now().isoformat()),
            ('extraction_method', parish_data.extraction_method),
            ('confidence_score', parish_data.confidence_score),
            ('detail_extraction_success', parish_data.detail_extraction_success),
            ('detail_extraction_error', parish_data.detail_extraction_error),
        ]

        # Drop unset fields
        row = {}
        for column, value in column_values:
            if value is None or value == "":
                continue
            row[column] = value

        # The target table has to exist in Supabase, e.g.:
        # CREATE TABLE parish_detailed_info (
        #   id SERIAL PRIMARY KEY,
        #   parish_name TEXT,
        #   diocese_name TEXT,
        #   parish_detail_url TEXT,
        #   full_address TEXT,
        #   street_address TEXT,
        #   clergy_info TEXT,
        #   service_times TEXT,
        #   extraction_timestamp TIMESTAMP,
        #   extraction_method TEXT,
        #   confidence_score FLOAT,
        #   detail_extraction_success BOOLEAN,
        #   detail_extraction_error TEXT
        # );

        insert_result = supabase.table('parish_detailed_info').insert(row).execute()
        insert_failed = hasattr(insert_result, 'error') and insert_result.error
        return not insert_failed

    except Exception as e:
        print(f"    ⚠️ Could not save detailed parish info: {e}")
        return False

# Utility function to analyze extraction quality
def analyze_extraction_quality(parishes: List[ParishData]) -> Dict:
    """Summarise how complete a batch of extracted parish records is.

    Returns:
        ``{'error': 'No parishes to analyze'}`` for an empty batch. Otherwise a
        dict with raw counts for basic fields, detail fields and extraction
        outcomes, the same basic/detail counts as percentages of the batch,
        and ``overall_quality_score`` (0-100): the mean of the basic-field and
        detail-field coverage.
    """

    if not parishes:
        return {'error': 'No parishes to analyze'}

    parish_total = len(parishes)

    def count_where(predicate):
        """Number of parishes for which ``predicate`` is truthy."""
        return sum(1 for parish in parishes if predicate(parish))

    def as_percentages(counts):
        """Express each count as a percentage of the batch size."""
        return {
            f"{label}_percentage": (hits / parish_total * 100)
            for label, hits in counts.items()
        }

    def coverage(counts):
        """Share (0-100) of all parish/field slots in ``counts`` that are filled."""
        return sum(counts.values()) / (parish_total * len(counts)) * 100

    basic_counts = {
        'names_present': count_where(lambda p: p.name and len(p.name) > 2),
        'cities_present': count_where(lambda p: p.city),
        'states_present': count_where(lambda p: p.state)
    }

    detailed_counts = {
        'addresses_present': count_where(lambda p: p.street_address or p.full_address),
        'phones_present': count_where(lambda p: p.phone),
        'websites_present': count_where(lambda p: p.website),
        'zip_codes_present': count_where(lambda p: p.zip_code),
        'clergy_info_present': count_where(lambda p: p.clergy_info),
        'service_times_present': count_where(lambda p: p.service_times)
    }

    outcome_counts = {
        # Counts every record that has the attribute at all, whatever its value
        'detail_extractions_attempted': count_where(lambda p: hasattr(p, 'detail_extraction_success')),
        'detail_extractions_successful': count_where(lambda p: p.detail_extraction_success),
        'high_confidence_extractions': count_where(lambda p: p.confidence_score >= 0.8)
    }

    report = {
        'total_parishes': parish_total,
        'basic_info_quality': basic_counts,
        'detailed_info_quality': detailed_counts,
        'extraction_success': outcome_counts,
        'basic_info_percentages': as_percentages(basic_counts),
        'detailed_info_percentages': as_percentages(detailed_counts),
    }

    # Overall quality score (0-100): basic and detailed coverage weigh equally
    report['overall_quality_score'] = (coverage(basic_counts) + coverage(detailed_counts)) / 2

    return report

# Function to export parish data to CSV for analysis
def export_parishes_to_csv(parishes: List[ParishData], diocese_name: str, output_dir: Optional[str] = None) -> Optional[str]:
    """Export parish data to a timestamped CSV file for external analysis.

    Args:
        parishes: Records to export, one row each.
        diocese_name: Used in the file name after removing every character that
            is not alphanumeric, a space, '-' or '_'.
        output_dir: Directory to write into; it is created if missing. When
            omitted, the file is written to the current working directory and
            the returned value is the bare file name.

    Returns:
        The path of the written file, or None when ``parishes`` is empty or
        writing failed (the error is printed, not raised).
    """

    import csv
    import os

    if not parishes:
        return None

    # CSV column header -> attribute read from each parish, in output order.
    # A single table keeps header and row contents from drifting apart.
    columns = [
        ('Name', 'name'),
        ('City', 'city'),
        ('State', 'state'),
        ('Street Address', 'street_address'),
        ('Full Address', 'full_address'),
        ('Zip Code', 'zip_code'),
        ('Phone', 'phone'),
        ('Website', 'website'),
        ('Parish Detail URL', 'parish_detail_url'),
        ('Clergy Info', 'clergy_info'),
        ('Service Times', 'service_times'),
        ('Confidence Score', 'confidence_score'),
        ('Extraction Method', 'extraction_method'),
        ('Detail Extraction Success', 'detail_extraction_success'),
        ('Detail Extraction Error', 'detail_extraction_error'),
    ]

    # Create filename
    safe_diocese_name = "".join(c for c in diocese_name if c.isalnum() or c in (' ', '-', '_')).rstrip()
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"parishes_{safe_diocese_name}_{timestamp}.csv"
    if output_dir:
        filename = os.path.join(output_dir, filename)

    try:
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=[header for header, _ in columns])
            writer.writeheader()

            for parish in parishes:
                writer.writerow({header: getattr(parish, attr) for header, attr in columns})

        print(f"📁 Parish data exported to: {filename}")
        return filename

    except Exception as e:
        print(f"❌ Error exporting to CSV: {e}")
        return None

print("✅ Complete enhanced database integration functions loaded with detailed parish data support")

✅ Complete enhanced database integration functions loaded with detailed parish data support


In [63]:
# =============================================================================
# CELL 5.5: WebDriver Setup Function (ADD THIS BEFORE CELL 6)
# =============================================================================

def setup_enhanced_driver():
    """Create a headless Chrome WebDriver configured for parish extraction.

    The chromedriver binary is obtained through ChromeDriverManager. The
    returned driver has a 30 second page-load timeout and a 5 second implicit
    wait.

    Returns:
        The initialised WebDriver.

    Raises:
        Exception: whatever driver installation or start-up raised; the error
        is printed first and then re-raised unchanged.
    """

    print("🔧 Setting up enhanced Chrome WebDriver...")

    # Command-line switches, applied in this order
    browser_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        '--disable-extensions',
        '--disable-plugins',
        '--disable-images',  # intended to speed up loading
        '--disable-javascript-harmony-shipping',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        # Desktop browser user agent, to avoid being blocked
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    chrome_options = Options()
    for flag in browser_flags:
        chrome_options.add_argument(flag)

    try:
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )
        driver.set_page_load_timeout(30)
        driver.implicitly_wait(5)

        print("✅ Chrome WebDriver initialized successfully")
        return driver

    except Exception as e:
        print(f"❌ Failed to initialize WebDriver: {e}")
        raise

print("✅ WebDriver setup function loaded")

✅ WebDriver setup function loaded


In [64]:
# =============================================================================
# CELL 6: Enhanced Master Processing Function with Detail Extraction (UPDATED)
# =============================================================================

def process_diocese_with_detailed_extraction(diocese_info: Dict, driver) -> Dict:
    """
    Extract the parishes listed on one diocese's parish directory page.

    Loads the directory page with ``driver``, detects the site pattern, runs
    the extractor matching that pattern followed by fallback extractors,
    de-duplicates the parishes by name, computes extraction statistics and
    upserts the parishes via ``enhanced_safe_upsert_to_supabase``.

    Args:
        diocese_info: Dict with the keys ``'url'``, ``'name'`` and
            ``'parish_directory_url'``.
        driver: WebDriver used to load the directory page; it is also passed
            to every extractor.

    Returns:
        A result dict with the diocese identifiers, ``'timestamp'``,
        ``'pattern_detected'``, ``'parishes_found'``, ``'success'``,
        ``'extraction_methods_used'``, ``'processing_time'`` (seconds),
        ``'errors'``, ``'detail_extraction_stats'`` and
        ``'field_extraction_stats'``. ``'success'`` is True only when at
        least one uniquely named parish was kept.

    Raises:
        KeyError: If ``diocese_info`` lacks a required key. Any error raised
            after that is caught and recorded in ``result['errors']``.
    """

    diocese_url = diocese_info['url']
    diocese_name = diocese_info['name']
    parish_directory_url = diocese_info['parish_directory_url']

    print(f"\n{'='*60}")
    print(f"🔍 ENHANCED DETAILED PROCESSING: {diocese_name}")
    print(f"📍 Main URL: {diocese_url}")
    print(f"📂 Parish Directory URL: {parish_directory_url}")
    print(f"{'='*60}")

    result = {
        'diocese_name': diocese_name,
        'diocese_url': diocese_url,
        'parish_directory_url': parish_directory_url,
        'timestamp': datetime.now().isoformat(),
        'pattern_detected': None,
        'parishes_found': [],
        'success': False,
        'extraction_methods_used': [],
        'processing_time': 0,
        'errors': [],
        # Enhanced tracking
        'detail_extraction_stats': {
            'attempted': 0,
            'successful': 0,
            'failed': 0,
            'success_rate': 0.0
        },
        'field_extraction_stats': {
            'addresses_extracted': 0,
            'phones_extracted': 0,
            'websites_extracted': 0,
            'zip_codes_extracted': 0,
            'clergy_info_extracted': 0,
            'service_times_extracted': 0
        }
    }

    start_time = time.time()

    try:
        # Step 1: Load the parish directory page
        print("  📥 Loading parish directory page...")
        driver.get(parish_directory_url)
        time.sleep(3)  # Give time for JS to load

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 2: Detect pattern
        print("  🔍 Detecting website pattern...")
        detector = PatternDetector()
        pattern = detector.detect_pattern(html_content, parish_directory_url)

        result['pattern_detected'] = {
            'platform': pattern.platform.value,
            'listing_type': pattern.listing_type.value,
            'confidence': pattern.confidence_score,
            'extraction_method': pattern.extraction_method,
            'javascript_required': pattern.javascript_required,
            'notes': pattern.notes
        }

        print(f"    📋 Platform: {pattern.platform.value}")
        print(f"    📊 Listing Type: {pattern.listing_type.value}")
        print(f"    🎯 Confidence: {pattern.confidence_score:.2f}")
        print(f"    ⚙️ Method: {pattern.extraction_method}")

        # Step 3: Extract parishes with detailed information
        parishes = []

        # Try extractors in order of specificity
        extractors_to_try = []

        # Primary extractor based on detected pattern (enhanced versions)
        if pattern.listing_type == ParishListingType.DIOCESE_CARD_LAYOUT:
            extractors_to_try.append(('EnhancedDiocesesCardExtractor', EnhancedDiocesesCardExtractor(pattern)))
        elif pattern.listing_type == ParishListingType.PARISH_FINDER:
            extractors_to_try.append(('ParishFinderExtractor', ParishFinderExtractor(pattern)))
        elif pattern.listing_type == ParishListingType.STATIC_TABLE:
            extractors_to_try.append(('TableExtractor', TableExtractor(pattern)))
        elif pattern.listing_type == ParishListingType.INTERACTIVE_MAP:
            extractors_to_try.append(('ImprovedInteractiveMapExtractor', ImprovedInteractiveMapExtractor(pattern)))

        # Always add enhanced diocese card extractor as fallback for Salt Lake City style sites
        if not any(name == 'EnhancedDiocesesCardExtractor' for name, _ in extractors_to_try):
            extractors_to_try.append(('EnhancedDiocesesCardExtractor', EnhancedDiocesesCardExtractor(pattern)))

        # Add other fallback extractors
        extractors_to_try.extend([
            ('TableExtractor', TableExtractor(pattern)),
            ('ImprovedInteractiveMapExtractor', ImprovedInteractiveMapExtractor(pattern)),
            ('ImprovedGenericExtractor', ImprovedGenericExtractor(pattern))
        ])

        # Remove duplicates while preserving order
        seen_extractors = set()
        unique_extractors = []
        for name, extractor in extractors_to_try:
            if name not in seen_extractors:
                unique_extractors.append((name, extractor))
                seen_extractors.add(name)

        # Try each extractor until we find parishes
        for extractor_name, extractor in unique_extractors:
            try:
                print(f"  🔄 Trying {extractor_name}...")
                current_parishes = extractor.extract(driver, soup, parish_directory_url)

                if current_parishes:
                    parishes.extend(current_parishes)
                    result['extraction_methods_used'].append(extractor_name)
                    print(f"    ✅ {extractor_name} found {len(current_parishes)} parishes")

                    # If we found parishes with the enhanced extractor, we got detailed info
                    if extractor_name == 'EnhancedDiocesesCardExtractor' and len(parishes) > 5:
                        print(f"    🎯 Using enhanced extractor - detailed information will be extracted")
                        break

                    # If we found a good number of parishes with any method, stop
                    if len(parishes) > 10:
                        break
                else:
                    print(f"    ⚠️ {extractor_name} found no parishes")

            except Exception as e:
                # One failing extractor must not stop the fallbacks from running
                print(f"    ❌ {extractor_name} failed: {str(e)[:100]}")
                result['errors'].append(f"{extractor_name}: {str(e)[:100]}")

        # Step 4: Remove duplicates and validate
        unique_parishes = []
        seen_names = set()

        for parish in parishes:
            # A parish whose name is None is treated as nameless and dropped
            # by the length check instead of crashing the whole diocese.
            raw_name = parish.name or ''
            name_key = raw_name.lower().strip()
            if name_key not in seen_names and len(raw_name) > 2:
                # Set the source URLs for each parish
                parish.diocese_url = diocese_url
                parish.parish_directory_url = parish_directory_url
                unique_parishes.append(parish)
                seen_names.add(name_key)

        if unique_parishes:
            result['parishes_found'] = unique_parishes
            result['success'] = True

            # Calculate detailed extraction statistics. The flag is read
            # defensively for both counters: a parish lacking the attribute
            # counts as neither successful nor failed.
            total_parishes = len(unique_parishes)
            missing = object()
            detail_flags = [getattr(p, 'detail_extraction_success', missing) for p in unique_parishes]
            detailed_successful = sum(1 for flag in detail_flags if flag is not missing and flag)
            detailed_failed = sum(1 for flag in detail_flags if flag is not missing and not flag)

            result['detail_extraction_stats'] = {
                'attempted': total_parishes,
                'successful': detailed_successful,
                'failed': detailed_failed,
                'success_rate': (detailed_successful / total_parishes * 100) if total_parishes > 0 else 0
            }

            # Calculate field extraction statistics
            result['field_extraction_stats'] = {
                'addresses_extracted': sum(1 for p in unique_parishes if p.street_address or p.full_address),
                'phones_extracted': sum(1 for p in unique_parishes if p.phone),
                'websites_extracted': sum(1 for p in unique_parishes if p.website),
                'zip_codes_extracted': sum(1 for p in unique_parishes if p.zip_code),
                'clergy_info_extracted': sum(1 for p in unique_parishes if p.clergy_info),
                'service_times_extracted': sum(1 for p in unique_parishes if p.service_times)
            }

            print(f"  ✅ Found {len(unique_parishes)} unique parishes")
            print(f"  📊 Detail extraction: {detailed_successful}/{total_parishes} successful ({result['detail_extraction_stats']['success_rate']:.1f}%)")

            # Show field extraction summary
            field_stats = result['field_extraction_stats']
            print(f"  📋 Field extraction summary:")
            print(f"      📍 Addresses: {field_stats['addresses_extracted']}/{total_parishes}")
            print(f"      📞 Phones: {field_stats['phones_extracted']}/{total_parishes}")
            print(f"      🌐 Websites: {field_stats['websites_extracted']}/{total_parishes}")
            print(f"      📮 Zip Codes: {field_stats['zip_codes_extracted']}/{total_parishes}")
            print(f"      👥 Clergy Info: {field_stats['clergy_info_extracted']}/{total_parishes}")
            print(f"      ⏰ Service Times: {field_stats['service_times_extracted']}/{total_parishes}")

            # Step 5: Save to database
            print("  💾 Saving enhanced parish data to database...")
            enhanced_safe_upsert_to_supabase(unique_parishes, diocese_name, diocese_url, parish_directory_url)

        else:
            # Zero usable parishes is a failure, even if an extractor
            # returned entries that were all filtered out above.
            if parishes:
                print("  ❌ Extractors returned parishes, but none had a usable name")
            else:
                print("  ❌ No parishes found with any extraction method")
            result['success'] = False

    except Exception as e:
        error_msg = str(e)
        result['errors'].append(error_msg)
        print(f"  ❌ Processing error: {error_msg}")

    finally:
        result['processing_time'] = time.time() - start_time
        print(f"  ⏱️ Completed in {result['processing_time']:.1f}s")

    return result

print("✅ Enhanced master processing function loaded with detailed parish extraction")

✅ Enhanced master processing function loaded with detailed parish extraction


In [66]:
# =============================================================================
# CELL 7: Main Execution Using Existing Parish Directory URLs (MODIFIED TO EXCLUDE PROCESSED DIOCESES)
# =============================================================================

# Get dioceses WITH their parish directory URLs from your existing data
# Defaults so later cells always find these names, whichever branch runs below
dioceses_to_process = []
dioceses_skipped = []

# Get dioceses WITH their parish directory URLs from your existing data
if supabase:
    try:
        print("📥 Fetching dioceses with parish directory URLs from database...")

        # Rows of DiocesesParishDirectory that have a non-empty parish_directory_url
        response = supabase.table('DiocesesParishDirectory').select(
            'diocese_url, parish_directory_url'
        ).not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

        diocese_directory_data = response.data if response.data else []
        print(f"📊 Found {len(diocese_directory_data)} dioceses with parish directory URLs")

        # Get diocese names from the main table
        if diocese_directory_data:
            diocese_urls = [item['diocese_url'] for item in diocese_directory_data]

            # Get diocese names for these URLs
            diocese_names_response = supabase.table('Dioceses').select(
                'Website, Name'
            ).in_('Website', diocese_urls).execute()

            diocese_names_data = diocese_names_response.data if diocese_names_response.data else []

            # Create a mapping of URL to name
            url_to_name = {item['Website']: item['Name'] for item in diocese_names_data}

            # Check which dioceses already have 5+ parishes in the database
            print("🔍 Checking for dioceses that already have 5+ parishes extracted...")

            # Fetch every parish's parish_directory_url page by page. A single
            # select is capped by the API (1,000 rows by default on Supabase),
            # which would undercount once the Parishes table grows and cause
            # already-processed dioceses to be queued again.
            # Ordering by the counted column keeps the per-URL totals correct
            # even if rows with equal URLs swap places between requests.
            parish_page_size = 1000
            parish_counts_data = []
            while True:
                page_start = len(parish_counts_data)
                parish_page_response = supabase.table('Parishes').select(
                    'parish_directory_url'
                ).not_.is_('parish_directory_url', 'null').not_.eq(
                    'parish_directory_url', ''
                ).order('parish_directory_url').range(
                    page_start, page_start + parish_page_size - 1
                ).execute()

                page_rows = parish_page_response.data if parish_page_response.data else []
                if not page_rows:
                    break
                # Advance by what was actually returned, so a smaller
                # server-side cap cannot leave gaps between pages.
                parish_counts_data.extend(page_rows)

            # Count parishes by directory URL
            directory_url_counts = {}
            for parish in parish_counts_data:
                directory_url = parish.get('parish_directory_url')
                if directory_url:
                    directory_url_counts[directory_url] = directory_url_counts.get(directory_url, 0) + 1

            # Combine the data and filter out already processed dioceses
            first_kept_shown = False

            for item in diocese_directory_data:
                diocese_url = item['diocese_url']
                parish_directory_url = item['parish_directory_url']
                diocese_name = url_to_name.get(diocese_url, 'Unknown Diocese')

                # Check if this diocese already has 5+ parishes
                existing_parish_count = directory_url_counts.get(parish_directory_url, 0)

                diocese_entry = {
                    'name': diocese_name,
                    'url': diocese_url,
                    'parish_directory_url': parish_directory_url,
                    'existing_parish_count': existing_parish_count
                }

                if existing_parish_count >= 5:
                    dioceses_skipped.append(diocese_entry)
                    print(f"  ⏭️ SKIPPED {diocese_name}: {existing_parish_count} parishes already extracted")
                else:
                    dioceses_to_process.append(diocese_entry)
                    if not first_kept_shown:
                        print(f"  ✅ FIRST TO PROCESS: {diocese_name} ({existing_parish_count} existing parishes)")
                        first_kept_shown = True

            print(f"\n📊 FILTERING RESULTS:")
            print(f"  - Dioceses to process: {len(dioceses_to_process)}")
            print(f"  - Dioceses skipped (5+ parishes): {len(dioceses_skipped)}")

            # Take the first N dioceses instead of random sampling
            if len(dioceses_to_process) > MAX_DIOCESES_TO_PROCESS:
                print(f"📊 Taking first {MAX_DIOCESES_TO_PROCESS} dioceses from {len(dioceses_to_process)} available")
                dioceses_to_process = dioceses_to_process[:MAX_DIOCESES_TO_PROCESS]
            else:
                print(f"📊 Processing all {len(dioceses_to_process)} available dioceses")

            print(f"📊 Selected {len(dioceses_to_process)} dioceses for enhanced processing")

            # Display what we're about to process
            if dioceses_to_process:
                print(f"\n📋 Dioceses to process:")
                for i, diocese in enumerate(dioceses_to_process, 1):
                    existing_count = diocese.get('existing_parish_count', 0)
                    existing_info = f" ({existing_count} existing)" if existing_count > 0 else " (new)"
                    print(f"  {i}. {diocese['name']}{existing_info}")
                    print(f"     Main URL: {diocese['url']}")
                    print(f"     Parish Directory: {diocese['parish_directory_url']}")

            # Display summary of skipped dioceses
            if dioceses_skipped:
                print(f"\n📋 Dioceses skipped (already have 5+ parishes):")
                for diocese in dioceses_skipped[:5]:  # Show first 5
                    print(f"  - {diocese['name']}: {diocese['existing_parish_count']} parishes")
                if len(dioceses_skipped) > 5:
                    print(f"  ... and {len(dioceses_skipped) - 5} more")

    except Exception as e:
        print(f"❌ Error fetching dioceses with parish directories: {e}")
        # Discard any partially built selection so Cell 8 processes nothing
        dioceses_to_process = []
else:
    print("❌ No Supabase connection, using test data")
    dioceses_to_process = []

print("✅ Enhanced processing function loaded (MODIFIED to exclude dioceses with 5+ existing parishes)")

📥 Fetching dioceses with parish directory URLs from database...
📊 Found 192 dioceses with parish directory URLs
🔍 Checking for dioceses that already have 5+ parishes extracted...
  ⏭️ SKIPPED Diocese of Salt Lake City: 79 parishes already extracted
  ✅ FIRST TO PROCESS: Eparchy of Parma (0 existing parishes)

📊 FILTERING RESULTS:
  - Dioceses to process: 191
  - Dioceses skipped (5+ parishes): 1
📊 Taking first 1 dioceses from 191 available
📊 Selected 1 dioceses for enhanced processing

📋 Dioceses to process:
  1. Eparchy of Parma (new)
     Main URL: http://www.parma.org/
     Parish Directory: https://www.parma.org/parishfinder

📋 Dioceses skipped (already have 5+ parishes):
  - Diocese of Salt Lake City: 79 parishes
✅ Enhanced processing function loaded (MODIFIED to exclude dioceses with 5+ existing parishes)


In [67]:
# =============================================================================
# CELL 8: Execute Enhanced Processing with Detail Extraction (UPDATED)
# =============================================================================

if dioceses_to_process:
    print(f"\n🚀 Starting ENHANCED pattern-based processing with DETAILED parish extraction...")
    print(f"📋 This will click on each parish card to extract complete information:")
    print(f"    📍 Street addresses and zip codes")
    print(f"    📞 Phone numbers")
    print(f"    🌐 Parish websites")
    print(f"    👥 Clergy information")
    print(f"    ⏰ Service times and schedules")

    # Initialize driver
    driver = setup_enhanced_driver()

    # Track enhanced results
    all_results = []
    pattern_confidences = []  # one entry per diocese whose pattern was detected
    summary_stats = {
        'total_dioceses': len(dioceses_to_process),
        'successful_extractions': 0,
        'total_parishes_found': 0,
        'pattern_distribution': {},
        'extraction_method_usage': {},
        'average_confidence': 0.0,
        # Enhanced statistics
        'total_detail_extractions_attempted': 0,
        'total_detail_extractions_successful': 0,
        'overall_detail_success_rate': 0.0,
        'field_extraction_totals': {
            'addresses_extracted': 0,
            'phones_extracted': 0,
            'websites_extracted': 0,
            'zip_codes_extracted': 0,
            'clergy_info_extracted': 0,
            'service_times_extracted': 0
        }
    }

    try:
        for i, diocese_info in enumerate(dioceses_to_process, 1):
            print(f"\n📍 Diocese {i}/{len(dioceses_to_process)}")

            # Process with enhanced detailed extraction system
            result = process_diocese_with_detailed_extraction(diocese_info, driver)
            all_results.append(result)

            # Collect the detector's confidence whenever a pattern was detected,
            # so 'average_confidence' is populated instead of staying at 0.0.
            detected_pattern = result['pattern_detected']
            if detected_pattern and isinstance(detected_pattern.get('confidence'), (int, float)):
                pattern_confidences.append(detected_pattern['confidence'])

            # Update summary statistics
            if result['success']:
                summary_stats['successful_extractions'] += 1
                summary_stats['total_parishes_found'] += len(result['parishes_found'])

                # Enhanced statistics tracking
                detail_stats = result.get('detail_extraction_stats', {})
                summary_stats['total_detail_extractions_attempted'] += detail_stats.get('attempted', 0)
                summary_stats['total_detail_extractions_successful'] += detail_stats.get('successful', 0)

                # Field extraction statistics
                field_stats = result.get('field_extraction_stats', {})
                for field, count in field_stats.items():
                    if field in summary_stats['field_extraction_totals']:
                        summary_stats['field_extraction_totals'][field] += count

                # Track pattern distribution
                if result['pattern_detected']:
                    pattern_key = f"{result['pattern_detected']['platform']}_{result['pattern_detected']['listing_type']}"
                    summary_stats['pattern_distribution'][pattern_key] = summary_stats['pattern_distribution'].get(pattern_key, 0) + 1

                # Track extraction methods
                for method in result['extraction_methods_used']:
                    summary_stats['extraction_method_usage'][method] = summary_stats['extraction_method_usage'].get(method, 0) + 1

            # Be respectful - pause between requests (longer pause due to detail extraction)
            if i < len(dioceses_to_process):
                time.sleep(3)

    finally:
        # Clean up, even if processing was interrupted
        driver.quit()
        print("\n🧹 WebDriver closed")

    # Calculate final enhanced statistics
    if summary_stats['successful_extractions'] > 0:
        summary_stats['success_rate'] = (summary_stats['successful_extractions'] / summary_stats['total_dioceses']) * 100
        summary_stats['avg_parishes_per_diocese'] = summary_stats['total_parishes_found'] / summary_stats['successful_extractions']

    if summary_stats['total_detail_extractions_attempted'] > 0:
        summary_stats['overall_detail_success_rate'] = (summary_stats['total_detail_extractions_successful'] / summary_stats['total_detail_extractions_attempted']) * 100

    if pattern_confidences:
        summary_stats['average_confidence'] = sum(pattern_confidences) / len(pattern_confidences)

    # =============================================================================
    # ENHANCED RESULTS DISPLAY
    # =============================================================================

    print(f"\n{'='*70}")
    print(f"📊 ENHANCED DETAILED EXTRACTION SUMMARY")
    print(f"{'='*70}")
    print(f"Total dioceses processed: {summary_stats['total_dioceses']}")
    print(f"Successful extractions: {summary_stats['successful_extractions']}")
    print(f"Success rate: {summary_stats.get('success_rate', 0):.1f}%")
    print(f"Total parishes found: {summary_stats['total_parishes_found']}")
    if summary_stats['successful_extractions'] > 0:
        print(f"Average parishes per diocese: {summary_stats.get('avg_parishes_per_diocese', 0):.1f}")
    if pattern_confidences:
        print(f"Average pattern confidence: {summary_stats['average_confidence']:.2f}")

    print(f"\n📍 DETAILED EXTRACTION PERFORMANCE:")
    print(f"Detail extractions attempted: {summary_stats['total_detail_extractions_attempted']}")
    print(f"Detail extractions successful: {summary_stats['total_detail_extractions_successful']}")
    print(f"Overall detail success rate: {summary_stats['overall_detail_success_rate']:.1f}%")

    print(f"\n📋 FIELD EXTRACTION TOTALS:")
    field_totals = summary_stats['field_extraction_totals']
    total_parishes = summary_stats['total_parishes_found']
    if total_parishes > 0:
        field_labels = [
            ("📍 Street Addresses", 'addresses_extracted'),
            ("📞 Phone Numbers", 'phones_extracted'),
            ("🌐 Websites", 'websites_extracted'),
            ("📮 Zip Codes", 'zip_codes_extracted'),
            ("👥 Clergy Info", 'clergy_info_extracted'),
            ("⏰ Service Times", 'service_times_extracted'),
        ]
        for field_label, field_key in field_labels:
            extracted = field_totals[field_key]
            print(f"  {field_label}: {extracted}/{total_parishes} ({extracted/total_parishes*100:.1f}%)")

    print(f"\n📈 Pattern Distribution:")
    for pattern, count in summary_stats['pattern_distribution'].items():
        percentage = (count / summary_stats['total_dioceses']) * 100
        print(f"  {pattern.replace('_', ' ').title()}: {count} dioceses ({percentage:.1f}%)")

    print(f"\n🔧 Extraction Method Usage:")
    for method, count in summary_stats['extraction_method_usage'].items():
        enhanced_indicator = "🔥" if "Enhanced" in method else ""
        print(f"  {enhanced_indicator} {method}: {count} times")

    print(f"\n🔍 Detailed Results:")
    for result in all_results:
        status = "✅" if result['success'] else "❌"
        parish_count = len(result['parishes_found'])

        # Detail extraction summary
        detail_stats = result.get('detail_extraction_stats', {})
        detail_success = detail_stats.get('successful', 0)
        detail_total = detail_stats.get('attempted', 0)
        detail_rate = detail_stats.get('success_rate', 0)

        pattern_info = ""
        if result['pattern_detected']:
            pattern_info = f" [{result['pattern_detected']['platform']} / {result['pattern_detected']['listing_type']}]"

        print(f"  {status} {result['diocese_name']}: {parish_count} parishes{pattern_info}")
        print(f"      📊 Detail extraction: {detail_success}/{detail_total} successful ({detail_rate:.1f}%)")
        print(f"      🌐 Main URL: {result['diocese_url']}")
        print(f"      📂 Parish Directory: {result['parish_directory_url']}")

        if result['extraction_methods_used']:
            methods = ', '.join(result['extraction_methods_used'])
            print(f"      🔧 Methods: {methods}")

        # Show field extraction summary for this diocese
        field_stats = result.get('field_extraction_stats', {})
        if any(field_stats.values()):
            field_summary = []
            if field_stats.get('addresses_extracted', 0) > 0:
                field_summary.append(f"📍{field_stats['addresses_extracted']} addresses")
            if field_stats.get('phones_extracted', 0) > 0:
                field_summary.append(f"📞{field_stats['phones_extracted']} phones")
            if field_stats.get('websites_extracted', 0) > 0:
                field_summary.append(f"🌐{field_stats['websites_extracted']} websites")
            if field_stats.get('zip_codes_extracted', 0) > 0:
                field_summary.append(f"📮{field_stats['zip_codes_extracted']} zips")

            if field_summary:
                print(f"      📋 Fields: {', '.join(field_summary[:4])}")

        if result['errors']:
            for error in result['errors']:
                print(f"      ❌ Error: {error[:100]}...")
        print()  # Add blank line between dioceses

    # Save enhanced summary to file; default=str stringifies any value
    # json cannot encode natively (e.g. the parish objects).
    summary_filename = f"enhanced_extraction_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(summary_filename, 'w') as f:
        json.dump({
            'summary_stats': summary_stats,
            'detailed_results': all_results
        }, f, indent=2, default=str)
    print(f"💾 Enhanced detailed results saved to: {summary_filename}")

    # Completion banner: only shown when a run actually happened, and the
    # success line only when at least one diocese yielded parishes.
    print(f"\n🎉 Enhanced pattern-based extraction with DETAILED parish information complete!")
    if summary_stats['successful_extractions'] > 0:
        print(f"📊 Successfully extracted detailed information for parishes including addresses, phones, websites, and more!")
    print(f"{'='*70}")

else:
    print("❌ No dioceses with parish directory URLs found to process")


🚀 Starting ENHANCED pattern-based processing with DETAILED parish extraction...
📋 This will click on each parish card to extract complete information:
    📍 Street addresses and zip codes
    📞 Phone numbers
    🌐 Parish websites
    👥 Clergy information
    ⏰ Service times and schedules
🔧 Setting up enhanced Chrome WebDriver...
✅ Chrome WebDriver initialized successfully

📍 Diocese 1/1

🔍 ENHANCED DETAILED PROCESSING: Eparchy of Parma
📍 Main URL: http://www.parma.org/
📂 Parish Directory URL: https://www.parma.org/parishfinder
  📥 Loading parish directory page...
  🔍 Detecting website pattern...
    📋 Platform: ecatholic
    📊 Listing Type: parish_finder
    🎯 Confidence: 0.95
    ⚙️ Method: parish_finder_extraction
  🔄 Trying ParishFinderExtractor...
    📍 Parish finder interface detected
    📊 Found 31 parish elements using li.site
    ⚠️ ParishFinderExtractor found no parishes
  🔄 Trying EnhancedDiocesesCardExtractor...
    📍 Enhanced diocese card layout detected - extracting with 

In [54]:
    # =============================================================================
    # CELL 9: Display Results and Analysis (UPDATED)
    # =============================================================================

    # Cell 8 only defines summary_stats / all_results when it had dioceses to
    # process, so look them up defensively instead of raising NameError.
    report_stats = globals().get('summary_stats')
    report_results = globals().get('all_results', [])

    if report_stats and report_stats['total_dioceses'] > 0:
        # The summary header is printed once, with a real newline before it
        print(f"\n{'='*70}")
        print(f"📊 ENHANCED EXTRACTION SUMMARY")
        print(f"{'='*70}")
        print(f"Total dioceses processed: {report_stats['total_dioceses']}")
        print(f"Successful extractions: {report_stats['successful_extractions']}")
        print(f"Success rate: {report_stats.get('success_rate', 0):.1f}%")
        print(f"Total parishes found: {report_stats['total_parishes_found']}")
        if report_stats['successful_extractions'] > 0:
            print(f"Average parishes per diocese: {report_stats.get('avg_parishes_per_diocese', 0):.1f}")
        print(f"\n\n📈 Pattern Distribution:")
        for pattern, count in report_stats['pattern_distribution'].items():
            percentage = (count / report_stats['total_dioceses']) * 100
            print(f"  {pattern.replace('_', ' ').title()}: {count} dioceses ({percentage:.1f}%)")
        print(f"\n\n🔧 Extraction Method Usage:")
        for method, count in report_stats['extraction_method_usage'].items():
            print(f"  {method}: {count} times")
        print(f"\n\n🔍 Detailed Results:")
        for result in report_results:
            status = "✅" if result['success'] else "❌"
            parish_count = len(result['parishes_found'])
            pattern_info = ""
            if result['pattern_detected']:
                pattern_info = f" [{result['pattern_detected']['platform']} / {result['pattern_detected']['listing_type']}]"
            print(f"  {status} {result['diocese_name']}: {parish_count} parishes{pattern_info}")
            print(f"      Main URL: {result['diocese_url']}")
            print(f"      Parish Directory: {result['parish_directory_url']}")
            if result['extraction_methods_used']:
                methods = ', '.join(result['extraction_methods_used'])
                print(f"      Methods: {methods}")
            if result['errors']:
                for error in result['errors']:
                    print(f"      Error: {error[:100]}...")
            print()  # Add blank line between dioceses

        # Save summary to file for analysis. Every entry of 'detailed_results'
        # is the result dict itself (its 'errors' key already carries the
        # errors), so the file has one uniform schema.
        summary_filename = f"extraction_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_filename, 'w') as f:
            json.dump({
                'summary_stats': report_stats,
                'detailed_results': list(report_results)
            }, f, indent=2, default=str)
        print(f"💾 Detailed results saved to: {summary_filename}")

    else:
        print("❌ No dioceses with parish directory URLs found to process")

    print(f"\n🎉 Enhanced pattern-based extraction complete!")
    print(f"{'='*70}")

📊 ENHANCED EXTRACTION SUMMARY
Total dioceses processed: 1
Successful extractions: 1
Success rate: 100.0%
Total parishes found: 79

📊 ENHANCED EXTRACTION SUMMARY
Total dioceses processed: 1
Successful extractions: 1
Success rate: 100.0%
Total parishes found: 79
Average parishes per diocese: 79.0


📈 Pattern Distribution:
  Ecatholic Card Grid: 1 dioceses (100.0%)


🔧 Extraction Method Usage:
  EnhancedDiocesesCardExtractor: 1 times


🔍 Detailed Results:
  ✅ Diocese of Salt Lake City: 79 parishes [ecatholic / card_grid]
      Main URL: http://www.utahcatholicdiocese.org
      Parish Directory: http://www.utahcatholicdiocese.org/parishes
      Methods: EnhancedDiocesesCardExtractor

💾 Detailed results saved to: extraction_summary_20250529_005429.json

🎉 Enhanced pattern-based extraction complete!
