<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Enhanced_Pattern_Based_Parish_Extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
# =============================================================================
# CELL 1: Install and Import Dependencies (FIXED)
# =============================================================================

# Install additional dependencies for the enhanced system
!pip install supabase dataclasses-json beautifulsoup4 selenium webdriver-manager tenacity

# Complete imports including missing ones
import os
import time
import json
import random
import sqlite3
import pandas as pd
import subprocess  # FIXED: Added for Chrome installation
import re  # FIXED: Added for pattern detection
from datetime import datetime
from dataclasses import dataclass, asdict
from enum import Enum
from typing import List, Dict, Optional, Any
from urllib.parse import urljoin, urlparse

# Web scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Your existing Supabase and AI imports
from google.colab import userdata
from supabase import create_client, Client

print("✅ All dependencies installed and imported successfully!")

✅ All dependencies installed and imported successfully!


In [29]:
# =============================================================================
# CELL 2: Configuration (Reuse your existing setup)
# =============================================================================

# Reuse your existing configuration logic
print("=== ENHANCED PARISH EXTRACTOR CONFIGURATION ===")

# --- Secrets, read from the Colab secret store via `userdata` ---------------
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')
SUPABASE_URL = userdata.get('SUPABASE_URL')
SUPABASE_KEY = userdata.get('SUPABASE_KEY')

# --- Supabase client: created only when both credentials are truthy, --------
# --- otherwise `supabase` is set to None ------------------------------------
if not (SUPABASE_URL and SUPABASE_KEY):
    print("❌ Supabase credentials not found")
    supabase = None
else:
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    print("✅ Supabase client initialized")

# --- Run-size and feature switches ------------------------------------------
MAX_DIOCESES_TO_PROCESS = 1  # Start small for testing
ENABLE_PATTERN_DETECTION = True
SAVE_DETAILED_LOGS = True

print(f"📊 Will process {MAX_DIOCESES_TO_PROCESS} dioceses with pattern detection")


=== ENHANCED PARISH EXTRACTOR CONFIGURATION ===
✅ Supabase client initialized
📊 Will process 1 dioceses with pattern detection


In [30]:
# =============================================================================
# CELL 2.5: Chrome Installation for Google Colab (FIXED)
# =============================================================================

import subprocess  # This was missing!
import os

def ensure_chrome_installed():
    """Make sure a ``google-chrome`` binary is available, installing it via apt if needed.

    Returns:
        True when ``which google-chrome`` succeeds, or when the install steps
        run and ``google-chrome --version`` then exits with status 0.
        False when that version check reports a non-zero status or when any
        exception is raised along the way (the exception is printed, not
        re-raised).
    """
    # Shell steps executed in order; their exit codes are not inspected, the
    # final `--version` probe is what decides success.
    install_steps = (
        'apt-get update > /dev/null 2>&1',
        'wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null 2>&1',
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list',
        'apt-get update > /dev/null 2>&1',
        'apt-get install -y google-chrome-stable > /dev/null 2>&1',
    )

    try:
        lookup = subprocess.run(['which', 'google-chrome'], capture_output=True, text=True)
        if lookup.returncode == 0:
            print("✅ Chrome is already installed and available.")
            return True

        print("🔧 Chrome not found. Installing Chrome for Selenium...")
        for shell_command in install_steps:
            os.system(shell_command)

        version_check = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
        if version_check.returncode != 0:
            print("❌ Chrome installation may have failed.")
            return False

        print(f"✅ Chrome installed successfully: {version_check.stdout.strip()}")
        return True

    except Exception as err:
        print(f"❌ Error during Chrome installation: {err}")
        return False

# Run the installation check
print("🔧 Checking Chrome installation...")
chrome_ready = ensure_chrome_installed()

# Report the outcome of the install/lookup step.
print(
    "🚀 Ready to proceed with Selenium operations!"
    if chrome_ready
    else "⚠️ You may need to restart the runtime if Chrome installation failed."
)

# Smoke test: only attempted when the step above succeeded. If launching the
# binary raises, the readiness flag is flipped back to False.
if chrome_ready:
    try:
        result = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
    except Exception as e:
        print(f"⚠️ Chrome test failed: {e}")
        chrome_ready = False
    else:
        print(f"📋 Chrome version: {result.stdout.strip()}")

print(f"Final Chrome status: {'✅ Ready' if chrome_ready else '❌ Not Ready'}")

🔧 Checking Chrome installation...
✅ Chrome is already installed and available.
🚀 Ready to proceed with Selenium operations!
📋 Chrome version: Google Chrome 137.0.7151.55
Final Chrome status: ✅ Ready


In [31]:
# =============================================================================
# CELL 2.6: Driver Setup Function (MISSING - NOW ADDED)
# =============================================================================

def setup_enhanced_driver():
    """Create a headless Chrome WebDriver configured for parish extraction.

    The driver is started with Colab-friendly flags (no sandbox, no /dev/shm),
    a fixed window size and user agent, and the automation switches removed.
    A script hiding ``navigator.webdriver`` is registered through the Chrome
    DevTools Protocol so it runs in every document the driver loads, not only
    in the blank page that exists right after start-up.

    Returns:
        The configured ``webdriver.Chrome`` instance, with a 10 s implicit
        wait and a 30 s page-load timeout.

    Raises:
        Exception: whatever the driver download/launch/configuration raised.
            The error is printed first, and a driver that was already started
            is quit so no Chrome process is left behind.
    """
    driver = None

    try:
        print("🔧 Setting up Chrome driver...")

        # Chrome options
        chrome_options = Options()
        for flag in (
            '--headless',  # Run in background
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            '--disable-blink-features=AutomationControlled',
        ):
            chrome_options.add_argument(flag)
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Setup driver
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        # Additional setup to avoid detection. A plain execute_script() would
        # only patch the current document and be lost on the first
        # driver.get(); registering the script via CDP re-applies it to every
        # new document before the page's own scripts run.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )

        # Set timeouts
        driver.implicitly_wait(10)
        driver.set_page_load_timeout(30)

        print("✅ Chrome driver setup complete")
        return driver

    except Exception as e:
        print(f"❌ Failed to setup driver: {e}")
        if driver is not None:
            # Do not leak the browser process when configuration fails after
            # launch. Cleanup is best-effort: the original error is the one
            # the caller needs to see.
            try:
                driver.quit()
            except Exception:
                pass
        raise

print("✅ Driver setup function added")

✅ Driver setup function added


In [32]:
# =============================================================================
# CELL 3: Enhanced Pattern Detection Classes (UPDATED FOR TULSA-STYLE FINDERS)
# =============================================================================

import re  # This was also missing!

class DiocesePlatform(Enum):
    """Label for the CMS / hosting platform a diocese website appears to use.

    Each member's value is a short lowercase string identifier.
    """
    SQUARESPACE = "squarespace"
    WORDPRESS = "wordpress"
    DRUPAL = "drupal"
    CUSTOM_CMS = "custom"  # also what PatternDetector._detect_platform returns when nothing else matches
    STATIC_HTML = "static"  # not returned by the detection code visible in this cell
    ECATHOLIC = "ecatholic"  # eCatholic-hosted sites such as Tulsa
    UNKNOWN = "unknown"  # not returned by the detection code visible in this cell

class ParishListingType(Enum):
    """Label for how a diocese page presents its list of parishes.

    Each member's value is a short lowercase string identifier.
    PatternDetector._detect_listing_type in this cell returns PARISH_FINDER,
    INTERACTIVE_MAP, STATIC_TABLE, CARD_GRID, PAGINATED_LIST or SIMPLE_LIST;
    the remaining members are declared but not produced by that method.
    """
    INTERACTIVE_MAP = "interactive_map"
    STATIC_TABLE = "static_table"
    CARD_GRID = "card_grid"
    SIMPLE_LIST = "simple_list"  # fallback when no other indicator matches
    PAGINATED_LIST = "paginated_list"
    SEARCHABLE_DIRECTORY = "searchable_directory"
    PARISH_FINDER = "parish_finder"  # interactive parish finders (eCatholic / Tulsa style)
    PDF_DIRECTORY = "pdf_directory"
    UNKNOWN = "unknown"

@dataclass
class ParishData:
    """One parish record produced by an extractor.

    Only ``name`` is required; every other field defaults to ``None`` (or to
    the placeholder values below) and is filled in when the extractor can
    find it on the page.
    """
    name: str  # parish name as scraped
    address: Optional[str] = None
    city: Optional[str] = None
    state: Optional[str] = None
    zip_code: Optional[str] = None
    phone: Optional[str] = None
    website: Optional[str] = None  # parish website URL (typically a link's href)
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    pastor: Optional[str] = None
    mass_times: Optional[str] = None  # free-text schedule as shown on the page
    confidence_score: float = 0.5  # extractors in this notebook set values between 0.3 and 0.9
    extraction_method: str = "unknown"  # identifier of the strategy that produced the record
    diocese_url: Optional[str] = None  # not set by the extractors in this cell range — presumably filled in by the caller
    parish_directory_url: Optional[str] = None  # not set by the extractors in this cell range — presumably filled in by the caller

@dataclass
class DioceseSitePattern:
    """Result of analysing a diocese page: what it is and how to scrape it.

    Built by ``PatternDetector.detect_pattern`` and handed to extractor
    constructors (``BaseExtractor.__init__`` stores it as ``self.pattern``).
    """
    platform: DiocesePlatform  # detected CMS / hosting platform
    listing_type: ParishListingType  # detected presentation of the parish list
    confidence_score: float  # detector's confidence in the chosen strategy (0.4 to 0.95 in PatternDetector)
    extraction_method: str  # strategy identifier, e.g. "table_extraction"
    specific_selectors: Dict[str, str]  # role name -> CSS selector suggested for that strategy
    javascript_required: bool  # True when the page HTML mentions a known JS framework / map library
    pagination_pattern: Optional[str] = None  # not populated by PatternDetector.detect_pattern
    notes: str = ""  # human-readable explanation of the choice

class PatternDetector:
    """Detects patterns in diocese websites for targeted extraction.

    All checks are simple substring / element-presence heuristics run against
    the page HTML and URL; nothing is fetched or executed here.
    """

    # Markers that identify an eCatholic-style parish finder. They are compared
    # against lower-cased text, so they must be lower-case themselves.
    _FINDER_URL_MARKERS = ('parishfinder', 'parish-finder')
    _FINDER_HTML_MARKERS = ('findercore', 'finder.js', 'parish finder')

    def detect_pattern(self, html_content: str, url: str) -> DioceseSitePattern:
        """Analyze website content and detect the best extraction pattern.

        Args:
            html_content: Raw HTML of the parish directory page. ``None`` or an
                empty string is treated as an empty document.
            url: URL the HTML was loaded from.

        Returns:
            A ``DioceseSitePattern`` describing platform, listing type,
            suggested extraction method, selectors and confidence.
        """
        html_content = html_content or ""
        soup = BeautifulSoup(html_content, 'html.parser')
        html_lower = html_content.lower()

        # Platform detection
        platform = self._detect_platform(html_lower, url)

        # Listing type detection
        listing_type = self._detect_listing_type(html_lower, soup, url)

        # JavaScript requirement
        js_required = self._requires_javascript(html_lower)

        # Determine extraction method and confidence
        extraction_method, confidence, selectors, notes = self._determine_extraction_strategy(
            platform, listing_type, soup, html_lower, url
        )

        return DioceseSitePattern(
            platform=platform,
            listing_type=listing_type,
            confidence_score=confidence,
            extraction_method=extraction_method,
            specific_selectors=selectors,
            javascript_required=js_required,
            notes=notes
        )

    def _detect_platform(self, html_lower: str, url: str) -> DiocesePlatform:
        """Detect CMS/platform from the URL host and tell-tale strings in the HTML.

        The first matching rule wins; ``CUSTOM_CMS`` is the fallback.
        """
        # Host names are case-insensitive, so compare the URL in lower case.
        if 'ecatholic.com' in url.lower() or 'ecatholic' in html_lower:
            return DiocesePlatform.ECATHOLIC
        elif 'squarespace' in html_lower:
            return DiocesePlatform.SQUARESPACE
        elif 'wp-content' in html_lower or 'wordpress' in html_lower:
            return DiocesePlatform.WORDPRESS
        elif 'drupal' in html_lower:
            return DiocesePlatform.DRUPAL
        else:
            return DiocesePlatform.CUSTOM_CMS

    def _detect_listing_type(self, html_lower: str, soup: BeautifulSoup, url: str) -> ParishListingType:
        """Detect how parishes are listed.

        Rules are evaluated in priority order (parish finder, map, table,
        card grid, pagination); ``SIMPLE_LIST`` is the fallback.
        """
        url_lower = url.lower()

        # Check for eCatholic parish finder pattern (like Tulsa).
        # `html_lower` is already lower-cased, so the markers must be too:
        # a mixed-case needle such as 'finderCore' could never match.
        if (any(marker in url_lower for marker in self._FINDER_URL_MARKERS) or
                any(marker in html_lower for marker in self._FINDER_HTML_MARKERS)):
            return ParishListingType.PARISH_FINDER

        # Interactive map indicators
        map_indicators = ['leaflet', 'google.maps', 'mapbox', 'parish-map', 'interactive']
        if any(indicator in html_lower for indicator in map_indicators):
            return ParishListingType.INTERACTIVE_MAP

        # Table indicators
        if soup.find('table') and ('parish' in html_lower or 'church' in html_lower):
            return ParishListingType.STATIC_TABLE

        # Card/grid layout
        if soup.find_all(class_=re.compile(r'(card|grid|parish-item)', re.I)):
            return ParishListingType.CARD_GRID

        # Pagination
        if any(word in html_lower for word in ['pagination', 'page-numbers', 'next-page']):
            return ParishListingType.PAGINATED_LIST

        return ParishListingType.SIMPLE_LIST

    def _requires_javascript(self, html_lower: str) -> bool:
        """Check if JavaScript is required.

        This is a plain substring test, so it reports True whenever any of the
        indicator words appears anywhere in the (lower-cased) HTML.
        """
        js_indicators = ['react', 'angular', 'vue', 'leaflet', 'google.maps', 'ajax', 'finder.js']
        return any(indicator in html_lower for indicator in js_indicators)

    def _determine_extraction_strategy(self, platform, listing_type, soup, html_lower, url):
        """Determine the best extraction strategy.

        Returns:
            A 4-tuple ``(extraction_method, confidence, selectors, notes)``.
            The listing type is considered first; the platform is only
            consulted (SquareSpace) when the listing type gives no specific
            strategy. ``soup``, ``html_lower`` and ``url`` are accepted for
            interface symmetry but are not inspected here.
        """

        if listing_type == ParishListingType.PARISH_FINDER:
            return (
                "parish_finder_extraction",
                0.95,
                {
                    "parish_list": ".site, li.site",
                    "parish_name": ".name",
                    "parish_city": ".city",
                    "parish_info": ".siteInfo",
                    "parish_details": ".details"
                },
                "Parish finder interface detected - specialized extraction for interactive directory"
            )

        elif listing_type == ParishListingType.INTERACTIVE_MAP:
            return (
                "interactive_map_extraction",
                0.9,
                {"map_container": "#map, .map-container, .parish-map"},
                "Interactive map detected - will extract from JS data and markers"
            )

        elif listing_type == ParishListingType.STATIC_TABLE:
            return (
                "table_extraction",
                0.95,
                {"table": "table", "rows": "tr:not(:first-child)"},
                "HTML table detected - most reliable extraction method"
            )

        elif platform == DiocesePlatform.SQUARESPACE:
            return (
                "squarespace_extraction",
                0.8,
                {"items": ".summary-item, .parish-item", "title": ".summary-title"},
                "SquareSpace platform - using platform-specific selectors"
            )

        else:
            return (
                "generic_extraction",
                0.4,
                {"containers": "[class*='parish'], [class*='church']"},
                "Using generic extraction patterns"
            )

print("✅ Pattern detection classes loaded (UPDATED with parish finder support)")

✅ Pattern detection classes loaded (UPDATED with parish finder support)


In [33]:
# =============================================================================
# CELL 4: Enhanced Extraction Classes (COMPLETE WITH NEW PARISH FINDER EXTRACTOR)
# =============================================================================

class BaseExtractor:
    """Base class for all parish extractors.

    Stores the detected site pattern and provides small text helpers shared by
    the concrete extractors. Type annotations that refer to other notebook
    definitions are quoted so they are not evaluated when the class is
    defined.
    """

    # North-American phone number: optional parentheses around the area code,
    # then 3 + 4 digits, each group optionally separated by "-", "." or
    # whitespace. (The separator class must use \s; a bare "s" would only
    # match the letter and reject "(918) 555-1234".)
    _PHONE_RE = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')

    def __init__(self, pattern: "DioceseSitePattern"):
        """Remember the site pattern this extractor was selected for."""
        self.pattern = pattern

    def extract(self, driver, soup: "BeautifulSoup", url: str) -> "List[ParishData]":
        """Return the parishes found on the page. Override in subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def clean_text(self, text: str) -> str:
        """Trim ``text`` and collapse every run of whitespace to one space.

        Returns an empty string for ``None`` or empty input.
        """
        if not text:
            return ""
        return ' '.join(text.split())

    def extract_phone(self, text: str) -> Optional[str]:
        """Return the first phone number found in ``text``, or ``None``.

        ``None`` or empty input yields ``None`` instead of raising.
        """
        if not text:
            return None
        match = self._PHONE_RE.search(text)
        return match.group() if match else None

class ParishFinderExtractor(BaseExtractor):
    """Extract from eCatholic-style parish finder interfaces (like Tulsa Diocese).

    Strategy: wait for the finder container, read the rendered list of
    ``.site`` entries (clicking each one to reveal its details panel) and, if
    that yields nothing, fall back to list-valued JavaScript globals on
    ``window``. Every step is best-effort: failures are logged or skipped and
    whatever was collected so far is returned.
    """

    # Name fragments marking finder entries that are not parishes. Shared by
    # the DOM path and the JavaScript path so both filter identically.
    _SKIP_TERMS = (
        'no parish registration', 'contact', 'chancery', 'pastoral center',
        'tv mass', 'directory', 'finder', 'diocese', 'bishop', 'office',
    )

    # Selectors tried in order for the per-parish list entries; the first one
    # that returns any element is used.
    _SITE_SELECTORS = ("li.site", ".site", "[class*='site']", "#categories .site")

    # Common global variable names for parish finder data.
    _JS_VARIABLES = (
        "parishData", "parishes", "sites", "locations",
        "finderData", "churchData", "mapData",
    )

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Return the parishes listed by the finder currently loaded in ``driver``.

        Any exception (including the wait timing out) is logged and the
        parishes gathered up to that point are returned.
        """
        parishes = []

        try:
            print("    📍 Parish finder interface detected")

            # Wait for the parish finder to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#finder, .finder, #finderCore"))
            )
            time.sleep(2)  # Give extra time for dynamic content

            # Method 1: Extract from site list elements (most reliable for Tulsa-style)
            parishes.extend(self._extract_from_site_list(driver, soup))

            # Method 2: If no parishes found, try extracting from JavaScript data
            if not parishes:
                parishes.extend(self._extract_from_parish_js_data(driver))

        except Exception as e:
            print(f"    ⚠️ Parish finder extraction info: {str(e)[:100]}...")

        return parishes

    def _extract_from_site_list(self, driver, soup: BeautifulSoup) -> List[ParishData]:
        """Extract parishes from the site list in parish finder.

        ``soup`` is accepted for interface symmetry; the live DOM in
        ``driver`` is what gets read.
        """
        parishes = []

        try:
            # Look for parish site elements - these contain the actual parish data
            parish_elements = []
            for selector in self._SITE_SELECTORS:
                try:
                    parish_elements = driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception:
                    continue
                if parish_elements:
                    print(f"    📊 Found {len(parish_elements)} parish elements using {selector}")
                    break

            if not parish_elements:
                print("    ℹ️ No parish site elements found")
                return parishes

            for element in parish_elements:
                try:
                    parish_data = self._extract_parish_from_site_element(element, driver)
                except Exception:
                    # One unreadable entry must not abort the whole list.
                    continue
                if parish_data:
                    parishes.append(parish_data)

        except Exception as e:
            print(f"    ℹ️ Site list extraction info: {str(e)[:50]}...")

        return parishes

    def _extract_parish_from_site_element(self, element, driver) -> Optional[ParishData]:
        """Extract parish data from a single site element.

        Returns ``None`` when the entry has no usable name, is a known
        non-parish entry, or cannot be read at all.
        """
        try:
            # Get basic info that's visible
            name = element.find_element(By.CSS_SELECTOR, ".name").text.strip()
            if len(name) < 3 or self._is_non_parish(name):
                return None

            city = self._child_text(element, ".city")

            # Coordinates live on the list entry itself, so read them
            # independently of whether the details panel can be opened.
            latitude, longitude = self._read_coordinates(element)

            details = self._read_site_details(element, driver)

            return ParishData(
                name=name,
                city=city,
                address=details["address"],
                phone=details["phone"],
                website=details["website"],
                pastor=details["pastor"],
                mass_times=details["mass_times"],
                latitude=latitude,
                longitude=longitude,
                confidence_score=0.9,  # High confidence for parish finder extraction
                extraction_method="parish_finder_extraction"
            )

        except Exception:
            return None

    def _is_non_parish(self, name: str) -> bool:
        """True when ``name`` contains one of the non-parish skip terms."""
        lowered = name.lower()
        return any(term in lowered for term in self._SKIP_TERMS)

    def _read_site_details(self, element, driver) -> Dict[str, Optional[str]]:
        """Click a list entry and read address/phone/website/pastor/mass times.

        Returns a dict with all five keys; a value is ``None`` when it could
        not be found. If clicking fails or no ``.siteInfo`` panel appears, all
        values are ``None``.
        """
        details = dict.fromkeys(("address", "phone", "website", "pastor", "mass_times"))

        try:
            # Click the element to open details
            driver.execute_script("arguments[0].click();", element)
            time.sleep(1)
            # Check for siteInfo popup/details
            site_info = driver.find_element(By.CSS_SELECTOR, ".siteInfo")
        except Exception:
            return details  # Details unavailable, continue with basic info

        details["address"] = self._child_text(site_info, ".address")
        details["phone"] = self._read_phone(site_info)
        details["website"] = self._read_website(site_info)
        details["pastor"] = self._read_pastor(site_info)
        details["mass_times"] = self._read_mass_times(site_info)
        return details

    @staticmethod
    def _child_text(parent, css: str) -> Optional[str]:
        """Stripped text of the first descendant matching ``css``, else ``None``."""
        try:
            return parent.find_element(By.CSS_SELECTOR, css).text.strip() or None
        except Exception:
            return None

    def _read_phone(self, site_info) -> Optional[str]:
        """Phone from a ``tel:`` link if present, otherwise from the panel text."""
        try:
            phone_elem = site_info.find_element(By.CSS_SELECTOR, ".phoneLink, a[href^='tel:']")
            phone_href = phone_elem.get_attribute('href')
            if phone_href and phone_href.startswith('tel:'):
                return phone_href.replace('tel:', '').strip()
            return phone_elem.text.strip()
        except Exception:
            pass

        # No phone link: look for a "P: (xxx) xxx-xxxx" line, then for any
        # phone-shaped text via the shared base-class helper.
        try:
            phone_text = site_info.text
        except Exception:
            return None
        phone_match = re.search(r'P:\s*(\([0-9]{3}\)\s*[0-9]{3}-[0-9]{4})', phone_text)
        if phone_match:
            return phone_match.group(1)
        return self.extract_phone(phone_text)

    @staticmethod
    def _read_website(site_info) -> Optional[str]:
        """``href`` of the parish website link in the details panel, if any."""
        try:
            website_elem = site_info.find_element(By.CSS_SELECTOR, ".urlLink, a[class*='url']")
            return website_elem.get_attribute('href')
        except Exception:
            return None

    @staticmethod
    def _read_pastor(site_info) -> Optional[str]:
        """Text following "Pastor" on the same line of the ``.details`` section."""
        try:
            details_text = site_info.find_element(By.CSS_SELECTOR, ".details").text
        except Exception:
            return None
        pastor_match = re.search(r'Pastor[:\s]+([^\n\r]+)', details_text)
        return pastor_match.group(1).strip() if pastor_match else None

    @staticmethod
    def _read_mass_times(site_info) -> Optional[str]:
        """Text of ``.massTimes`` inside the ``.times`` tab, if present."""
        try:
            times_tab = site_info.find_element(By.CSS_SELECTOR, ".times")
            return times_tab.find_element(By.CSS_SELECTOR, ".massTimes").text.strip()
        except Exception:
            return None

    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Convert ``value`` to float; ``None`` for missing, zero or non-numeric input.

        Zero is treated as "no coordinate": the list entries use '0.0' as a
        placeholder and the JavaScript path likewise ignored falsy values.
        """
        if value is None or value == "":
            return None
        try:
            return float(value) or None
        except (TypeError, ValueError):
            return None

    def _read_coordinates(self, element):
        """Return ``(latitude, longitude)`` from the entry's data attributes.

        Both attributes must be present; otherwise ``(None, None)``.
        """
        try:
            lat_attr = element.get_attribute('data-latitude')
            lng_attr = element.get_attribute('data-longitude')
        except Exception:
            return None, None
        if not (lat_attr and lng_attr):
            return None, None
        return self._to_float(lat_attr), self._to_float(lng_attr)

    def _extract_from_parish_js_data(self, driver) -> List[ParishData]:
        """Fallback: Extract from JavaScript parish data.

        Tries each known global name in turn and stops at the first one that
        is a list and yields at least one parish.
        """
        parishes = []

        try:
            for var_name in self._JS_VARIABLES:
                try:
                    js_data = driver.execute_script(
                        f"try {{ return window.{var_name}; }} catch (e) {{ return null; }}"
                    )
                except Exception:
                    continue

                if not js_data or not isinstance(js_data, list):
                    continue

                print(f"    📊 Found JS data in window.{var_name}: {len(js_data)} items")

                for item in js_data:
                    parish = self._parse_finder_js_object(item)
                    if parish:
                        parishes.append(parish)

                if parishes:
                    break

        except Exception as e:
            print(f"    ℹ️ JS data extraction info: {str(e)[:50]}...")

        return parishes

    @staticmethod
    def _first_value(data: Dict[str, Any], keys) -> Optional[str]:
        """First non-empty value among ``keys`` in ``data``, as a stripped string."""
        for key in keys:
            value = data.get(key)
            if value:
                text = str(value).strip()
                if text:
                    return text
        return None

    def _parse_finder_js_object(self, data: Dict) -> Optional[ParishData]:
        """Parse parish data from parish finder JavaScript object.

        Returns ``None`` for non-dict items, items without a name of at least
        three characters, and known non-parish entries. Malformed coordinate
        values are dropped rather than failing the record.
        """
        if not isinstance(data, dict):
            return None

        # Extract name
        name = self._first_value(data, ('name', 'title', 'parishName', 'site_name', 'Name'))
        if not name or len(name) < 3:
            return None

        # Skip non-parish entries
        if self._is_non_parish(name):
            return None

        # Extract other fields with fallbacks
        return ParishData(
            name=name,
            city=self._first_value(data, ('city', 'City')),
            address=self._first_value(data, ('address', 'location', 'street_address', 'addr')),
            phone=self._first_value(data, ('phone', 'telephone', 'Phone')),
            website=self._first_value(data, ('website', 'url', 'Website')),
            latitude=self._to_float(data.get('latitude', data.get('lat'))),
            longitude=self._to_float(data.get('longitude', data.get('lng', data.get('lon')))),
            confidence_score=0.8,
            extraction_method="parish_finder_js_extraction"
        )

class InteractiveMapExtractor(BaseExtractor):
    """Extract from JavaScript-powered maps.

    Waits for a map container, then reads parish data from well-known global
    JavaScript variables; if that yields nothing, clicks a limited number of
    map markers and reads the popup that appears.
    """

    # Common variable names dioceses use
    _JS_VARIABLES = ("parishes", "parishData", "locations", "markers", "churchData")

    # Leaflet renders markers as `.leaflet-marker-icon` and popup bodies as
    # `.leaflet-popup-content`; a bare `.leaflet-marker` class selector does
    # not match those elements, so the real class names are listed as well.
    _MARKER_SELECTOR = ".marker, .leaflet-marker, .leaflet-marker-icon"
    _POPUP_SELECTOR = ".popup, .info-window, .leaflet-popup-content"

    # Upper bound on marker clicks (each click sleeps and may hit the implicit wait).
    _MAX_MARKER_CLICKS = 10

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Return parishes found via JS data or, failing that, map markers.

        Any exception (including the map wait timing out) is logged and the
        parishes gathered so far are returned.
        """
        parishes = []

        try:
            # Wait for map to load
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#map, .map, .parish-map"))
            )

            # Method 1: Extract from JavaScript variables
            parishes.extend(self._extract_from_js_variables(driver))

            # Method 2: Extract from map markers (if JS method failed)
            if not parishes:
                parishes.extend(self._extract_from_markers(driver))

        except Exception as e:
            print(f"    ⚠️ Map extraction failed: {e}")

        return parishes

    def _extract_from_js_variables(self, driver) -> List[ParishData]:
        """Extract from common JavaScript variable names.

        Stops at the first variable that actually yields parishes. A variable
        that is a list but contains nothing parseable (for example raw marker
        objects) no longer ends the search.
        """
        parishes = []

        for var_name in self._JS_VARIABLES:
            try:
                js_data = driver.execute_script(f"return window.{var_name};")
            except Exception:
                continue

            if not js_data or not isinstance(js_data, list):
                continue

            for item in js_data:
                parish = self._parse_js_parish_object(item)
                if parish:
                    parishes.append(parish)

            if parishes:
                break

        return parishes

    @staticmethod
    def _first_text(data: Dict[str, Any], keys) -> Optional[str]:
        """First non-empty string (or number, stringified) among ``keys``.

        Nested objects and lists are skipped so that, for example, a
        ``location`` dict is not stored as an address.
        """
        for key in keys:
            value = data.get(key)
            if isinstance(value, str):
                text = value.strip()
            elif isinstance(value, (int, float)) and not isinstance(value, bool):
                text = str(value)
            else:
                continue
            if text:
                return text
        return None

    @staticmethod
    def _first_float(data: Dict[str, Any], keys) -> Optional[float]:
        """First value among ``keys`` that converts to float, else ``None``."""
        for key in keys:
            value = data.get(key)
            if value is None or value == "" or isinstance(value, bool):
                continue
            try:
                return float(value)
            except (TypeError, ValueError):
                continue
        return None

    def _parse_js_parish_object(self, data: Dict) -> Optional[ParishData]:
        """Parse parish data from JavaScript object.

        Returns ``None`` for non-dict items and items without a name.
        """
        if not isinstance(data, dict):
            return None

        # Find name using common field names
        name = self._first_text(data, ('name', 'title', 'parishName', 'churchName'))
        if not name:
            return None

        return ParishData(
            name=name,
            address=self._first_text(data, ('address', 'location')),
            phone=self._first_text(data, ('phone', 'telephone')),
            website=self._first_text(data, ('website', 'url')),
            latitude=self._first_float(data, ('lat', 'latitude')),
            longitude=self._first_float(data, ('lng', 'lon', 'longitude')),
            confidence_score=0.8,
            extraction_method="js_variable_extraction"
        )

    def _extract_from_markers(self, driver) -> List[ParishData]:
        """Extract by clicking map markers.

        At most ``_MAX_MARKER_CLICKS`` markers are clicked. The first line of
        each popup is taken as the parish name; a popup that repeats a name
        already seen (for example because a click did not change it) is
        ignored.
        """
        parishes = []
        seen_names = set()

        try:
            markers = driver.find_elements(By.CSS_SELECTOR, self._MARKER_SELECTOR)
        except Exception:
            return parishes

        for marker in markers[:self._MAX_MARKER_CLICKS]:  # Limit to avoid timeouts
            try:
                driver.execute_script("arguments[0].click();", marker)
                time.sleep(1)

                # Look for popup content
                popup = driver.find_element(By.CSS_SELECTOR, self._POPUP_SELECTOR)
                text = popup.text
            except Exception:
                continue

            if not text or len(text) <= 10:
                continue

            name = text.split('\n')[0].strip()  # First line usually name
            if not name or name in seen_names:
                continue
            seen_names.add(name)

            parishes.append(ParishData(
                name=name,
                confidence_score=0.6,
                extraction_method="marker_click_extraction"
            ))

        return parishes

class TableExtractor(BaseExtractor):
    """Extract from HTML tables.

    Works purely on the parsed ``soup``; the first row of each qualifying
    table is treated as its header and used to locate the columns.
    """

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Return parishes from every table on the page that looks like parish data."""
        parishes = []

        # Find tables that contain parish data
        for table in soup.find_all('table'):
            if self._is_parish_table(table):
                parishes.extend(self._extract_from_table(table))

        return parishes

    def _is_parish_table(self, table) -> bool:
        """Check if table contains parish data.

        True when at least two of the indicator words appear in its text.
        """
        text = table.get_text().lower()
        indicators = ['parish', 'church', 'address', 'phone']
        return sum(1 for indicator in indicators if indicator in text) >= 2

    def _extract_from_table(self, table) -> List[ParishData]:
        """Extract parishes from table.

        Tables with fewer than two rows (no data below the header) yield
        nothing.
        """
        parishes = []
        rows = table.find_all('tr')

        if len(rows) < 2:
            return parishes

        # Analyze header row to map columns
        headers = [cell.get_text().strip().lower() for cell in rows[0].find_all(['th', 'td'])]
        column_map = self._map_table_columns(headers)

        # Extract data from each row
        for row in rows[1:]:
            cells = row.find_all(['td', 'th'])
            parish = self._extract_parish_from_row(cells, column_map)
            if parish:
                parishes.append(parish)

        return parishes

    def _map_table_columns(self, headers: List[str]) -> Dict[str, int]:
        """Map table columns to data fields.

        Args:
            headers: Lower-cased header cell texts, in column order.

        Returns:
            Mapping of field name (``name``, ``address``, ``phone``,
            ``website``) to column index. Specific fields are recognised
            before the generic name keywords, so a header such as
            "Parish Address" or "Parish Website" maps to that field instead of
            overwriting the name column, and the first column matching a
            field keeps it.
        """
        mapping = {}

        for i, header in enumerate(headers):
            if 'email' in header or 'e-mail' in header:
                # "Email address" is not a postal address.
                continue
            elif 'address' in header:
                field = 'address'
            elif 'phone' in header:
                field = 'phone'
            elif 'web' in header:  # also covers "website"
                field = 'website'
            elif any(word in header for word in ['name', 'parish', 'church']):
                field = 'name'
            else:
                continue

            mapping.setdefault(field, i)

        return mapping

    def _extract_parish_from_row(self, cells, column_map: Dict[str, int]) -> Optional[ParishData]:
        """Extract parish data from table row.

        Returns ``None`` when the row is empty, no name column is known, the
        row is too short to contain the name column, or the name is shorter
        than three characters.
        """
        if not cells or 'name' not in column_map:
            return None

        def cell_for(field):
            # The cell holding `field`, or None when the column is unknown
            # or this row is too short to have it.
            index = column_map.get(field)
            if index is None or index >= len(cells):
                return None
            return cells[index]

        name_cell = cell_for('name')
        if name_cell is None:
            return None

        name = self.clean_text(name_cell.get_text())
        if not name or len(name) < 3:
            return None

        # Extract other fields
        address_cell = cell_for('address')
        address = self.clean_text(address_cell.get_text()) if address_cell is not None else None

        phone_cell = cell_for('phone')
        phone = self.extract_phone(phone_cell.get_text()) if phone_cell is not None else None

        website = None
        website_cell = cell_for('website')
        if website_cell is not None:
            link = website_cell.find('a')
            if link:
                website = link.get('href')

        return ParishData(
            name=name,
            address=address,
            phone=phone,
            website=website,
            confidence_score=0.9,  # Tables are very reliable
            extraction_method="table_extraction"
        )

class GenericExtractor(BaseExtractor):
    """Last-resort extractor for pages that match no specific pattern."""

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Collect elements whose text resembles a parish name.

        Each CSS selector is queried in turn and at most its first 20 hits are
        examined. Every hit whose stripped text passes _looks_like_parish_name
        becomes a low-confidence (0.3) ParishData carrying only a name.
        """
        candidate_selectors = (
            "[class*='parish']",
            "[class*='church']",
            "[class*='location']",
            "h2, h3, h4",  # Headers that might be parish names
        )

        found = []
        for css in candidate_selectors:
            # Cap each selector at 20 hits to avoid noise
            for node in soup.select(css)[:20]:
                label = node.get_text().strip()
                if not self._looks_like_parish_name(label):
                    continue
                found.append(ParishData(
                    name=label,
                    confidence_score=0.3,
                    extraction_method="generic_extraction"
                ))

        return found

    def _looks_like_parish_name(self, text: str) -> bool:
        """Return True when text is 5-100 characters long and contains a parish keyword."""
        if not text:
            return False
        if not 5 <= len(text) <= 100:
            return False

        lowered = text.lower()
        for marker in ('parish', 'church', 'st.', 'saint', 'our lady', 'holy'):
            if marker in lowered:
                return True
        return False

# Cell-load confirmation: printed once the class definitions above have executed.
print("✅ All extraction classes loaded (including new ParishFinderExtractor)")

✅ All extraction classes loaded (including new ParishFinderExtractor)


In [34]:
# =============================================================================
# CELL 4.5: Improved Map Extractor (INSERT AFTER CELL 4)
# =============================================================================

class ImprovedInteractiveMapExtractor(BaseExtractor):
    """Extractor for JavaScript-powered parish maps.

    extract() tries three strategies and stops at the first one that yields
    parishes:
      1. list-valued globals on ``window`` (works even without a visible map),
      2. JSON arrays assigned inside inline <script> tags,
      3. clicking map markers and parsing the popup text (only when a map
         container was located on the page).
    """

    def extract(self, driver, soup: BeautifulSoup, url: str) -> List[ParishData]:
        """Run the extraction strategies in order.

        Args:
            driver: Selenium WebDriver with the directory page already loaded.
            soup: BeautifulSoup parse of that page's HTML.
            url: Page URL (not used by this extractor).

        Returns:
            List of ParishData; empty when nothing was found. Exceptions are
            printed as informational messages rather than raised.
        """
        parishes = []

        try:
            # Try to find map containers with more flexible selectors
            map_selectors = [
                "#map", ".map", ".parish-map", ".church-map",
                "[id*='map']", "[class*='map']",
                "#parish-finder", ".parish-finder"
            ]

            map_found = False
            for selector in map_selectors:
                try:
                    WebDriverWait(driver, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selector))
                    )
                    map_found = True
                    print(f"    📍 Found map container: {selector}")
                    break
                except Exception:
                    # Timeout or driver error for this selector: try the next.
                    # `Exception` (not a bare except) lets Ctrl-C interrupt the
                    # up-to-5-second wait per selector.
                    continue

            if not map_found:
                print("    ℹ️ No map container found, trying direct JS extraction...")

            # Method 1: Extract from JavaScript variables (works even without visible map)
            parishes.extend(self._extract_from_js_variables(driver))

            # Method 2: Look for parish data in script tags
            if not parishes:
                parishes.extend(self._extract_from_script_tags(soup))

            # Method 3: Extract from map markers (only if map found)
            if not parishes and map_found:
                parishes.extend(self._extract_from_markers(driver))

        except Exception as e:
            print(f"    ℹ️ Map extraction completed with info: {str(e)[:100]}...")

        return parishes

    def _extract_from_script_tags(self, soup: BeautifulSoup) -> List[ParishData]:
        """Extract parish data from inline <script> tags containing JSON arrays.

        Only scripts mentioning a parish-related keyword are scanned. Within a
        script, array literals assigned to parishes/locations/markers/churches
        are parsed with json.loads; candidates that are not valid JSON are
        skipped. Scanning stops at the first pattern that produced parishes.
        """
        parishes = []

        keywords = ('parish', 'church', 'location', 'marker')
        # Non-greedy capture ends at the first ']', so arrays that themselves
        # contain ']' are truncated and then rejected by json.loads.
        patterns = [
            r'parishes\s*[:=]\s*(\[.*?\])',
            r'locations\s*[:=]\s*(\[.*?\])',
            r'markers\s*[:=]\s*(\[.*?\])',
            r'churches\s*[:=]\s*(\[.*?\])'
        ]

        try:
            for script in soup.find_all('script'):
                script_content = script.string
                if not script_content:
                    continue

                lowered = script_content.lower()
                if not any(keyword in lowered for keyword in keywords):
                    continue

                for pattern in patterns:
                    for match in re.findall(pattern, script_content, re.DOTALL):
                        try:
                            data = json.loads(match)
                        except ValueError:
                            # Not valid JSON (e.g. a JS literal with unquoted keys)
                            continue

                        if isinstance(data, list):
                            for item in data:
                                parish = self._parse_js_parish_object(item)
                                if parish:
                                    parishes.append(parish)

                    if parishes:
                        break

                if parishes:
                    break

        except Exception as e:
            print(f"    ℹ️ Script tag extraction info: {str(e)[:50]}...")

        return parishes

    def _extract_from_js_variables(self, driver) -> List[ParishData]:
        """Extract parishes from list-valued globals on ``window``.

        Each candidate variable name is read through execute_script. The first
        variable whose list yields at least one parish ends the search.
        """
        parishes = []

        # Expanded list of common variable names
        js_vars = [
            "parishes", "parishData", "locations", "markers", "churchData",
            "parishList", "churches", "mapData", "data", "items",
            "parishInfo", "churchInfo", "mapMarkers", "points"
        ]

        for var_name in js_vars:
            try:
                js_data = driver.execute_script(f"""
                    try {{
                        return window.{var_name};
                    }} catch(e) {{
                        return null;
                    }}
                """)

                if js_data and isinstance(js_data, list) and len(js_data) > 0:
                    print(f"    📊 Found data in window.{var_name}: {len(js_data)} items")

                    for item in js_data:
                        parish = self._parse_js_parish_object(item)
                        if parish:
                            parishes.append(parish)

                    if parishes:
                        break

            except Exception:
                # Driver could not evaluate or serialise this variable: try the next
                continue

        return parishes

    @staticmethod
    def _first_text(data: Dict, fields) -> Optional[str]:
        """Return the stripped string of the first truthy value among fields, else None."""
        for field in fields:
            value = data.get(field)
            if value:
                return str(value).strip()
        return None

    @staticmethod
    def _first_coordinate(data: Dict, fields) -> Optional[float]:
        """Return the first value among fields that converts to float, else None.

        Falsy values (None, "", 0) are treated as missing. Values that cannot
        be converted (e.g. "N/A" or a nested object) are skipped instead of
        raising, so one bad coordinate no longer aborts the whole dataset.
        """
        for field in fields:
            raw = data.get(field)
            if not raw:
                continue
            try:
                return float(raw)
            except (TypeError, ValueError):
                continue
        return None

    def _parse_js_parish_object(self, data: Dict) -> Optional[ParishData]:
        """Convert one JavaScript/JSON object into a ParishData.

        Returns None when data is not a dict, no name of at least 3 characters
        is found, or the name contains a navigation word such as 'finder' or
        'search'. Malformed coordinates are dropped; the record is kept.
        """
        if not isinstance(data, dict):
            return None

        # Enhanced field mapping for name
        name = self._first_text(data, (
            'name', 'title', 'parishName', 'churchName', 'parish_name',
            'church_name', 'label', 'text', 'Name', 'Title'))

        if not name or len(name) < 3:
            return None

        # Skip non-parish entries
        lowered_name = name.lower()
        if any(skip_word in lowered_name for skip_word in
               ('finder', 'directory', 'map', 'search', 'filter')):
            return None

        # Enhanced field mapping for other data
        address = self._first_text(data, ('address', 'location', 'fullAddress', 'street', 'addr'))
        phone = self._first_text(data, ('phone', 'telephone', 'phoneNumber', 'tel', 'Phone'))
        website = self._first_text(data, ('website', 'url', 'link', 'web', 'Website', 'URL'))

        # Coordinates
        latitude = self._first_coordinate(data, ('lat', 'latitude', 'Lat'))
        longitude = self._first_coordinate(data, ('lng', 'longitude', 'lon', 'Lng'))

        return ParishData(
            name=name,
            address=address,
            phone=phone,
            website=website,
            latitude=latitude,
            longitude=longitude,
            confidence_score=0.8,
            extraction_method="improved_js_extraction"
        )

    def _extract_from_markers(self, driver) -> List[ParishData]:
        """Click up to five map markers and parse the popup each one opens.

        The first marker selector that matches any element supplies the marker
        list. Failures on an individual marker are skipped.
        """
        parishes = []

        # More flexible marker selectors
        marker_selectors = [
            ".marker", ".leaflet-marker", ".map-marker",
            "[class*='marker']", ".gm-style-iw", ".mapboxgl-marker"
        ]
        # Popup containers checked after each click
        popup_selectors = [
            ".popup", ".info-window", ".mapboxgl-popup",
            ".leaflet-popup", ".gm-style-iw-d"
        ]

        try:
            markers = []
            for selector in marker_selectors:
                try:
                    found_markers = driver.find_elements(By.CSS_SELECTOR, selector)
                except Exception:
                    continue
                if found_markers:
                    markers = found_markers
                    print(f"    📍 Found {len(markers)} markers using {selector}")
                    break

            if not markers:
                print("    ℹ️ No clickable markers found")
                return parishes

            # Limit markers to avoid timeout
            for marker in markers[:5]:  # Only try first 5
                try:
                    # Scroll marker into view
                    driver.execute_script("arguments[0].scrollIntoView(true);", marker)
                    time.sleep(0.5)

                    # Click via JS so overlapping elements cannot intercept it
                    driver.execute_script("arguments[0].click();", marker)
                    time.sleep(1)

                    popup_text = None
                    for popup_selector in popup_selectors:
                        try:
                            popup = driver.find_element(By.CSS_SELECTOR, popup_selector)
                            popup_text = popup.text
                            break
                        except Exception:
                            # Popup type not present: try the next selector
                            continue

                    if popup_text and len(popup_text) > 10:
                        parish_data = self._parse_popup_content(popup_text)
                        if parish_data:
                            parishes.append(parish_data)

                except Exception:
                    # Stale or unclickable marker: move on to the next one
                    continue

        except Exception as e:
            print(f"    ℹ️ Marker extraction completed: {str(e)[:50]}...")

        return parishes

    def _parse_popup_content(self, popup_text: str) -> Optional[ParishData]:
        """Parse parish information from the text of a marker popup.

        The first non-empty line is taken as the name and must contain a
        parish keyword. Each remaining line is classified as a phone number
        or a street address; if several lines qualify, the last one wins.
        """
        lines = [line.strip() for line in popup_text.split('\n') if line.strip()]

        if not lines:
            return None

        name = lines[0]  # First line is usually the name

        # Skip if it doesn't look like a parish name
        lowered_name = name.lower()
        if not any(indicator in lowered_name for indicator in
                   ('parish', 'church', 'st.', 'saint', 'our lady', 'holy', 'cathedral')):
            return None

        address = None
        phone = None

        # Word boundaries stop the street suffixes from matching inside
        # unrelated words ("8am first Sunday" used to match on "st").
        street_pattern = (
            r'\d+.*\b(?:street|st|avenue|ave|road|rd|drive|dr|boulevard|blvd|'
            r'lane|ln|highway|hwy|court|ct)\b'
        )

        # Look for address and phone in remaining lines
        for line in lines[1:]:
            line_phone = self.extract_phone(line)
            if line_phone:
                phone = line_phone
            elif re.search(street_pattern, line, re.I):
                address = line

        return ParishData(
            name=name,
            address=address,
            phone=phone,
            confidence_score=0.6,
            extraction_method="marker_popup_extraction"
        )

# Nothing is replaced by this statement: it only confirms the class above was
# defined. The processing function in Cell 6 instantiates
# ImprovedInteractiveMapExtractor directly for INTERACTIVE_MAP listings.
print("✅ Improved map extractor loaded with better error handling")

✅ Improved map extractor loaded with better error handling


In [35]:
# =============================================================================
# CELL 5: Integration with Your Existing Database Functions (FIXED)
# =============================================================================

def prepare_parish_for_supabase(parish_data: ParishData, diocese_name: str, diocese_url: str, parish_directory_url: str) -> Dict:
    """Flatten a ParishData into the row dict that is inserted into 'Parishes'.

    Args:
        parish_data: Extracted parish record.
        diocese_name: Accepted for signature compatibility; not written to the row.
        diocese_url: Stored under 'diocese_url'.
        parish_directory_url: Stored under 'parish_directory_url'.

    Returns:
        Dict keyed by column name. Missing values stay None; the caller
        strips them before inserting.
    """
    # Identity and status columns
    row = {
        'Name': parish_data.name,
        'Status': 'Parish',  # Default status
        'Deanery': None,  # Will be populated later if available
    }

    # Postal address and contact details
    row['Street Address'] = parish_data.address
    row['City'] = parish_data.city
    row['State'] = parish_data.state
    row['Zip Code'] = parish_data.zip_code
    row['Phone Number'] = parish_data.phone
    row['Web'] = parish_data.website

    # Provenance of the record
    row['diocese_url'] = diocese_url
    row['parish_directory_url'] = parish_directory_url
    row['extraction_method'] = parish_data.extraction_method

    # Additional metadata fields
    row['confidence_score'] = parish_data.confidence_score
    row['extracted_at'] = datetime.now().isoformat()

    return row

def enhanced_safe_upsert_to_supabase(parishes: List[ParishData], diocese_name: str, diocese_url: str, parish_directory_url: str):
    """Insert each extracted parish into the 'Parishes' table, one row at a time.

    Parishes whose name contains a non-parish keyword, or whose name is
    missing or shorter than 3 characters, are skipped. A failure on one parish
    is printed and does not stop the rest.

    NOTE(review): despite the name this performs insert(), not upsert(), so
    re-running presumably creates duplicate rows unless the table enforces
    uniqueness — confirm against the schema.

    Returns:
        True when at least one row was saved, otherwise False (including when
        the Supabase client is unavailable).
    """
    if not supabase:
        print("  ❌ Supabase not available")
        return False

    # Names containing any of these are site furniture, not parishes
    # (like "Parish Finder", "Contact Info", etc.)
    non_parish_words = ('finder', 'contact', 'chancery', 'pastoral center', 'tv mass', 'directory')
    saved = 0

    for parish in parishes:
        try:
            lowered_name = parish.name.lower()
            if any(word in lowered_name for word in non_parish_words):
                print(f"    ⏭️ Skipped: {parish.name} (not a parish)")
                continue

            # Convert to the existing schema format
            row = prepare_parish_for_supabase(parish, diocese_name, diocese_url, parish_directory_url)

            # Drop None values and empty strings before sending
            payload = {}
            for column, value in row.items():
                if value is None or value == "":
                    continue
                payload[column] = value

            # Must have a name to proceed
            name_value = payload.get('Name')
            if not name_value or len(name_value) < 3:
                print(f"    ⏭️ Skipped: Invalid name for parish")
                continue

            response = supabase.table('Parishes').insert(payload).execute()

            insert_failed = hasattr(response, 'error') and response.error
            if insert_failed:
                print(f"    ❌ Database error for {parish.name}: {response.error}")
            else:
                saved += 1
                print(f"    ✅ Saved: {parish.name} (confidence: {parish.confidence_score:.2f})")

        except Exception as e:
            print(f"    ❌ Error saving {parish.name}: {e}")

    # The denominator counts every input parish, skipped ones included
    print(f"  📊 Successfully saved {saved}/{len(parishes)} parishes")
    return saved > 0

# Alternative: Create a separate metadata table for extraction details
def create_extraction_metadata_record(diocese_name: str, diocese_url: str, parishes_found: List[ParishData], pattern_info: Dict):
    """Insert one summary row about an extraction run into 'extraction_metadata' (optional).

    The table is not created here. It has to exist in Supabase with this shape:

        CREATE TABLE extraction_metadata (
          id SERIAL PRIMARY KEY,
          diocese_name TEXT,
          diocese_url TEXT,
          parishes_count INTEGER,
          extraction_timestamp TIMESTAMP,
          pattern_platform TEXT,
          pattern_listing_type TEXT,
          pattern_confidence FLOAT,
          extraction_methods TEXT,
          success BOOLEAN
        );

    Returns:
        True when the insert reported no error; False when Supabase is
        unavailable, the response carries an error, or an exception occurred.
    """
    if not supabase:
        return False

    try:
        parish_count = len(parishes_found)
        stamped_at = datetime.now().isoformat()

        record = {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'parishes_count': parish_count,
            'extraction_timestamp': stamped_at,
            'pattern_platform': pattern_info.get('platform'),
            'pattern_listing_type': pattern_info.get('listing_type'),
            'pattern_confidence': pattern_info.get('confidence'),
            # Method names are flattened into one comma-separated string
            'extraction_methods': ', '.join(pattern_info.get('methods_used', [])),
            'success': parish_count > 0
        }

        response = supabase.table('extraction_metadata').insert(record).execute()

        if hasattr(response, 'error') and response.error:
            return False
        return True

    except Exception as e:
        print(f"    ⚠️ Could not save extraction metadata: {e}")
        return False

# Cell-load confirmation: printed once the database helper functions above are defined.
print("✅ Database integration functions loaded (FIXED for schema compatibility)")

✅ Database integration functions loaded (FIXED for schema compatibility)


In [36]:
# =============================================================================
# CELL 5.5: WebDriver Setup Function (ADD THIS BEFORE CELL 6)
# =============================================================================

def setup_enhanced_driver():
    """Create a headless Chrome WebDriver configured for parish extraction.

    Returns:
        A selenium Chrome WebDriver with a 30 s page-load timeout and a 5 s
        implicit wait.

    Raises:
        Exception: Whatever driver installation, start-up or configuration
            raised. The error is printed and re-raised; a browser session that
            was already started is shut down first so it is not leaked.
    """

    print("🔧 Setting up enhanced Chrome WebDriver...")

    chrome_flags = (
        '--headless',
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--window-size=1920,1080',
        '--disable-extensions',
        '--disable-plugins',
        # Intended to speed up loading.
        # NOTE(review): this may not be a recognised Chromium switch;
        # '--blink-settings=imagesEnabled=false' is the commonly cited
        # alternative — confirm before relying on it.
        '--disable-images',
        '--disable-javascript-harmony-shipping',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        # User agent to avoid blocking
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.implicitly_wait(5)

        print("✅ Chrome WebDriver initialized successfully")
        return driver

    except Exception as e:
        print(f"❌ Failed to initialize WebDriver: {e}")
        if driver is not None:
            # The browser started but configuring it failed: close it so the
            # Chrome process does not linger in the runtime.
            try:
                driver.quit()
            except Exception:
                pass  # best-effort cleanup; the original error is re-raised below
        raise

# Cell-load confirmation: only reports that setup_enhanced_driver is defined; no driver is started here.
print("✅ WebDriver setup function loaded")

✅ WebDriver setup function loaded


In [37]:
# =============================================================================
# CELL 6: Master Processing Function (UPDATED FOR PARISH FINDER SUPPORT)
# =============================================================================

def process_diocese_with_existing_directory_url(diocese_info: Dict, driver) -> Dict:
    """
    Process diocese using the existing parish directory URL from DiocesesParishDirectory table
    UPDATED to support parish finder interfaces like Tulsa Diocese

    NOTE: Cell 7 defines a function with this same name; once that cell has
    run, it shadows this definition.

    Args:
        diocese_info: Dict with the keys 'url', 'name' and 'parish_directory_url'.
        driver: Selenium WebDriver used to load the directory page.

    Returns:
        Result dict with the keys 'diocese_name', 'diocese_url',
        'parish_directory_url', 'timestamp', 'pattern_detected',
        'parishes_found', 'success', 'extraction_methods_used',
        'processing_time' and 'errors'. Exceptions are caught and recorded in
        'errors' rather than raised.
    """

    diocese_url = diocese_info['url']
    diocese_name = diocese_info['name']
    parish_directory_url = diocese_info['parish_directory_url']

    print(f"\n{'='*60}")
    print(f"🔍 ENHANCED PROCESSING: {diocese_name}")
    print(f"📍 Main URL: {diocese_url}")
    print(f"📂 Parish Directory URL: {parish_directory_url}")
    print(f"{'='*60}")

    # Result skeleton; fields are filled in as the steps below complete
    result = {
        'diocese_name': diocese_name,
        'diocese_url': diocese_url,
        'parish_directory_url': parish_directory_url,
        'timestamp': datetime.now().isoformat(),
        'pattern_detected': None,
        'parishes_found': [],
        'success': False,
        'extraction_methods_used': [],
        'processing_time': 0,
        'errors': []
    }

    start_time = time.time()

    try:
        # Step 1: Load the parish directory page directly
        print("  📥 Loading parish directory page...")
        driver.get(parish_directory_url)
        # Fixed 3-second pause before the page source is captured
        time.sleep(3)

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 2: Detect pattern on the parish directory page
        print("  🔍 Detecting website pattern...")
        detector = PatternDetector()
        pattern = detector.detect_pattern(html_content, parish_directory_url)

        result['pattern_detected'] = {
            'platform': pattern.platform.value,
            'listing_type': pattern.listing_type.value,
            'confidence': pattern.confidence_score,
            'extraction_method': pattern.extraction_method,
            'javascript_required': pattern.javascript_required,
            'notes': pattern.notes
        }

        print(f"    📋 Platform: {pattern.platform.value}")
        print(f"    📊 Listing Type: {pattern.listing_type.value}")
        print(f"    🎯 Confidence: {pattern.confidence_score:.2f}")
        print(f"    ⚙️ Method: {pattern.extraction_method}")

        # Step 3: Extract parishes using pattern-specific method
        parishes = []

        # Try primary extraction method based on detected pattern;
        # any listing type not matched below falls through to GenericExtractor
        if pattern.listing_type == ParishListingType.PARISH_FINDER:
            extractor = ParishFinderExtractor(pattern)
        elif pattern.listing_type == ParishListingType.INTERACTIVE_MAP:
            extractor = ImprovedInteractiveMapExtractor(pattern)
        elif pattern.listing_type == ParishListingType.STATIC_TABLE:
            extractor = TableExtractor(pattern)
        else:
            extractor = GenericExtractor(pattern)

        # The primary extractor is not individually guarded: an exception here
        # is handled by the outer except and skips the fallbacks
        print(f"  🔄 Extracting using {extractor.__class__.__name__}...")
        parishes = extractor.extract(driver, soup, parish_directory_url)
        result['extraction_methods_used'].append(extractor.__class__.__name__)

        # Step 4: Fallback methods if primary failed
        if not parishes:
            print("  🔄 Primary method found no parishes, trying fallbacks...")

            # Try other extractors in order of likely success
            fallback_extractors = [
                ParishFinderExtractor(pattern),  # Try parish finder first
                TableExtractor(pattern),
                ImprovedInteractiveMapExtractor(pattern),
                GenericExtractor(pattern)
            ]

            for fallback_extractor in fallback_extractors:
                # Compared by class name, so the primary extractor's class is not run twice
                if fallback_extractor.__class__.__name__ in result['extraction_methods_used']:
                    continue  # Skip if already tried

                try:
                    print(f"    🔄 Trying {fallback_extractor.__class__.__name__}...")
                    # Fallbacks reuse the soup parsed in Step 1
                    fallback_parishes = fallback_extractor.extract(driver, soup, parish_directory_url)
                    if fallback_parishes:
                        parishes.extend(fallback_parishes)
                        result['extraction_methods_used'].append(fallback_extractor.__class__.__name__)
                        # Stop at the first fallback that returns anything
                        break
                except Exception as e:
                    print(f"    ❌ {fallback_extractor.__class__.__name__} failed: {e}")

        # Step 5: Process results
        if parishes:
            # Remove duplicates and validate
            unique_parishes = []
            seen_names = set()

            for parish in parishes:
                # De-duplicate on the lower-cased, stripped name; the first occurrence is kept
                name_key = parish.name.lower().strip()
                if name_key not in seen_names and len(parish.name) > 2:
                    unique_parishes.append(parish)
                    seen_names.add(name_key)

            result['parishes_found'] = unique_parishes
            # Set whenever extraction returned anything; it does not reflect
            # the outcome of the database save below
            result['success'] = True

            print(f"  ✅ Found {len(unique_parishes)} unique parishes")

            # Step 6: Save to database
            if unique_parishes:
                print("  💾 Saving to database...")
                # The boolean returned by the save helper is not inspected
                enhanced_safe_upsert_to_supabase(unique_parishes, diocese_name, diocese_url, parish_directory_url)

        else:
            print("  ❌ No parishes found with any extraction method")
            result['success'] = False

    except Exception as e:
        # Any failure in Steps 1-6 lands here; 'success' keeps whatever value it had
        error_msg = str(e)
        result['errors'].append(error_msg)
        print(f"  ❌ Processing error: {error_msg}")

    finally:
        # Elapsed wall-clock time is recorded on success and failure alike
        result['processing_time'] = time.time() - start_time
        print(f"  ⏱️ Completed in {result['processing_time']:.1f}s")

    return result

# Cell-load confirmation: the function above is defined here but not called in this cell.
print("✅ Master processing function loaded (UPDATED with parish finder support)")

✅ Master processing function loaded (UPDATED with parish finder support)


In [38]:
# =============================================================================
# CELL 7: Main Execution Using Existing Parish Directory URLs (FIXED)
# =============================================================================

# Get dioceses WITH their parish directory URLs from your existing data.
# Outcome: the notebook-level list `dioceses_to_process`, holding dicts with
# the keys 'name', 'url' and 'parish_directory_url'. It is left empty when
# Supabase is unavailable, nothing matches, or any query raises.
if supabase:
    try:
        print("📥 Fetching dioceses with parish directory URLs from database...")

        # Select directory rows whose parish_directory_url is neither NULL nor ''
        response = supabase.table('DiocesesParishDirectory').select(
            'diocese_url, parish_directory_url'
        ).not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

        diocese_directory_data = response.data if response.data else []
        print(f"📊 Found {len(diocese_directory_data)} dioceses with parish directory URLs")

        # Get diocese names from the main table
        if diocese_directory_data:
            diocese_urls = [item['diocese_url'] for item in diocese_directory_data]

            # Get diocese names for these URLs
            diocese_names_response = supabase.table('Dioceses').select(
                'Website, Name'
            ).in_('Website', diocese_urls).execute()

            diocese_names_data = diocese_names_response.data if diocese_names_response.data else []

            # Create a mapping of URL to name (the two tables are joined here, client-side)
            url_to_name = {item['Website']: item['Name'] for item in diocese_names_data}

            # Combine the data
            dioceses_to_process = []
            for item in diocese_directory_data:
                diocese_url = item['diocese_url']
                parish_directory_url = item['parish_directory_url']
                # Exact string match on the URL; rows without a match get a placeholder name
                diocese_name = url_to_name.get(diocese_url, 'Unknown Diocese')

                dioceses_to_process.append({
                    'name': diocese_name,
                    'url': diocese_url,
                    'parish_directory_url': parish_directory_url
                })

            # Randomly sample for testing.
            # MAX_DIOCESES_TO_PROCESS is not defined in this cell — presumably set in the configuration cell.
            # No seed is set in this cell, so the subset can differ between runs.
            if len(dioceses_to_process) > MAX_DIOCESES_TO_PROCESS:
                dioceses_to_process = random.sample(dioceses_to_process, MAX_DIOCESES_TO_PROCESS)

            print(f"📊 Selected {len(dioceses_to_process)} dioceses for enhanced processing")

            # Display what we're about to process
            print(f"\n📋 Dioceses to process:")
            for i, diocese in enumerate(dioceses_to_process, 1):
                print(f"  {i}. {diocese['name']}")
                print(f"     Main URL: {diocese['url']}")
                print(f"     Parish Directory: {diocese['parish_directory_url']}")
        else:
            dioceses_to_process = []

    except Exception as e:
        # Any query or lookup failure discards partial results
        print(f"❌ Error fetching dioceses with parish directories: {e}")
        dioceses_to_process = []
else:
    # NOTE(review): the message mentions test data, but none is loaded; the list stays empty
    print("❌ No Supabase connection, using test data")
    dioceses_to_process = []

def _dedupe_parishes(parishes: List, diocese_url: str, parish_directory_url: str) -> List:
    """Keep the first parish seen for each usable name and tag it with its source URLs."""
    unique_parishes = []
    seen_names = set()

    for parish in parishes:
        # An extractor may leave the name unset; treat that as an empty name
        # instead of letting one bad record abort the whole diocese.
        name = (getattr(parish, 'name', None) or '').strip()
        name_key = name.lower()

        # Names of 1-2 characters are treated as extraction noise.
        if len(name) <= 2 or name_key in seen_names:
            continue

        # Set the source URLs for each parish
        parish.diocese_url = diocese_url
        parish.parish_directory_url = parish_directory_url
        unique_parishes.append(parish)
        seen_names.add(name_key)

    return unique_parishes


def process_diocese_with_existing_directory_url(diocese_info: Dict, driver) -> Dict:
    """
    Process a diocese using the parish directory URL already stored for it.

    The directory page is loaded in ``driver``, the site pattern is detected and
    recorded, every extractor is tried in turn, the combined results are
    de-duplicated by name, and any surviving parishes are saved to the database.

    Args:
        diocese_info: Dict with the keys 'url', 'name' and 'parish_directory_url'.
        driver: Browser driver exposing ``get()`` and ``page_source``; it is also
            handed to each extractor.

    Returns:
        Dict with the keys 'diocese_name', 'diocese_url', 'parish_directory_url',
        'timestamp', 'pattern_detected', 'parishes_found', 'success',
        'extraction_methods_used', 'processing_time' and 'errors'.
        'success' is True only when at least one uniquely named parish remains
        after de-duplication.

    Exceptions raised while processing are not propagated; their messages are
    collected in ``result['errors']``.
    """

    diocese_url = diocese_info['url']
    diocese_name = diocese_info['name']
    parish_directory_url = diocese_info['parish_directory_url']

    print(f"\n{'='*60}")
    print(f"🔍 ENHANCED PROCESSING: {diocese_name}")
    print(f"📍 Main URL: {diocese_url}")
    print(f"📂 Parish Directory URL: {parish_directory_url}")
    print(f"{'='*60}")

    result = {
        'diocese_name': diocese_name,
        'diocese_url': diocese_url,
        'parish_directory_url': parish_directory_url,
        'timestamp': datetime.now().isoformat(),
        'pattern_detected': None,
        'parishes_found': [],
        'success': False,
        'extraction_methods_used': [],
        'processing_time': 0,
        'errors': []
    }

    start_time = time.time()

    try:
        # Step 1: Load the parish directory page directly
        print("  📥 Loading parish directory page...")
        driver.get(parish_directory_url)
        time.sleep(5)  # Give more time for JS to load

        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Step 2: Detect pattern on the parish directory page
        print("  🔍 Detecting website pattern...")
        detector = PatternDetector()
        pattern = detector.detect_pattern(html_content, parish_directory_url)

        result['pattern_detected'] = {
            'platform': pattern.platform.value,
            'listing_type': pattern.listing_type.value,
            'confidence': pattern.confidence_score,
            'extraction_method': pattern.extraction_method,
            'javascript_required': pattern.javascript_required,
            'notes': pattern.notes
        }

        print(f"    📋 Platform: {pattern.platform.value}")
        print(f"    📊 Listing Type: {pattern.listing_type.value}")
        print(f"    🎯 Confidence: {pattern.confidence_score:.2f}")
        print(f"    ⚙️ Method: {pattern.extraction_method}")

        # Step 3: Extract parishes using ALL methods (not just pattern-specific).
        # The detected pattern is passed to each extractor but does not choose
        # which extractors run.
        parishes = []

        # Try all extractors in order of reliability
        extractors_to_try = [
            ('ImprovedInteractiveMapExtractor', ImprovedInteractiveMapExtractor(pattern)),
            ('TableExtractor', TableExtractor(pattern)),
            ('GenericExtractor', GenericExtractor(pattern))
        ]

        for extractor_name, extractor in extractors_to_try:
            # One failing extractor must not stop the others from running.
            try:
                print(f"  🔄 Trying {extractor_name}...")
                current_parishes = extractor.extract(driver, soup, parish_directory_url)

                if current_parishes:
                    parishes.extend(current_parishes)
                    result['extraction_methods_used'].append(extractor_name)
                    print(f"    ✅ {extractor_name} found {len(current_parishes)} parishes")
                else:
                    print(f"    ⚠️ {extractor_name} found no parishes")

            except Exception as e:
                print(f"    ❌ {extractor_name} failed: {str(e)[:100]}")
                result['errors'].append(f"{extractor_name}: {str(e)[:100]}")

        # Step 4: Process results
        if parishes:
            # Remove duplicates and validate
            unique_parishes = _dedupe_parishes(parishes, diocese_url, parish_directory_url)

            result['parishes_found'] = unique_parishes
            # Success means something usable survived filtering, not merely
            # that an extractor returned rows.
            result['success'] = bool(unique_parishes)

            if unique_parishes:
                print(f"  ✅ Found {len(unique_parishes)} unique parishes")

                # Step 5: Save to database
                print("  💾 Saving to database...")
                enhanced_safe_upsert_to_supabase(unique_parishes, diocese_name, diocese_url, parish_directory_url)
            else:
                print("  ❌ Extractors returned results, but none had a usable parish name")

        else:
            print("  ❌ No parishes found with any extraction method")

    except Exception as e:
        error_msg = str(e)
        result['errors'].append(error_msg)
        print(f"  ❌ Processing error: {error_msg}")

    finally:
        result['processing_time'] = time.time() - start_time
        print(f"  ⏱️ Completed in {result['processing_time']:.1f}s")

    return result

print("✅ Enhanced processing function loaded (FIXED with proper variable handling)")

📥 Fetching dioceses with parish directory URLs from database...
📊 Found 192 dioceses with parish directory URLs
📊 Selected 1 dioceses for enhanced processing

📋 Dioceses to process:
  1. Diocese of San Diego
     Main URL: https://sdcatholic.org/
     Parish Directory: https://sdcatholic.org/find-a-parish/
✅ Enhanced processing function loaded (FIXED with proper variable handling)


In [39]:
# =============================================================================
# CELL 8: Execute Enhanced Processing (UPDATED to use existing parish directory URLs)
# =============================================================================

# Initialise the tracking structures unconditionally so that later cells can
# read them even when no dioceses were selected for processing.
all_results = []
summary_stats = {
    'total_dioceses': len(dioceses_to_process),
    'successful_extractions': 0,
    'total_parishes_found': 0,
    'pattern_distribution': {},
    'extraction_method_usage': {},
    'average_confidence': 0.0
}

if dioceses_to_process:
    print(f"\n🚀 Starting enhanced pattern-based processing with existing parish directory URLs...")

    # Initialize driver
    driver = setup_enhanced_driver()

    # Pattern-detection confidence of each successful extraction, averaged below.
    confidence_scores = []

    try:
        for i, diocese_info in enumerate(dioceses_to_process, 1):
            print(f"\n📍 Diocese {i}/{len(dioceses_to_process)}")

            # Process with enhanced system using existing parish directory URL
            result = process_diocese_with_existing_directory_url(diocese_info, driver)
            all_results.append(result)

            # Update summary statistics
            if result['success']:
                summary_stats['successful_extractions'] += 1
                summary_stats['total_parishes_found'] += len(result['parishes_found'])

                # Track pattern distribution
                detected = result['pattern_detected']
                if detected:
                    pattern_key = f"{detected['platform']}_{detected['listing_type']}"
                    summary_stats['pattern_distribution'][pattern_key] = summary_stats['pattern_distribution'].get(pattern_key, 0) + 1
                    confidence_scores.append(detected['confidence'])

                # Track extraction methods
                for method in result['extraction_methods_used']:
                    summary_stats['extraction_method_usage'][method] = summary_stats['extraction_method_usage'].get(method, 0) + 1

            # Be respectful - pause between requests
            if i < len(dioceses_to_process):
                time.sleep(2)

    finally:
        # Clean up: close the browser even if processing raised.
        driver.quit()
        print("\n🧹 WebDriver closed")

    # Calculate final statistics
    if summary_stats['successful_extractions'] > 0:
        summary_stats['success_rate'] = (summary_stats['successful_extractions'] / summary_stats['total_dioceses']) * 100
        summary_stats['avg_parishes_per_diocese'] = summary_stats['total_parishes_found'] / summary_stats['successful_extractions']

    if confidence_scores:
        summary_stats['average_confidence'] = sum(confidence_scores) / len(confidence_scores)

\n🚀 Starting enhanced pattern-based processing with existing parish directory URLs...
🔧 Setting up enhanced Chrome WebDriver...
✅ Chrome WebDriver initialized successfully
\n📍 Diocese 1/1

🔍 ENHANCED PROCESSING: Diocese of San Diego
📍 Main URL: https://sdcatholic.org/
📂 Parish Directory URL: https://sdcatholic.org/find-a-parish/
  📥 Loading parish directory page...
  🔍 Detecting website pattern...
    📋 Platform: wordpress
    📊 Listing Type: interactive_map
    🎯 Confidence: 0.90
    ⚙️ Method: interactive_map_extraction
  🔄 Trying ImprovedInteractiveMapExtractor...
    📍 Found map container: #map
    📍 Found 125 markers using [class*='marker']
    ✅ ImprovedInteractiveMapExtractor found 4 parishes
  🔄 Trying TableExtractor...
    ⚠️ TableExtractor found no parishes
  🔄 Trying GenericExtractor...
    ✅ GenericExtractor found 5 parishes
  ✅ Found 7 unique parishes
  💾 Saving to database...
    ✅ Saved: SAINT PETER THE APOSTLE, Fallbrook (confidence: 0.60)
    ✅ Saved: SAINT RICHARD, Bo

In [40]:
    # =============================================================================
    # CELL 9: Display Results and Analysis (UPDATED)
    # =============================================================================
    # Prints the run summary once, lists per-diocese details, and writes the
    # summary plus the detailed results to a timestamped JSON file.

    if summary_stats['total_dioceses'] > 0:
        print(f"\n{'='*70}")
        print(f"📊 ENHANCED EXTRACTION SUMMARY")
        print(f"{'='*70}")
        print(f"Total dioceses processed: {summary_stats['total_dioceses']}")
        print(f"Successful extractions: {summary_stats['successful_extractions']}")
        # 'success_rate' and 'avg_parishes_per_diocese' are read with .get()
        # and a default of 0 because they may be absent from summary_stats.
        print(f"Success rate: {summary_stats.get('success_rate', 0):.1f}%")
        print(f"Total parishes found: {summary_stats['total_parishes_found']}")
        if summary_stats['successful_extractions'] > 0:
            print(f"Average parishes per diocese: {summary_stats.get('avg_parishes_per_diocese', 0):.1f}")
        print(f"\n\n📈 Pattern Distribution:")
        for pattern, count in summary_stats['pattern_distribution'].items():
            percentage = (count / summary_stats['total_dioceses']) * 100
            print(f"  {pattern.replace('_', ' ').title()}: {count} dioceses ({percentage:.1f}%)")
        print(f"\n\n🔧 Extraction Method Usage:")
        for method, count in summary_stats['extraction_method_usage'].items():
            print(f"  {method}: {count} times")
        print(f"\n\n🔍 Detailed Results:")
        for result in all_results:
            status = "✅" if result['success'] else "❌"
            parish_count = len(result['parishes_found'])
            pattern_info = ""
            if result['pattern_detected']:
                pattern_info = f" [{result['pattern_detected']['platform']} / {result['pattern_detected']['listing_type']}]"
            print(f"  {status} {result['diocese_name']}: {parish_count} parishes{pattern_info}")
            print(f"      Main URL: {result['diocese_url']}")
            print(f"      Parish Directory: {result['parish_directory_url']}")
            if result['extraction_methods_used']:
                methods = ', '.join(result['extraction_methods_used'])
                print(f"      Methods: {methods}")
            if result['errors']:
                for error in result['errors']:
                    print(f"      Error: {error[:100]}...")
            print()  # Add blank line between dioceses

        # Save summary to file for analysis
        summary_filename = f"extraction_summary_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(summary_filename, 'w') as f:
            # Every entry is written as the plain result dict, which already
            # carries its own 'errors' list, so all entries share one schema.
            # default=str stringifies any value json cannot encode natively.
            json.dump({
                'summary_stats': summary_stats,
                'detailed_results': all_results
            }, f, indent=2, default=str)
        print(f"💾 Detailed results saved to: {summary_filename}")

    else:
        print("❌ No dioceses with parish directory URLs found to process")

    print(f"\n🎉 Enhanced pattern-based extraction complete!")
    print(f"{'='*70}")

📊 ENHANCED EXTRACTION SUMMARY
Total dioceses processed: 1
Successful extractions: 1
Success rate: 100.0%
Total parishes found: 7

📊 ENHANCED EXTRACTION SUMMARY
Total dioceses processed: 1
Successful extractions: 1
Success rate: 100.0%
Total parishes found: 7
Average parishes per diocese: 7.0


📈 Pattern Distribution:
  Wordpress Interactive Map: 1 dioceses (100.0%)


🔧 Extraction Method Usage:
  ImprovedInteractiveMapExtractor: 1 times
  GenericExtractor: 1 times


🔍 Detailed Results:
  ✅ Diocese of San Diego: 7 parishes [wordpress / interactive_map]
      Main URL: https://sdcatholic.org/
      Parish Directory: https://sdcatholic.org/find-a-parish/
      Methods: ImprovedInteractiveMapExtractor, GenericExtractor

💾 Detailed results saved to: extraction_summary_20250529_000454.json

🎉 Enhanced pattern-based extraction complete!


In [41]:
# =============================================================================
# CELL 10: Test Tulsa Diocese Specifically (ADD AS NEW CELL)
# =============================================================================

def test_tulsa_diocese():
    """Run the enhanced processing function against the Diocese of Tulsa and print a report.

    A fresh WebDriver is created for the run and is always closed afterwards.
    Nothing is returned; the outcome is printed.
    """

    print("🧪 Testing enhanced extractor on Diocese of Tulsa...")

    # Fixed target for this test run
    tulsa_info = dict(
        name='Diocese of Tulsa',
        url='https://dioceseoftulsa.org/',
        parish_directory_url='https://dioceseoftulsa.org/parishfinder',
    )

    driver = setup_enhanced_driver()

    try:
        result = process_diocese_with_existing_directory_url(tulsa_info, driver)

        found = result['parishes_found']
        detected = result['pattern_detected']
        rule = '=' * 50
        outcome_icon = '✅' if result['success'] else '❌'

        # Headline figures
        print(f"\n{rule}")
        print(f"📊 TULSA DIOCESE TEST RESULTS")
        print(f"{rule}")
        print(f"Success: {outcome_icon}")
        print(f"Parishes found: {len(found)}")
        print(f"Processing time: {result['processing_time']:.1f}s")
        print(f"Methods used: {', '.join(result['extraction_methods_used'])}")

        if detected:
            print(f"Platform: {detected['platform']}")
            print(f"Listing type: {detected['listing_type']}")
            print(f"Confidence: {detected['confidence']:.2f}")

        if found:
            print(f"\n📋 Found parishes:")
            # Only the first 10 parishes are listed in detail
            for position, parish in enumerate(found[:10], start=1):
                print(f"  {position}. {parish.name}")
                # Optional fields are printed only when they hold a value
                for label, attribute in (('City', 'city'), ('Address', 'address'), ('Phone', 'phone')):
                    value = getattr(parish, attribute)
                    if value:
                        print(f"     {label}: {value}")
                print(f"     Confidence: {parish.confidence_score:.2f}")
                print()

            remaining = len(found) - 10
            if remaining > 0:
                print(f"  ... and {remaining} more parishes")

        if result['errors']:
            print(f"\n⚠️ Errors:")
            for error in result['errors']:
                print(f"  - {error}")

    finally:
        driver.quit()
        print("\n🧹 Test completed, WebDriver closed")

# Run the test
if __name__ == "__main__":
    test_tulsa_diocese()

print("✅ Tulsa Diocese test function loaded")

🧪 Testing enhanced extractor on Diocese of Tulsa...
🔧 Setting up enhanced Chrome WebDriver...
✅ Chrome WebDriver initialized successfully

🔍 ENHANCED PROCESSING: Diocese of Tulsa
📍 Main URL: https://dioceseoftulsa.org/
📂 Parish Directory URL: https://dioceseoftulsa.org/parishfinder
  📥 Loading parish directory page...
  🔍 Detecting website pattern...
    📋 Platform: ecatholic
    📊 Listing Type: parish_finder
    🎯 Confidence: 0.95
    ⚙️ Method: parish_finder_extraction
  🔄 Trying ImprovedInteractiveMapExtractor...
    📍 Found map container: #map
    ℹ️ No clickable markers found
    ⚠️ ImprovedInteractiveMapExtractor found no parishes
  🔄 Trying TableExtractor...
    ⚠️ TableExtractor found no parishes
  🔄 Trying GenericExtractor...
    ⚠️ GenericExtractor found no parishes
  ❌ No parishes found with any extraction method
  ⏱️ Completed in 40.5s

📊 TULSA DIOCESE TEST RESULTS
Success: ❌
Parishes found: 0
Processing time: 40.5s
Methods used: 
Platform: ecatholic
Listing type: parish_fi