<a href="https://colab.research.google.com/github/tomknightatl/usccb-parish-extraction/blob/main/notebooks/02_Find_Parish_Directories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Find Parish Directories

This notebook discovers parish directory URLs on diocesan websites using AI-powered analysis.

**What this does**:
- Sets up the complete environment (no separate setup notebook needed)
- Analyzes diocese websites to find parish directory pages
- Uses Google Gemini AI for intelligent link classification
- Provides fallback search using mock responses
- Saves discovered directory URLs to Supabase database

**Prerequisites**: You should run `01_Build_Dioceses_Database.ipynb` first to populate dioceses data.

In [10]:
# Updated Cell 1: Complete Environment Setup with Fixed WebDriver
import os
import sys
import warnings
warnings.filterwarnings('ignore')

print("🚀 Setting up USCCB Parish Extraction Environment...\n")

# Step 1: Clone repository if needed
repo_path = '/content/usccb-parish-extraction'
if not os.path.exists(repo_path):
    print("📁 Cloning repository...")
    !git clone https://github.com/tomknightatl/usccb-parish-extraction.git
    print("✅ Repository cloned")
else:
    print("✅ Repository already exists")
    os.chdir(repo_path)
    !git pull --quiet
    print("✅ Repository updated")

# Step 2: Set working directory and Python path
os.chdir(repo_path)
if repo_path not in sys.path:
    sys.path.insert(0, repo_path)
print(f"📂 Working directory: {os.getcwd()}")

# Step 3: Install required packages with specific Chrome setup for Colab
print("\n📦 Installing packages...")
!pip install --quiet selenium==4.15.0 webdriver-manager==4.0.1
!pip install --quiet beautifulsoup4==4.12.2 lxml
!pip install --quiet google-generativeai==0.3.0 tenacity==8.2.3
!pip install --quiet "supabase>=2.15.0"

# Setup Chrome for Colab with better error handling
print("🔧 Setting up Chrome for Colab...")
try:
    !apt-get update >/dev/null 2>&1
    !apt-get install -y wget gnupg >/dev/null 2>&1

    # Add Google's signing key and repository
    !wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - >/dev/null 2>&1
    !echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list

    # Update and install Chrome
    !apt-get update >/dev/null 2>&1
    !apt-get install -y google-chrome-stable >/dev/null 2>&1

    # Check Chrome installation
    chrome_version = !google-chrome --version 2>/dev/null
    if chrome_version:
        print(f"✅ Chrome installed: {chrome_version[0] if chrome_version else 'Unknown version'}")
    else:
        print("⚠️ Chrome installation unclear, will try alternatives")

except Exception as e:
    print(f"⚠️ Chrome setup had issues: {e}")
    print("Will try alternative approaches in WebDriver setup")

print("✅ Packages installed")

# Step 4: Import required modules
print("\n🧪 Testing imports...")
try:
    import time
    import re
    from urllib.parse import urljoin, urlparse
    from datetime import datetime
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import google.generativeai as genai
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    print("✅ External packages imported")

    # Import project modules (these might need to be adapted)
    try:
        from config.settings import setup_environment, set_config, get_config
        from src.utils.webdriver import setup_driver, load_page, clean_text
        from src.utils.ai_analysis import analyze_with_ai
        print("✅ Project modules imported")
    except ImportError as e:
        print(f"⚠️ Project modules not available: {e}")
        print("Will use fallback implementations")

except ImportError as e:
    print(f"❌ Import error: {e}")
    print("\n🔧 Try restarting runtime and running this cell again")
    raise

# Step 5: Colab-specific WebDriver setup function
def setup_colab_driver():
    """Setup Chrome WebDriver specifically for Google Colab environment."""
    try:
        print("🔧 Setting up Chrome WebDriver for Colab...")

        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('--disable-plugins')
        chrome_options.add_argument('--disable-images')
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36')
        chrome_options.add_argument('--remote-debugging-port=9222')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')

        # Start with the working approach (auto-detect)
        print("  🔍 Using Selenium auto-detection...")
        try:
            driver = webdriver.Chrome(options=chrome_options)
            print("  ✅ Selenium auto-detection successful")
        except Exception as auto_error:
            print(f"  ❌ Auto-detection failed: {auto_error}")

            # Fallback to webdriver-manager
            print("  🔄 Trying webdriver-manager as fallback...")
            try:
                from webdriver_manager.chrome import ChromeDriverManager
                service = Service(ChromeDriverManager().install())
                driver = webdriver.Chrome(service=service, options=chrome_options)
                print("  ✅ webdriver-manager approach successful")
            except Exception as wdm_error:
                print(f"  ❌ webdriver-manager also failed: {wdm_error}")
                raise Exception("All ChromeDriver approaches failed")

        # Test the driver
        driver.set_page_load_timeout(30)
        driver.get("data:text/html,<html><body><h1>Test</h1></body></html>")

        print("✅ Chrome WebDriver setup successful")
        return driver

    except Exception as e:
        print(f"❌ WebDriver setup failed: {e}")
        if 'driver' in locals() and driver:
            try:
                driver.quit()
            except:
                pass
        return None

# Step 6: Fallback implementations for missing project modules
def fallback_load_page(driver, url, timeout=30):
    """Load a page and return BeautifulSoup object."""
    try:
        driver.get(url)
        time.sleep(2)  # Wait for page to load
        html = driver.page_source
        return BeautifulSoup(html, 'html.parser')
    except Exception as e:
        print(f"Error loading page {url}: {e}")
        return None

def fallback_analyze_with_ai(content, analysis_type="parish_directory"):
    """Fallback AI analysis function."""
    # Simple keyword-based scoring as fallback
    parish_keywords = [
        'parish', 'church', 'directory', 'finder', 'location', 'mass',
        'catholic', 'sacrament', 'worship', 'faith', 'community'
    ]

    content_lower = content.lower()
    score = sum(10 for keyword in parish_keywords if keyword in content_lower)

    # Boost score for certain patterns
    if 'parish' in content_lower and ('directory' in content_lower or 'finder' in content_lower):
        score += 30
    if 'find' in content_lower and 'church' in content_lower:
        score += 20

    return {'score': min(score, 100)}

# Step 7: Configure APIs
print("\n🔑 Configuring APIs...")
from google.colab import userdata

# Simple config class
class Config:
    def __init__(self):
        self.supabase = None
        self.genai_enabled = False
        self.ai_confidence_threshold = 50
        self.request_delay = 2

config = Config()

try:
    supabase_url = userdata.get('SUPABASE_URL')
    supabase_key = userdata.get('SUPABASE_KEY')
    genai_key = userdata.get('GENAI_API_KEY_USCCB')

    # Setup Supabase
    if supabase_url and supabase_key:
        from supabase import create_client
        config.supabase = create_client(supabase_url, supabase_key)
        print("✅ Supabase connected")
    else:
        print("⚠️ Supabase credentials not found")

    # Setup Gemini AI
    if genai_key:
        genai.configure(api_key=genai_key)
        config.genai_enabled = True
        print("✅ Gemini AI configured")
    else:
        print("⚠️ Gemini AI key not found - using fallback")

except Exception as e:
    print(f"❌ Configuration error: {e}")
    print("\n🔧 Make sure to add your API keys to Colab Secrets:")
    print("   • SUPABASE_URL")
    print("   • SUPABASE_KEY")
    print("   • GENAI_API_KEY_USCCB")

print("\n🎉 Environment setup complete!")
print(f"   📊 Database: {'Connected' if config.supabase else 'Not connected'}")
print(f"   🤖 AI: {'Enabled' if config.genai_enabled else 'Fallback mode'}")

# Test the WebDriver setup
print("\n🧪 Testing WebDriver...")
test_driver = setup_colab_driver()
if test_driver:
    test_driver.quit()
    print("✅ WebDriver test successful")
else:
    print("❌ WebDriver test failed")

🚀 Setting up USCCB Parish Extraction Environment...

✅ Repository already exists
✅ Repository updated
📂 Working directory: /content/usccb-parish-extraction

📦 Installing packages...
🔧 Setting up Chrome for Colab...
✅ Chrome installed: Google Chrome 137.0.7151.55 
✅ Packages installed

🧪 Testing imports...
✅ External packages imported
✅ Project modules imported

🔑 Configuring APIs...
✅ Supabase connected
✅ Gemini AI configured

🎉 Environment setup complete!
   📊 Database: Connected
   🤖 AI: Enabled

🧪 Testing WebDriver...
🔧 Setting up Chrome WebDriver for Colab...
  🔍 Using Selenium auto-detection...
  ✅ Selenium auto-detection successful
✅ Chrome WebDriver setup successful
✅ WebDriver test successful


In [11]:
# Cell 2: Parish Directory Discovery Functions

def normalize_url_join(base_url, relative_url):
    """Properly joins URLs while avoiding double slashes."""
    if base_url.endswith('/') and relative_url.startswith('/'):
        base_url = base_url.rstrip('/')
    return urljoin(base_url, relative_url)

def get_surrounding_text(element, max_length=200):
    """Extracts context text from the parent element of a link."""
    if element and element.parent:
        parent_text = element.parent.get_text(separator=' ', strip=True)
        return parent_text[:max_length] + ('...' if len(parent_text) > max_length else '')
    return ''

def find_candidate_urls(soup, base_url):
    """Scans a webpage for potential parish directory links."""
    candidate_links = []
    processed_hrefs = set()

    # Keywords likely to appear in parish directory links
    parish_link_keywords = [
        'Churches', 'Directory of Parishes', 'Parishes', 'parishfinder', 'Parish Finder',
        'Find a Parish', 'Locations', 'Our Parishes', 'Parish Listings', 'Find a Church',
        'Church Directory', 'Faith Communities', 'Find Mass Times', 'Our Churches',
        'Search Parishes', 'Parish Map', 'Mass Schedule', 'Sacraments', 'Worship'
    ]

    # URL path patterns for parish directories
    url_patterns = [
        r'parishes', r'directory', r'locations', r'churches',
        r'parish-finder', r'findachurch', r'parishsearch', r'parishdirectory',
        r'find-a-church', r'church-directory', r'parish-listings', r'parish-map',
        r'mass-times', r'sacraments', r'search', r'worship', r'finder'
    ]

    all_links = soup.find_all('a', href=True)

    for link_tag in all_links:
        href = link_tag['href']
        if not href or href.startswith('#') or href.lower().startswith('javascript:') or href.lower().startswith('mailto:'):
            continue

        abs_href = normalize_url_join(base_url, href)
        if not abs_href.startswith('http') or abs_href in processed_hrefs:
            continue

        link_text = link_tag.get_text(strip=True)
        surrounding_text = get_surrounding_text(link_tag)
        parsed_href_path = urlparse(abs_href).path.lower()

        # Check for matches based on keywords or URL patterns
        text_match = any(keyword.lower() in link_text.lower() or keyword.lower() in surrounding_text.lower()
                        for keyword in parish_link_keywords)
        pattern_match = any(re.search(pattern, parsed_href_path, re.IGNORECASE)
                           for pattern in url_patterns)

        if text_match or pattern_match:
            candidate_links.append({
                'text': link_text,
                'href': abs_href,
                'surrounding_text': surrounding_text
            })
            processed_hrefs.add(abs_href)

    return candidate_links

def analyze_links_with_ai(candidate_links, diocese_name=None):
    """Analyzes candidate links using AI to find the best parish directory URL."""
    best_link = None
    highest_score = -1

    print(f"    🤖 Analyzing {len(candidate_links)} candidate links with AI...")

    for link_info in candidate_links:
        try:
            # Create analysis prompt
            link_context = f"Link text: '{link_info['text']}', URL: '{link_info['href']}', Context: '{link_info['surrounding_text'][:100]}'"

            # Use the AI analysis utility
            analysis = analyze_with_ai(link_context, "parish_directory")
            score = analysis.get('score', 0)

            print(f"      📊 '{link_info['text'][:30]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > highest_score:
                highest_score = score
                best_link = link_info['href']

        except Exception as e:
            print(f"      ❌ Error analyzing link: {str(e)[:50]}...")
            continue

    return best_link

print("✅ Parish directory discovery functions loaded")

✅ Parish directory discovery functions loaded


In [12]:
# Cell 3: Database Functions

def get_dioceses_to_process(limit=None):
    """Get dioceses that need parish directory URL discovery."""
    if not config or not config.supabase:
        print("❌ No database connection")
        return []

    try:
        # Get all dioceses
        response = config.supabase.table('Dioceses').select('Website, Name').execute()
        all_dioceses = response.data or []

        # Get dioceses that already have directory URLs
        try:
            processed_response = config.supabase.table('DiocesesParishDirectory').select(
                'diocese_url'
            ).not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

            processed_urls = {item['diocese_url'] for item in (processed_response.data or [])}
        except:
            # Table might not exist yet, process all dioceses
            processed_urls = set()

        # Filter to unprocessed dioceses
        unprocessed = [
            {'url': d['Website'], 'name': d['Name']}
            for d in all_dioceses
            if d.get('Website') and d['Website'] not in processed_urls
        ]

        if limit and len(unprocessed) > limit:
            import random
            unprocessed = random.sample(unprocessed, limit)

        return unprocessed

    except Exception as e:
        print(f"❌ Error fetching dioceses: {e}")
        return []

def save_directory_result(diocese_url, directory_url, success, method="ai_analysis"):
    """Save parish directory discovery result to database."""
    if not config or not config.supabase:
        print(f"  📝 Would save: {diocese_url} -> {directory_url or 'Not Found'}")
        return

    try:
        data = {
            'diocese_url': diocese_url,
            'parish_directory_url': directory_url,
            'found': 'Success' if success else 'Not Found',
            'found_method': method,
            'updated_at': datetime.now().isoformat()
        }

        config.supabase.table('DiocesesParishDirectory').upsert(data).execute()
        print(f"  💾 Saved result to database")

    except Exception as e:
        print(f"  ❌ Error saving to database: {e}")

def search_for_directory_link(diocese_name, diocese_website_url):
    """Uses mock search results as fallback to find parish directory links."""
    print(f"    🔍 Using search engine fallback for {diocese_name}...")

    # Generate mock search results based on common patterns
    print(f"    📝 Using mock search results")
    mock_results = [
        {
            'link': normalize_url_join(diocese_website_url, '/parishes'),
            'title': f"Parishes - {diocese_name}",
            'snippet': f"List of parishes in the Diocese of {diocese_name}. Find a parish near you."
        },
        {
            'link': normalize_url_join(diocese_website_url, '/directory'),
            'title': f"Directory - {diocese_name}",
            'snippet': f"Official directory of churches and schools for {diocese_name}."
        },
        {
            'link': normalize_url_join(diocese_website_url, '/find-a-church'),
            'title': f"Find a Church - {diocese_name}",
            'snippet': f"Search for a Catholic church in {diocese_name}. Mass times and locations."
        }
    ]

    return analyze_search_snippets_with_ai(mock_results, diocese_name)

def analyze_search_snippets_with_ai(search_results, diocese_name):
    """Analyzes search result snippets to find the best parish directory URL."""
    best_link = None
    highest_score = -1

    print(f"    🤖 Analyzing {len(search_results)} search snippets with AI...")

    for result in search_results:
        try:
            snippet_context = f"Title: '{result.get('title', '')}', Snippet: '{result.get('snippet', '')}', URL: '{result.get('link', '')}'"

            analysis = analyze_with_ai(snippet_context, "parish_directory")
            score = analysis.get('score', 0)

            print(f"      📊 '{result.get('title', '')[:30]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > highest_score:
                highest_score = score
                best_link = result.get('link')

        except Exception as e:
            print(f"      ❌ Error analyzing snippet: {str(e)[:50]}...")
            continue

    return best_link

print("✅ Database and search functions loaded")

✅ Database and search functions loaded


In [13]:
# Cell 4: Main Processing Loop - FIXED VERSION

def analyze_links_with_fallback(candidate_links, diocese_name=None):
    """Analyzes candidate links using enhanced keyword scoring."""
    best_link = None
    highest_score = -1

    print(f"    🤖 Analyzing {len(candidate_links)} candidate links...")

    for link_info in candidate_links:
        try:
            # Create analysis context
            link_context = f"Link text: '{link_info['text']}', URL: '{link_info['href']}', Context: '{link_info['surrounding_text'][:100]}'"

            # Use enhanced keyword-based scoring
            analysis = fallback_analyze_with_ai(link_context)
            score = analysis.get('score', 0)

            print(f"      📊 '{link_info['text'][:30]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > highest_score:
                highest_score = score
                best_link = link_info['href']

        except Exception as e:
            print(f"      ❌ Error analyzing link: {str(e)[:50]}...")
            continue

    return best_link

def test_directory_paths(diocese_name, diocese_website_url):
    """Tests common parish directory URL paths to see if they exist."""
    print(f"    🔍 Testing common directory paths...")

    # Common parish directory paths - expanded with real-world patterns
    common_paths = [
        '/parishes',
        '/directory',
        '/parish-directory',
        '/find-a-parish',
        '/churches',
        '/locations',
        '/parish-finder',
        '/church-directory',
        '/find-a-church',
        '/parish-listings',
        '/parish-map',
        '/worship',
        '/sacraments',
        # Ukrainian Catholic specific patterns
        '/en/eparchy/map-of-eparchy.html',
        '/eparchy/map-of-eparchy',
        '/map-of-eparchy',
        '/eparchy/parishes',
        '/en/parishes',
        # Other common patterns found in real dioceses
        '/our-parishes',
        '/communities',
        '/catholic-churches',
        '/church-locator',
        '/mass-times',
        '/parish-search',
        '/ministries',
        '/about/parishes'
    ]

    found_links = []

    # Test common directory paths
    for path in common_paths:
        try:
            test_url = normalize_url_join(diocese_website_url, path)
            print(f"      🔍 Testing: {test_url}")

            # Use requests to test if URL exists
            headers = {
                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
            }
            response = requests.head(test_url, headers=headers, timeout=5, allow_redirects=True)

            if response.status_code == 200:
                # Also check if it's not just redirecting to homepage
                if response.url != diocese_website_url and response.url != diocese_website_url + '/':
                    found_links.append({
                        'link': test_url,
                        'title': f"Parish Directory - {diocese_name}",
                        'snippet': f"Parish directory found at {path}. Likely contains parish listings and information."
                    })
                    print(f"      ✅ Found working link: {test_url}")
                else:
                    print(f"      ⚠️ Redirects to homepage: {test_url}")
            else:
                print(f"      ❌ {response.status_code}: {test_url}")

        except Exception as e:
            print(f"      ❌ Failed: {path} - {str(e)[:30]}...")
            continue

    if found_links:
        print(f"    📋 Found {len(found_links)} working directory paths")
        return analyze_directory_candidates(found_links, diocese_name)
    else:
        print(f"    ❌ No standard directory paths found")
        # Try one more approach - scan the main page more thoroughly
        return scan_main_page_deeper(diocese_name, diocese_website_url)

def scan_main_page_deeper(diocese_name, diocese_website_url):
    """Scan the main page more thoroughly for any parish-related links."""
    print(f"    🔍 Scanning main page more thoroughly...")

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        }
        response = requests.get(diocese_website_url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for any links that might contain parish information
        all_links = soup.find_all('a', href=True)
        potential_links = []

        for link in all_links:
            href = link.get('href', '')
            text = link.get_text(strip=True).lower()

            # Look for parish-related keywords in link text or href
            parish_indicators = [
                'parish', 'church', 'community', 'location', 'directory',
                'епархія', 'парафії', 'церкви',  # Ukrainian terms
                'map', 'find', 'search', 'contact'
            ]

            if any(indicator in text or indicator in href.lower() for indicator in parish_indicators):
                full_url = normalize_url_join(diocese_website_url, href)
                if full_url != diocese_website_url:  # Don't include homepage
                    potential_links.append({
                        'link': full_url,
                        'title': f"Potential Parish Page - {text[:50]}",
                        'snippet': f"Link text: '{text}' - May contain parish information"
                    })

        if potential_links:
            print(f"    📋 Found {len(potential_links)} potential parish-related links")
            # Limit to top 5 most promising
            return analyze_directory_candidates(potential_links[:5], diocese_name)
        else:
            print(f"    ❌ No parish-related links found on main page")
            return None

    except Exception as e:
        print(f"    ❌ Error scanning main page: {str(e)[:50]}...")
        return None

def analyze_directory_candidates(search_results, diocese_name):
    """Analyzes potential directory links to find the best one."""
    best_link = None
    highest_score = -1

    print(f"    🤖 Analyzing {len(search_results)} potential directory links...")

    for result in search_results:
        try:
            snippet_context = f"Title: '{result.get('title', '')}', Snippet: '{result.get('snippet', '')}', URL: '{result.get('link', '')}'"

            # Use enhanced fallback analysis
            analysis = fallback_analyze_with_ai(snippet_context)
            score = analysis.get('score', 0)

            print(f"      📊 '{result.get('title', '')[:50]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > highest_score:
                highest_score = score
                best_link = result.get('link')

        except Exception as e:
            print(f"      ❌ Error analyzing candidate: {str(e)[:50]}...")
            continue

    return best_link

def process_diocese_for_directory(diocese_info, driver):
    """Process a single diocese to find its parish directory URL."""
    diocese_url = diocese_info['url']
    diocese_name = diocese_info['name']

    print(f"\\n🏛️ Processing: {diocese_name}")
    print(f"  📍 URL: {diocese_url}")

    try:
        # Load the diocese website
        print(f"  📥 Loading website...")
        soup = fallback_load_page(driver, diocese_url)

        if not soup:
            raise Exception("Failed to load website")

        # Find candidate links
        print(f"  🔍 Scanning for parish directory links...")
        candidate_links = find_candidate_urls(soup, diocese_url)

        if not candidate_links:
            print(f"  ⚠️ No candidate links found on main page")
            # Try testing common directory paths
            directory_url = test_directory_paths(diocese_name, diocese_url)
            method = "path_testing"
        else:
            print(f"  📋 Found {len(candidate_links)} candidate links")
            # Analyze with keyword scoring
            directory_url = analyze_links_with_fallback(candidate_links, diocese_name)
            method = "link_analysis"

        # Save result
        success = directory_url is not None
        if success:
            print(f"  ✅ Found directory URL: {directory_url}")
        else:
            print(f"  ❌ No parish directory URL found")

        save_directory_result(diocese_url, directory_url, success, method)

        return {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'directory_url': directory_url,
            'success': success,
            'method': method
        }

    except Exception as e:
        error_msg = str(e)[:100]
        print(f"  ❌ Error processing {diocese_name}: {error_msg}")

        # Save error result
        save_directory_result(diocese_url, None, False, f"error: {error_msg}")

        return {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'directory_url': None,
            'success': False,
            'method': 'error',
            'error': error_msg
        }

# Set processing limit (you can change this)
MAX_DIOCESES_TO_PROCESS = 5  # Process 5 dioceses as a test

print(f"🚀 Starting parish directory discovery...")
print(f"📊 Will process up to {MAX_DIOCESES_TO_PROCESS} dioceses")

# Get dioceses to process
dioceses_to_scan = get_dioceses_to_process(limit=MAX_DIOCESES_TO_PROCESS)

if not dioceses_to_scan:
    print("❌ No dioceses found to process")
    print("\\n🔧 Make sure you've run 01_Build_Dioceses_Database.ipynb first")
else:
    print(f"📋 Found {len(dioceses_to_scan)} dioceses to process")

    # Setup WebDriver using our Colab-specific function
    driver = setup_colab_driver()

    if not driver:
        print("❌ Failed to setup WebDriver")
    else:
        results = []

        try:
            for i, diocese_info in enumerate(dioceses_to_scan, 1):
                print(f"\\n{'='*60}")
                print(f"Processing diocese {i}/{len(dioceses_to_scan)}")

                result = process_diocese_for_directory(diocese_info, driver)
                results.append(result)

                # Be respectful - pause between requests
                if i < len(dioceses_to_scan):
                    print(f"  ⏱️ Waiting {config.request_delay} seconds...")
                    time.sleep(config.request_delay)

        finally:
            driver.quit()
            print("\\n🧹 WebDriver closed")

        # Print summary
        print(f"\\n{'='*60}")
        print(f"📊 SUMMARY")
        print(f"{'='*60}")

        successful = sum(1 for r in results if r['success'])
        failed = len(results) - successful

        print(f"Total dioceses processed: {len(results)}")
        print(f"Successfully found directories: {successful}")
        print(f"Failed to find directories: {failed}")
        print(f"Success rate: {successful/len(results)*100:.1f}%")

        print(f"\\n📋 Detailed Results:")
        for result in results:
            status = "✅" if result['success'] else "❌"
            print(f"  {status} {result['diocese_name']}")
            if result['success']:
                print(f"      Directory: {result['directory_url']}")
            print(f"      Method: {result['method']}")
            if 'error' in result:
                print(f"      Error: {result['error']}")
            print()

        print("\\n🎉 Parish directory discovery complete!")
        print("\\n📚 Next step: Run 03_Extract_Parish_Data.ipynb to extract parish information")

🚀 Starting parish directory discovery...
📊 Will process up to 5 dioceses
📋 Found 5 dioceses to process
🔧 Setting up Chrome WebDriver for Colab...
  🔍 Using Selenium auto-detection...
  ✅ Selenium auto-detection successful
✅ Chrome WebDriver setup successful
Processing diocese 1/5
\n🏛️ Processing: Diocese of Des Moines
  📍 URL: https://www.dmdiocese.org
  📥 Loading website...
  🔍 Scanning for parish directory links...
  📋 Found 12 candidate links
    🤖 Analyzing 12 candidate links...
      📊 'Diocesan Directory...' -> Score: 10
      📊 'Hispanic Ministry...' -> Score: 20
      📊 'Parishes with Hispanic Service...' -> Score: 20
      📊 'Worship...' -> Score: 30
      📊 'Parishes & Mass Times...' -> Score: 30
      📊 'Why Mass...' -> Score: 20
      📊 'Eucharistic Adoration...' -> Score: 10
      📊 'Holy Days & Calendar...' -> Score: 10
      📊 'Lent...' -> Score: 10
      📊 'Sacraments...' -> Score: 20
      📊 '...' -> Score: 10
      📊 '...' -> Score: 10
  ❌ No parish directory URL foun