# Find Parish Directories

This notebook discovers parish directory URLs on diocesan websites using AI-powered analysis.

**Prerequisites**: 
1. Run `00_Colab_Setup.ipynb` first
2. Run `01_Build_Dioceses_Database.ipynb` to populate dioceses

**What this does**:
- Analyzes diocese websites to find parish directory pages
- Uses Google Gemini AI for intelligent link classification
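- Provides a search-based fallback (live Google Custom Search is not implemented in this version; mock search results are analyzed when AI runs in mock mode)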
- Saves discovered directory URLs to Supabase database

In [None]:
# Cell 1: Setup Environment and Imports
import os
import sys
import time
import re
from urllib.parse import urljoin, urlparse
from datetime import datetime

# Location where the setup notebook is expected to have cloned the project
repo_path = '/content/usccb-parish-extraction'

if os.path.exists(repo_path):
    # Work from the repository root and make its packages importable
    os.chdir(repo_path)
    if repo_path not in sys.path:
        sys.path.append(repo_path)
else:
    # Without the clone nothing below can run, so stop with instructions
    print("❌ Repository not found!")
    print("Please run 00_Colab_Setup.ipynb first to clone the repository.")
    raise FileNotFoundError("Repository not found")

print(f"📂 Working directory: {os.getcwd()}")
print("🐍 Python path configured")

# Import required modules
# All imports share one try block so that a single missing dependency produces
# the troubleshooting checklist below instead of a bare traceback.
try:
    # Third-party packages
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    import google.generativeai as genai
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.common.exceptions import TimeoutException, WebDriverException
    from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
    from google.api_core.exceptions import DeadlineExceeded, ServiceUnavailable, ResourceExhausted
    from googleapiclient.discovery import build
    from googleapiclient.errors import HttpError
    
    # Project-local modules, resolved relative to the repository root
    from config.settings import get_config
    from src.utils.webdriver import setup_driver, load_page, clean_text
    from src.utils.ai_analysis import analyze_with_ai
    
    print("✅ All modules imported successfully")
    
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("\n🔧 Troubleshooting:")
    print("1. Make sure you've run 00_Colab_Setup.ipynb completely")
    print("2. If you restarted the runtime, re-run the setup notebook")
    print("3. Check that all required packages are installed")
    # Re-raise so the cell fails visibly after the hints have been printed
    raise

# Load the shared configuration and report which services it provides
try:
    config = get_config()
    print("✅ Configuration loaded successfully")

    db_status = 'Connected' if config.supabase else 'Not connected'
    print(f"📊 Database: {db_status}")

    ai_status = 'Enabled' if config.genai_enabled else 'Mock mode'
    print(f"🤖 AI: {ai_status}")
except RuntimeError as config_error:
    # Print guidance, then let the error propagate so the cell fails
    print(f"❌ Configuration error: {config_error}")
    print("\n🔧 Please run 00_Colab_Setup.ipynb first to configure your environment.")
    raise

In [None]:
# Cell 2: Parish Directory Discovery Functions

def normalize_url_join(base_url, relative_url):
    """Resolve ``relative_url`` against ``base_url`` and return the joined URL.

    When the base ends with a slash and the relative part starts with one,
    the base's trailing slashes are dropped before the two are handed to
    ``urljoin``.
    """
    slash_on_both_sides = base_url.endswith('/') and relative_url.startswith('/')
    root = base_url.rstrip('/') if slash_on_both_sides else base_url
    return urljoin(root, relative_url)

def get_surrounding_text(element, max_length=200):
    """Return the text of ``element``'s parent, cut to ``max_length`` characters.

    A trailing ``'...'`` marks text that was cut. An empty string is returned
    when ``element`` is falsy or has no parent.
    """
    if not (element and element.parent):
        return ''

    context = element.parent.get_text(separator=' ', strip=True)
    if len(context) > max_length:
        return context[:max_length] + '...'
    return context[:max_length]

def find_candidate_urls(soup, base_url):
    """Collect links on a page that look like they lead to a parish directory.

    Every ``<a>`` tag with an ``href`` in ``soup`` is resolved against
    ``base_url``. A link is kept when one of the keyword phrases occurs
    (case-insensitively) in the link text or in the text of its parent
    element, or when one of the URL patterns matches the link's path.
    Fragment-only, ``javascript:`` and ``mailto:`` links are ignored, as are
    resolved URLs that do not start with ``http``. Each absolute URL is
    reported at most once.

    Returns:
        A list of dicts with the keys ``'text'``, ``'href'`` (absolute URL)
        and ``'surrounding_text'``, in document order.
    """
    # Keywords likely to appear in parish directory links
    parish_link_keywords = [
        'Churches', 'Directory of Parishes', 'Parishes', 'parishfinder', 'Parish Finder',
        'Find a Parish', 'Locations', 'Our Parishes', 'Parish Listings', 'Find a Church',
        'Church Directory', 'Faith Communities', 'Find Mass Times', 'Our Churches',
        'Search Parishes', 'Parish Map', 'Mass Schedule', 'Sacraments', 'Worship'
    ]

    # URL path patterns for parish directories
    url_patterns = [
        r'parishes', r'directory', r'locations', r'churches',
        r'parish-finder', r'findachurch', r'parishsearch', r'parishdirectory',
        r'find-a-church', r'church-directory', r'parish-listings', r'parish-map',
        r'mass-times', r'sacraments', r'search', r'worship', r'finder'
    ]

    # Lower-case the keywords once instead of once per link
    keywords_lc = [phrase.lower() for phrase in parish_link_keywords]
    ignored_schemes = ('javascript:', 'mailto:')

    matches = []
    seen = set()

    for anchor in soup.find_all('a', href=True):
        raw_href = anchor['href']
        if not raw_href or raw_href.startswith('#') or raw_href.lower().startswith(ignored_schemes):
            continue

        absolute = normalize_url_join(base_url, raw_href)
        if absolute in seen or not absolute.startswith('http'):
            continue

        label = anchor.get_text(strip=True)
        context = get_surrounding_text(anchor)
        path = urlparse(absolute).path.lower()

        label_lc = label.lower()
        context_lc = context.lower()
        keyword_hit = any(kw in label_lc or kw in context_lc for kw in keywords_lc)
        if not keyword_hit:
            path_hit = any(re.search(pattern, path, re.IGNORECASE) for pattern in url_patterns)
            if not path_hit:
                continue

        matches.append({
            'text': label,
            'href': absolute,
            'surrounding_text': context
        })
        seen.add(absolute)

    return matches

def analyze_links_with_ai(candidate_links, diocese_name=None):
    """Score each candidate link with the AI helper and return the best URL.

    A link qualifies only when its score reaches
    ``config.ai_confidence_threshold``; among qualifying links the first one
    with the highest score wins. A link whose analysis raises is reported and
    skipped. ``diocese_name`` is accepted but not used.

    Returns:
        The ``'href'`` of the winning link, or ``None`` when none qualifies.
    """
    print(f"    🤖 Analyzing {len(candidate_links)} candidate links with AI...")

    winner = None
    winning_score = -1

    for candidate in candidate_links:
        try:
            text = candidate['text']
            url = candidate['href']
            context = candidate['surrounding_text'][:100]
            prompt = f"Link text: '{text}', URL: '{url}', Context: '{context}'"

            verdict = analyze_with_ai(prompt, "parish_directory")
            score = verdict.get('score', 0)
            print(f"      📊 '{text[:30]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > winning_score:
                winner, winning_score = url, score
        except Exception as exc:
            # Best effort: one failing link must not stop the remaining ones
            print(f"      ❌ Error analyzing link: {str(exc)[:50]}...")

    return winner

# Confirms that every definition in this cell was executed
print("✅ Parish directory discovery functions loaded")

In [None]:
# Cell 3: Search Engine Fallback Functions

def search_for_directory_link(diocese_name, diocese_website_url):
    """Fallback lookup of a parish directory URL via search results.

    When ``config.genai_enabled`` is falsy, three mock search results
    (``/parishes``, ``/directory`` and ``/find-a-church`` under the diocese
    website) are built and ranked by ``analyze_search_snippets_with_ai``.
    Otherwise a notice is printed and ``None`` is returned, because a live
    search is not implemented here.
    """
    print(f"    🔍 Using search engine fallback for {diocese_name}...")

    if config.genai_enabled:
        # A real Google Custom Search call would go here
        print("    ⚠️ Live search not implemented in this simplified version")
        return None

    # Mock search results if no API keys configured
    print("    📝 Using mock search results")
    templates = (
        (
            '/parishes',
            f"Parishes - {diocese_name}",
            f"List of parishes in the Diocese of {diocese_name}. Find a parish near you.",
        ),
        (
            '/directory',
            f"Directory - {diocese_name}",
            f"Official directory of churches and schools for {diocese_name}.",
        ),
        (
            '/find-a-church',
            f"Find a Church - {diocese_name}",
            f"Search for a Catholic church in {diocese_name}. Mass times and locations.",
        ),
    )
    mock_results = [
        {
            'link': normalize_url_join(diocese_website_url, path),
            'title': title,
            'snippet': snippet,
        }
        for path, title, snippet in templates
    ]
    return analyze_search_snippets_with_ai(mock_results, diocese_name)

def analyze_search_snippets_with_ai(search_results, diocese_name):
    """Score search results with the AI helper and return the best link.

    Each result is a dict that may carry ``'title'``, ``'snippet'`` and
    ``'link'``. A result qualifies only when its score reaches
    ``config.ai_confidence_threshold``; the first result with the highest
    qualifying score wins. A result whose analysis raises is reported and
    skipped. ``diocese_name`` is accepted but not used.

    Returns:
        The winning result's ``'link'`` value, or ``None`` when none qualifies.
    """
    print(f"    🤖 Analyzing {len(search_results)} search snippets with AI...")

    winner = None
    winning_score = -1

    for hit in search_results:
        try:
            title = hit.get('title', '')
            snippet = hit.get('snippet', '')
            shown_link = hit.get('link', '')
            prompt = f"Title: '{title}', Snippet: '{snippet}', URL: '{shown_link}'"

            verdict = analyze_with_ai(prompt, "parish_directory")
            score = verdict.get('score', 0)
            print(f"      📊 '{title[:30]}...' -> Score: {score}")

            if score >= config.ai_confidence_threshold and score > winning_score:
                winning_score = score
                # No default here: a result without a link yields None
                winner = hit.get('link')
        except Exception as exc:
            # Best effort: one failing snippet must not stop the remaining ones
            print(f"      ❌ Error analyzing snippet: {str(exc)[:50]}...")

    return winner

# Confirms that every definition in this cell was executed
print("✅ Search fallback functions loaded")

In [None]:
# Cell 4: Database Functions

def get_dioceses_to_process(limit=None):
    """Return the dioceses that still need parish directory URL discovery.

    Reads every row of the ``Dioceses`` table and drops those whose website
    already has a non-null, non-empty ``parish_directory_url`` recorded in
    ``DiocesesParishDirectory``. Rows without a website are skipped as well:
    there is no page to scan, and their result could not be stored under a
    diocese URL.

    Args:
        limit: Optional maximum number of dioceses to return. When more are
            available, a random sample of this size is returned.

    Returns:
        A list of ``{'url': ..., 'name': ...}`` dicts. The list is empty when
        there is no database connection or when a query fails.
    """
    import random  # local import: the notebook's import cell does not load it

    if not config.supabase:
        print("❌ No database connection")
        return []

    try:
        # Get all dioceses
        response = config.supabase.table('Dioceses').select('Website, Name').execute()
        all_dioceses = response.data or []

        # Get dioceses that already have directory URLs
        processed_response = config.supabase.table('DiocesesParishDirectory').select(
            'diocese_url'
        ).not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

        processed_urls = {item['diocese_url'] for item in (processed_response.data or [])}

        # Keep dioceses that have a usable website and no stored directory yet.
        # The URL is passed on exactly as stored because it is compared with
        # `diocese_url` values in the results table.
        unprocessed = [
            {'url': d['Website'], 'name': d['Name']}
            for d in all_dioceses
            if (d.get('Website') or '').strip() and d['Website'] not in processed_urls
        ]

        if limit and len(unprocessed) > limit:
            unprocessed = random.sample(unprocessed, limit)

        return unprocessed

    except Exception as e:
        # Best effort: report the failure and let the caller see "no work"
        print(f"❌ Error fetching dioceses: {e}")
        return []

def save_directory_result(diocese_url, directory_url, success, method="ai_analysis"):
    """Persist the outcome of one diocese's directory search.

    Upserts a single row into the ``DiocesesParishDirectory`` table. Without
    a database connection the intended write is only printed. Database errors
    are printed rather than raised, so one failed write does not abort a run.

    Args:
        diocese_url: Website of the diocese the result belongs to.
        directory_url: Discovered parish directory URL, or ``None``.
        success: Whether a directory URL was found; stored as ``'Success'``
            or ``'Not Found'``.
        method: Label of the discovery method, stored as ``found_method``.
    """
    from datetime import timezone  # local import: the import cell only brings in `datetime`

    if not config.supabase:
        print(f"  📝 Would save: {diocese_url} -> {directory_url or 'Not Found'}")
        return

    try:
        data = {
            'diocese_url': diocese_url,
            'parish_directory_url': directory_url,
            'found': 'Success' if success else 'Not Found',
            'found_method': method,
            # Timezone-aware UTC timestamp: a naive local time would make the
            # stored value depend on the timezone of the machine running this.
            'updated_at': datetime.now(timezone.utc).isoformat()
        }

        config.supabase.table('DiocesesParishDirectory').upsert(data).execute()
        print("  💾 Saved result to database")

    except Exception as e:
        # Best effort: keep processing the remaining dioceses
        print(f"  ❌ Error saving to database: {e}")

# Confirms that every definition in this cell was executed
print("✅ Database functions loaded")

In [None]:
# Cell 5: Main Processing Loop

def process_diocese_for_directory(diocese_info, driver):
    """Find and store the parish directory URL for a single diocese.

    The diocese home page is loaded and scanned for candidate links, which
    are ranked by the AI helper. The search fallback runs whenever that first
    stage yields no URL: either because the page had no candidate links or
    because the AI accepted none of them. The outcome, including failures
    and errors, is written via ``save_directory_result``.

    Args:
        diocese_info: Dict with the keys ``'url'`` and ``'name'``.
        driver: WebDriver instance handed to ``load_page``.

    Returns:
        A dict with ``'diocese_name'``, ``'diocese_url'``, ``'directory_url'``,
        ``'success'`` and ``'method'`` (the last stage attempted, or
        ``'error'``). An ``'error'`` key with a shortened message is added
        when processing raised.
    """
    diocese_url = diocese_info['url']
    diocese_name = diocese_info['name']

    print(f"\n🏛️ Processing: {diocese_name}")
    print(f"  📍 URL: {diocese_url}")

    try:
        # Load the diocese website
        print("  📥 Loading website...")
        soup = load_page(driver, diocese_url)

        # Find candidate links
        print("  🔍 Scanning for parish directory links...")
        candidate_links = find_candidate_urls(soup, diocese_url)

        directory_url = None
        method = "ai_direct_analysis"

        if candidate_links:
            print(f"  📋 Found {len(candidate_links)} candidate links")
            # Analyze with AI
            directory_url = analyze_links_with_ai(candidate_links, diocese_name)
            if directory_url is None:
                print("  ⚠️ AI analysis accepted none of the candidate links")
        else:
            print("  ⚠️ No candidate links found on main page")

        # Fall back to search whenever the page scan produced no URL, not
        # only when it produced no candidates.
        if directory_url is None:
            directory_url = search_for_directory_link(diocese_name, diocese_url)
            method = "search_engine_fallback"

        # Save result
        success = directory_url is not None
        if success:
            print(f"  ✅ Found directory URL: {directory_url}")
        else:
            print("  ❌ No parish directory URL found")

        save_directory_result(diocese_url, directory_url, success, method)

        return {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'directory_url': directory_url,
            'success': success,
            'method': method
        }

    except Exception as e:
        # One failing diocese must not abort the whole run: record it and move on
        error_msg = str(e)[:100]
        print(f"  ❌ Error processing {diocese_name}: {error_msg}")

        # Save error result
        save_directory_result(diocese_url, None, False, f"error: {error_msg}")

        return {
            'diocese_name': diocese_name,
            'diocese_url': diocese_url,
            'directory_url': None,
            'success': False,
            'method': 'error',
            'error': error_msg
        }

# Upper bound on how many dioceses a single run handles (you can change this)
MAX_DIOCESES_TO_PROCESS = 5  # Process 5 dioceses as a test

print("🚀 Starting parish directory discovery...")
print(f"📊 Will process up to {MAX_DIOCESES_TO_PROCESS} dioceses")

# Fetch the work list for this run
dioceses_to_scan = get_dioceses_to_process(limit=MAX_DIOCESES_TO_PROCESS)

if dioceses_to_scan:
    total = len(dioceses_to_scan)
    banner = '=' * 60
    print(f"📋 Found {total} dioceses to process")

    # A browser is only started once there is work to do
    driver = setup_driver()

    if driver:
        results = []

        try:
            for i, diocese_info in enumerate(dioceses_to_scan, 1):
                print(f"\n{banner}")
                print(f"Processing diocese {i}/{total}")

                result = process_diocese_for_directory(diocese_info, driver)
                results.append(result)

                # Be polite to the sites: pause before every diocese but the first
                if i < total:
                    print(f"  ⏱️ Waiting {config.request_delay} seconds...")
                    time.sleep(config.request_delay)
        finally:
            # Always release the browser, even when the loop is interrupted
            driver.quit()
            print("\n🧹 WebDriver closed")

        # Overall statistics
        print(f"\n{banner}")
        print("📊 SUMMARY")
        print(f"{banner}")

        successful = len([r for r in results if r['success']])
        failed = len(results) - successful

        print(f"Total dioceses processed: {len(results)}")
        print(f"Successfully found directories: {successful}")
        print(f"Failed to find directories: {failed}")
        print(f"Success rate: {successful/len(results)*100:.1f}%")

        # Per-diocese breakdown
        print("\n📋 Detailed Results:")
        for result in results:
            status = "✅" if result['success'] else "❌"
            print(f"  {status} {result['diocese_name']}")
            if result['success']:
                print(f"      Directory: {result['directory_url']}")
            print(f"      Method: {result['method']}")
            if 'error' in result:
                print(f"      Error: {result['error']}")
            print()
    else:
        print("❌ Failed to setup WebDriver")
else:
    print("❌ No dioceses found to process")