<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_Using_AgenticAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import required code and install packages
!pip install supabase google-generativeai psycopg2-binary tenacity selenium webdriver-manager
!wget https://raw.githubusercontent.com/tomknightatl/USCCB/main/llm_utils.py

In [None]:
# Cell 2: Import required libraries and install Chrome
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import os
import time
import json
import random
import subprocess
from datetime import datetime
from urllib.parse import urlparse, urljoin
from google.colab import userdata
import google.generativeai as genai
from supabase import create_client, Client

# Import Selenium for JavaScript-heavy sites
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

print("All libraries imported successfully!")

# Chrome Installation for Google Colab
def ensure_chrome_installed():
    """Make sure a `google-chrome` binary is available, installing it if needed.

    Returns:
        True when Chrome is already on PATH, or when `google-chrome --version`
        succeeds after the apt-based install; False when that check fails or
        any exception is raised along the way.
    """
    # Shell steps run in order: refresh apt, trust Google's signing key,
    # register the Chrome repository, refresh again, then install.
    install_steps = (
        'apt-get update > /dev/null 2>&1',
        'wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null 2>&1',
        'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list',
        'apt-get update > /dev/null 2>&1',
        'apt-get install -y google-chrome-stable > /dev/null 2>&1',
    )

    try:
        lookup = subprocess.run(['which', 'google-chrome'], capture_output=True, text=True)
        if lookup.returncode == 0:
            print("✅ Chrome is already installed and available.")
            return True

        print("🔧 Chrome not found. Installing Chrome for Selenium...")
        for shell_command in install_steps:
            os.system(shell_command)

        # Confirm the binary actually runs before reporting success
        version_check = subprocess.run(['google-chrome', '--version'], capture_output=True, text=True)
        if version_check.returncode != 0:
            print("❌ Chrome installation may have failed.")
            return False

        print(f"✅ Chrome installed successfully: {version_check.stdout.strip()}")
        return True

    except Exception as e:
        print(f"❌ Error during Chrome installation: {e}")
        return False

# Run the installation check once at cell execution time.
# The result is stored in the module-level flag `chrome_ready`, which
# setup_webdriver() reads to decide whether Selenium can be started.
print("\n🔧 Checking Chrome installation...")
chrome_ready = ensure_chrome_installed()

# Tell the user whether Selenium-based scraping will be available
if chrome_ready:
    print("🚀 Ready to proceed with Selenium operations!")
else:
    print("⚠️ You may need to restart the runtime if Chrome installation failed.")

In [None]:
# Cell 3: Enhanced configuration and setup
# Defines the run-time switches and initializes the Supabase and Gemini
# clients. Each failure is reported with a message and the corresponding
# global (`supabase`, `GENAI_API_KEY`) is set to None rather than raising.
print("=== ENHANCED PARISH DATABASE BUILDER ===")
print("--- User Configurable Parameters & Setup ---")

# --- Processing Configuration ---
MAX_URLS_TO_PROCESS = 3  # Start small for testing
USE_SELENIUM = True  # Enable JavaScript rendering
SAVE_DEBUG_FILES = True  # Save scraped content for debugging
RETRY_FAILED_URLS = True  # Retry failed URLs with different methods

# Create debug directory (the Gemini step writes raw responses under debug_content/)
if SAVE_DEBUG_FILES:
    os.makedirs('debug_content', exist_ok=True)
    print("Debug directory created for saving scraped content")

print(f"Processing will be limited to {MAX_URLS_TO_PROCESS} URLs.")
print(f"JavaScript rendering: {'Enabled' if USE_SELENIUM else 'Disabled'}")
print(f"Debug mode: {'Enabled' if SAVE_DEBUG_FILES else 'Disabled'}")

# --- Supabase Configuration ---
# Credentials are read from Colab secrets
SUPABASE_URL = userdata.get('SUPABASE_URL')
SUPABASE_KEY = userdata.get('SUPABASE_KEY')

# `supabase` stays None unless the client is created successfully
supabase: Client = None
if SUPABASE_URL and SUPABASE_KEY:
    try:
        supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        print("✓ Supabase client initialized successfully")

        # Test connection and check table structure.
        # A failure here only warns; the client is kept.
        try:
            test_response = supabase.table('Parishes').select('*').limit(1).execute()
            print("✓ Parishes table accessible")
        except Exception as e:
            print(f"⚠ Warning: Could not access Parishes table: {e}")

    except Exception as e:
        print(f"✗ Error initializing Supabase client: {e}")
        supabase = None
else:
    print("✗ Supabase credentials not found in secrets")
    print("Required secrets: SUPABASE_URL, SUPABASE_KEY")

# --- GenAI Configuration ---
GENAI_API_KEY = userdata.get('GENAI_API_KEY_USCCB')

if GENAI_API_KEY:
    try:
        genai.configure(api_key=GENAI_API_KEY)
        # Test the API with one real generation request
        test_model = genai.GenerativeModel('gemini-1.5-flash')
        test_response = test_model.generate_content("Say 'API working'")
        print("✓ GenAI configured and tested successfully")
    except Exception as e:
        print(f"✗ Error configuring GenAI: {e}")
        # Clearing the key makes later code skip LLM processing
        GENAI_API_KEY = None
else:
    print("✗ GenAI API Key not found (Secret: GENAI_API_KEY_USCCB)")

# --- Enhanced Selenium WebDriver Setup ---
def setup_webdriver():
    """Create a headless Chrome WebDriver for rendering JavaScript-driven pages.

    Reads the module-level flags USE_SELENIUM and chrome_ready.

    Returns:
        A selenium Chrome driver, or None when Selenium is disabled, Chrome
        is not available, or neither the ChromeDriverManager-provided driver
        nor the system chromedriver could be started.
    """
    if not USE_SELENIUM:
        return None

    if not chrome_ready:
        print("⚠ Chrome not available - skipping Selenium setup")
        return None

    try:
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--window-size=1920,1080')
        chrome_options.add_argument('--remote-debugging-port=9222')
        chrome_options.add_argument('--disable-extensions')
        chrome_options.add_argument('--disable-plugins')
        chrome_options.add_argument('--disable-images')  # Speed up loading
        # JavaScript is intentionally left enabled: the driver exists to
        # render JavaScript-heavy parish directories, and nothing in this
        # notebook re-enabled it after the former '--disable-javascript' flag.

        # Try to use system Chrome first
        chrome_options.binary_location = '/usr/bin/google-chrome'

        # Set up ChromeDriver
        try:
            service = Service(ChromeDriverManager().install())
            driver = webdriver.Chrome(service=service, options=chrome_options)
            print("✓ Selenium WebDriver initialized with ChromeDriverManager")
            return driver
        except Exception as e:
            print(f"ChromeDriverManager failed: {e}")
            # Fallback to system chromedriver
            try:
                driver = webdriver.Chrome(options=chrome_options)
                print("✓ Selenium WebDriver initialized with system chromedriver")
                return driver
            except Exception as e2:
                print(f"System chromedriver also failed: {e2}")
                return None

    except Exception as e:
        print(f"⚠ Warning: Could not initialize Selenium: {e}")
        return None

# --- Data Retrieval from Supabase ---
# Builds `urls_to_process`: the parish-directory URLs this run will scrape.
urls_to_process = []
if supabase:
    try:
        print("\nFetching parish directory URLs...")
        # Only rows whose parish_directory_url is neither NULL nor empty
        response = supabase.table('DiocesesParishDirectory').select('parish_directory_url').not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

        if response.data:
            fetched_urls = [item['parish_directory_url'] for item in response.data if item['parish_directory_url']]
            print(f"Found {len(fetched_urls)} URLs in database")

            # Take a random sample when more URLs exist than the configured cap
            if MAX_URLS_TO_PROCESS and len(fetched_urls) > MAX_URLS_TO_PROCESS:
                urls_to_process = random.sample(fetched_urls, MAX_URLS_TO_PROCESS)
                print(f"Selected {len(urls_to_process)} URLs for processing")
            else:
                urls_to_process = fetched_urls
                print(f"Will process all {len(urls_to_process)} URLs")
        else:
            print("No parish directory URLs found")

    except Exception as e:
        print(f"Error fetching URLs: {e}")
        urls_to_process = []

# Fall back to a fixed set of test URLs when nothing could be fetched
# (no Supabase client, a query error, or no matching rows).
if not urls_to_process:
    print("\n⚠ No URLs to process - using test URLs")
    urls_to_process = [
        "https://www.dioceseoftyler.org/parishes/",
        "https://www.diopueblo.org/parishes",
        "http://www.miamiarch.org/CatholicDiocese.php"
    ]

# Echo the final work list, numbered from 1
print(f"\n📋 Ready to process {len(urls_to_process)} URLs")
for i, url in enumerate(urls_to_process, 1):
    print(f"  {i}. {url}")
print("--- Setup Complete ---\n")

In [None]:
# ROBUST Cell 4: Advanced pagination detection and navigation

import re
from urllib.parse import urljoin, urlparse, parse_qs, urlencode, urlunparse

def detect_pagination_advanced(soup, url):
    """Detect pagination controls on a parsed page using several heuristics.

    Args:
        soup: BeautifulSoup document for the page.
        url: URL the page was loaded from; relative links are resolved
            against it.

    Returns:
        dict with keys:
            has_pagination (bool): True when a pagination container was found.
            current_page (int): page number read from an element whose class
                contains "current"; defaults to 1.
            total_pages (int | None): highest page number among the links found.
            next_url (str | None): result of find_next_page_url().
            pagination_type: always None (never set by this function).
            base_url (str): the `url` argument.
            page_urls (list[tuple[int, str]]): unique (page number, absolute
                URL) pairs sorted by page number.
    """
    pagination_info = {
        'has_pagination': False,
        'current_page': 1,
        'total_pages': None,
        'next_url': None,
        'pagination_type': None,
        'base_url': url,
        'page_urls': []
    }

    # Strategy 1: Look for common pagination containers
    pagination_selectors = [
        '.pagination',
        '.pager',
        '.page-numbers',
        '.pagination-wrapper',
        '[class*="page"]',
        '[class*="pagination"]',
        '.nav-links'
    ]

    pagination_container = None
    for selector in pagination_selectors:
        containers = soup.select(selector)
        if containers:
            pagination_container = containers[0]
            break

    if pagination_container is None:
        # Strategy 2: Look for elements with page-related text and use the
        # first one that has a container-like ancestor
        page_elements = soup.find_all(['a', 'span', 'div'],
                                      string=re.compile(r'(next|previous|page\s*\d+)', re.I))
        for elem in page_elements:
            parent = elem.find_parent(['div', 'nav', 'ul', 'ol'])
            if parent:
                pagination_container = parent
                break

    if pagination_container is None:
        return pagination_info

    pagination_info['has_pagination'] = True

    # Strategy 3: Find all page links
    page_links = []

    # Numbered page links ("1", "2", ...)
    page_links.extend(pagination_container.find_all(
        'a', string=lambda text: text and text.strip().isdigit()))

    # Links with a page parameter in href
    page_links.extend(pagination_container.find_all(
        'a', href=re.compile(r'page=\d+', re.I)))

    # Links with s-push-url (like Toledo Diocese)
    page_links.extend(pagination_container.find_all(
        'a', attrs={'s-push-url': re.compile(r'page=\d+', re.I)}))

    # Extract page numbers and URLs. One link is often matched by more than
    # one of the searches above, so identical pairs are recorded only once.
    page_info = []
    seen_pages = set()
    for link in page_links:
        try:
            href = link.get('href') or link.get('s-push-url', '')
            text = link.get_text(strip=True)

            if text.isdigit():
                page_num = int(text)
            else:
                # Fall back to the page parameter in href / s-push-url
                page_match = re.search(r'page=(\d+)', href, re.I)
                if not page_match:
                    continue
                page_num = int(page_match.group(1))

            page_url = construct_full_url(href, url)
        except (ValueError, TypeError):
            continue

        if not page_url:
            continue

        entry = (page_num, page_url)
        if entry not in seen_pages:
            seen_pages.add(entry)
            page_info.append(entry)

    if page_info:
        # Sort by page number
        page_info.sort(key=lambda x: x[0])
        pagination_info['page_urls'] = page_info
        pagination_info['total_pages'] = max(p[0] for p in page_info)

    # Strategy 4: Find current page
    current_indicators = pagination_container.find_all(
        ['span', 'a'], class_=lambda x: x and 'current' in x.lower())
    if current_indicators:
        current_text = current_indicators[0].get_text(strip=True)
        try:
            if current_text.isdigit():
                pagination_info['current_page'] = int(current_text)
        except ValueError:
            # str.isdigit() accepts characters such as superscripts that
            # int() rejects; keep the default page number in that case.
            pass

    # Strategy 5: Find next page URL
    pagination_info['next_url'] = find_next_page_url(pagination_container, url, pagination_info)

    return pagination_info

def find_next_page_url(container, base_url, pagination_info):
    """Find the URL of the page that follows the current one.

    Strategies, in order:
      1. An <a> whose text contains "next" (href or s-push-url).
      2. An <a> wrapping a Font Awesome right-arrow icon.
      3. The entry in pagination_info['page_urls'] numbered current_page + 1.
      4. Set ``page=<current_page + 1>`` in the query string of `base_url`.

    Args:
        container: pagination container element (BeautifulSoup tag).
        base_url: URL of the page the container came from.
        pagination_info: dict that may carry 'current_page', 'page_urls'
            and 'total_pages'.

    Returns:
        The absolute next-page URL, or None when 'total_pages' is known and
        the current page is already the last one.
    """

    # Strategy 1: Look for "Next" text
    next_link = container.find('a', string=lambda text:
                               text and 'next' in text.lower())
    if next_link:
        # Accept s-push-url as well, like the arrow-icon strategy does
        href = next_link.get('href') or next_link.get('s-push-url')
        if href:
            return construct_full_url(href, base_url)

    # Strategy 2: Look for right arrow icons
    right_arrow_selectors = [
        'a i.fa-angle-right',
        'a i.fas.fa-angle-right',
        'a i.fa-chevron-right',
        'a i.fas.fa-chevron-right',
        'a .fa-angle-right',
        'a .fa-chevron-right'
    ]

    for selector in right_arrow_selectors:
        arrow_link = container.select_one(selector)
        if arrow_link:
            # Get the parent <a> tag
            a_tag = arrow_link.find_parent('a')
            if a_tag and (a_tag.get('href') or a_tag.get('s-push-url')):
                href = a_tag.get('href') or a_tag.get('s-push-url')
                return construct_full_url(href, base_url)

    # Strategy 3: Look for page number links (get next sequential page)
    current_page = pagination_info.get('current_page') or 1
    next_page_num = current_page + 1

    for page_num, page_url in pagination_info.get('page_urls') or []:
        if page_num == next_page_num:
            return page_url

    # Do not invent a URL beyond the last page the links told us about
    total_pages = pagination_info.get('total_pages')
    if total_pages and next_page_num > total_pages:
        return None

    # Strategy 4: Construct the next page URL by setting the page parameter.
    # Working on the parsed query (rather than testing for the literal
    # '?page=') handles 'page' in any position and keeps a '#fragment' last.
    parsed = urlparse(base_url)
    query_params = parse_qs(parsed.query, keep_blank_values=True)
    query_params['page'] = [str(next_page_num)]
    new_query = urlencode(query_params, doseq=True)
    return urlunparse((parsed.scheme, parsed.netloc, parsed.path,
                       parsed.params, new_query, parsed.fragment))

def construct_full_url(href, base_url):
    """Resolve a link target against the page it was found on.

    Returns None for an empty href. An href starting with "http" is returned
    unchanged. A query-only href ("?...") replaces the query of `base_url`
    while keeping its path, params and fragment. Anything else is resolved
    with urljoin.
    """
    if not href:
        return None

    if href.startswith('http'):
        return href

    if href.startswith('?'):
        # Swap in the new query; every other component comes from the base
        base_parts = urlparse(base_url)
        return urlunparse(base_parts._replace(query=href[1:]))

    # Root-relative and path-relative links are both handled by urljoin
    return urljoin(base_url, href)

def generate_page_urls(base_url, total_pages, current_page=1, max_pages=15):
    """Generate (page number, URL) pairs for a site paginated via ``page=N``.

    Args:
        base_url: URL of any page of the listing; an existing ``page``
            query parameter is replaced wherever it sits in the query string.
        total_pages: number of pages the listing has.
        current_page: accepted for interface compatibility; not used.
        max_pages: upper bound on how many URLs are generated (default 15).

    Returns:
        list of (page_num, page_url) tuples for pages 1..min(total_pages,
        max_pages); empty when total_pages is less than 1.
    """
    # Parse once: the base components are the same for every page
    parsed = urlparse(base_url)
    base_params = parse_qs(parsed.query, keep_blank_values=True)

    page_urls = []
    for page_num in range(1, min(total_pages, max_pages) + 1):
        # Setting the key on the parsed query avoids producing a second
        # 'page=' parameter for URLs such as '?a=1&page=3'
        query_params = dict(base_params)
        query_params['page'] = [str(page_num)]
        new_query = urlencode(query_params, doseq=True)
        page_url = urlunparse((parsed.scheme, parsed.netloc, parsed.path,
                               parsed.params, new_query, parsed.fragment))
        page_urls.append((page_num, page_url))

    return page_urls

def _visible_text_from_soup(soup):
    """Remove <script>/<style> elements from `soup` in place and return its visible text."""
    for tag in soup(["script", "style"]):
        tag.decompose()
    return ' '.join(s.strip() for s in soup.stripped_strings if s.strip())


def scrape_single_page_content_robust(url, driver):
    """Fetch one page and return its visible text.

    Selenium is tried first when `driver` is given; if it fails or yields
    500 characters or fewer, a plain `requests` GET is used instead.

    Args:
        url: page to fetch.
        driver: Selenium WebDriver, or None to use `requests` only.

    Returns:
        (text, soup, method): `method` is "selenium", "requests" or "failed".
        On failure text and soup are None. The returned soup has its
        script and style elements removed.
    """

    print(f"    📄 Scraping: {url}")

    # Try Selenium first for JavaScript-heavy sites
    if driver:
        try:
            driver.get(url)
            time.sleep(3)

            # Wait for content to load. A timeout is not fatal: whatever has
            # rendered so far is still used. Catching Exception (instead of a
            # bare except) lets KeyboardInterrupt stop the run.
            try:
                WebDriverWait(driver, 15).until(
                    lambda d: len(d.find_elements(By.TAG_NAME, "body")) > 0
                )
            except Exception as wait_error:
                print(f"    ⚠ Wait for page body failed for {url}: {wait_error}")

            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            text_content = _visible_text_from_soup(soup)

            if len(text_content) > 500:  # Good content found
                save_debug_content(url, page_source, "selenium_page")
                return text_content, soup, "selenium"

        except Exception as e:
            print(f"    ✗ Selenium failed for {url}: {e}")

    # Fallback to requests
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        text_content = _visible_text_from_soup(soup)

        save_debug_content(url, response.text, "requests_page")
        return text_content, soup, "requests"

    except Exception as e:
        print(f"    ✗ Requests failed for {url}: {e}")
        return None, None, "failed"

def scrape_all_pages_robust(base_url, driver, max_pages=15):
    """Scrape a listing and the pages it paginates to.

    The first page is fetched and analysed with detect_pagination_advanced().
    Further pages come from the detected page links, or from generated
    ``page=N`` URLs when only a page count is known. When neither exists,
    the "next" link is followed page by page, re-running pagination
    detection on every newly fetched page.

    Args:
        base_url: URL of the first page.
        driver: Selenium WebDriver or None (passed to the single-page scraper).
        max_pages: maximum number of pages to collect, first page included.

    Returns:
        list of dicts with keys 'page', 'url', 'content', 'length', in the
        order scraped; empty when the first page could not be fetched.
    """
    print(f"\n🔍 Starting robust multi-page extraction from: {base_url}")

    # Step 1: Get first page and analyze pagination
    content, soup, method = scrape_single_page_content_robust(base_url, driver)

    if not content or not soup:
        print(f"  ✗ Failed to get first page content")
        return []

    all_content = [{
        'page': 1,
        'url': base_url,
        'content': content,
        'length': len(content)
    }]
    # URLs already attempted, successful or not, so none is fetched twice
    visited_urls = {base_url}
    print(f"    ✓ Page 1 content extracted: {len(content)} characters ({method})")

    # Step 2: Analyze pagination
    pagination_info = detect_pagination_advanced(soup, base_url)

    if not pagination_info['has_pagination']:
        print(f"    🏁 No pagination detected")
        return all_content

    total_pages = pagination_info.get('total_pages')
    print(f"    🔢 Pagination detected! Total pages: {total_pages}")

    # Step 3: Decide where the page URLs come from
    if pagination_info.get('page_urls'):
        # Use detected page URLs
        page_urls = pagination_info['page_urls']
    elif total_pages:
        # Generate URLs based on pattern
        page_urls = generate_page_urls(base_url, total_pages)
    else:
        # No list available: follow "next" links one page at a time
        page_urls = None

    if page_urls is not None:
        # Step 4a: Scrape every listed page
        for page_num, page_url in page_urls:
            if page_num == 1 or page_url in visited_urls:
                continue  # Skip first page (already processed) or duplicates

            if len(all_content) >= max_pages:
                print(f"    🛑 Reached maximum pages limit ({max_pages})")
                break

            visited_urls.add(page_url)
            print(f"    📄 Processing page {page_num}: {page_url}")

            content, _page_soup, method = scrape_single_page_content_robust(page_url, driver)

            if content and len(content.strip()) > 100:
                all_content.append({
                    'page': page_num,
                    'url': page_url,
                    'content': content,
                    'length': len(content)
                })
                print(f"    ✓ Page {page_num} content extracted: {len(content)} characters ({method})")
            else:
                print(f"    ⚠ Page {page_num} had insufficient content")

            # Be respectful with delays
            time.sleep(2)
    else:
        # Step 4b: Follow the chain of next links. Each next link must be
        # read from the page just fetched; re-analysing the first page
        # would return the same link forever.
        next_url = pagination_info.get('next_url')
        page_num = 1
        while next_url and next_url not in visited_urls:
            if len(all_content) >= max_pages:
                print(f"    🛑 Reached maximum pages limit ({max_pages})")
                break

            page_num += 1
            page_url = next_url
            visited_urls.add(page_url)
            print(f"    📄 Processing page {page_num}: {page_url}")

            content, page_soup, method = scrape_single_page_content_robust(page_url, driver)

            if not content or not page_soup or len(content.strip()) <= 100:
                print(f"    ⚠ Page {page_num} had insufficient content")
                break

            # A guessed '?page=N' URL may be ignored by the server, which
            # then serves the same page again; stop instead of duplicating.
            if content == all_content[-1]['content']:
                print(f"    🏁 Page {page_num} repeats the previous page - stopping")
                break

            all_content.append({
                'page': page_num,
                'url': page_url,
                'content': content,
                'length': len(content)
            })
            print(f"    ✓ Page {page_num} content extracted: {len(content)} characters ({method})")

            # Be respectful with delays
            time.sleep(2)

            next_url = detect_pagination_advanced(page_soup, page_url).get('next_url')

    print(f"  📊 Total pages processed: {len(all_content)}")
    print(f"  📊 Total content blocks: {len(all_content)}")

    return all_content

def enhanced_content_extraction_robust(url, driver=None, max_pages=None):
    """Extract the text of a parish directory, following pagination.

    Args:
        url: directory URL to scrape.
        driver: optional Selenium WebDriver.
        max_pages: maximum number of pages to scrape. When None, the
            module-level MAX_PAGES_PER_SITE is used if it exists, else 15.

    Returns:
        The page text (single page, cut to 50,000 characters), the combined
        text of all pages (cut to 100,000 characters), or None when nothing
        could be scraped.
    """
    print(f"\n🔍 Robust content extraction from: {url}")

    if max_pages is None:
        # NOTE(review): MAX_PAGES_PER_SITE is not set in the configuration
        # cell; fall back to the scraper's own default if it is undefined.
        try:
            max_pages = MAX_PAGES_PER_SITE
        except NameError:
            max_pages = 15

    # Get all pages using robust method
    page_contents = scrape_all_pages_robust(url, driver, max_pages)

    if not page_contents:
        print(f"  ✗ No content extracted")
        return None

    if len(page_contents) == 1:
        # Single page
        content = page_contents[0]['content']
        print(f"  📄 Single-page extraction: {len(content):,} characters")
        return content[:50000]  # Limit for API

    # Multi-page - combine all content
    combined_content = combine_page_contents(page_contents)

    if not combined_content:
        # Fallback to first page
        content = page_contents[0]['content']
        print(f"  ⚠ Multi-page combination failed, using first page: {len(content):,} characters")
        return content[:50000]

    # Limit total content length for API processing
    max_content_length = 100000
    if len(combined_content) > max_content_length:
        print(f"  ✂️ Truncating content from {len(combined_content):,} to {max_content_length:,} characters")
        combined_content = combined_content[:max_content_length]

    print(f"  🎯 Multi-page extraction complete: {len(combined_content):,} characters")
    return combined_content

# Load banner: confirms the cell ran and lists what it provides
print("Robust pagination detection and navigation loaded!")
print("Enhanced features:")
print("  🔍 Advanced pagination pattern detection")
print("  🔗 Query parameter URL construction (?page=2)")
print("  📄 Multiple link detection strategies")
print("  🛡️ Robust error handling and fallbacks")
print("  📊 Smart page limit management")

In [None]:
# Enhanced Cell 5: Improved Gemini processing for multi-page content

def create_enhanced_pagination_prompt(url, content):
    """Build the Gemini prompt used for directory content spanning several pages.

    The prompt embeds `url`, the extraction and de-duplication rules, the
    expected JSON schema, and at most the first 90,000 characters of
    `content`.
    """
    # Keep the prompt within bounds regardless of how much was scraped
    content_excerpt = content[:90000]

    return f"""
You are an expert at extracting Catholic parish information from web content that may span multiple pages.

IMPORTANT INSTRUCTIONS:
1. The URL {url} contains a parish directory or parish listing that may span multiple pages
2. The content below may contain data from multiple pages, marked with "=== PAGE X ===" headers
3. Extract information about ALL Catholic parishes found across ALL pages
4. Do NOT duplicate parishes that appear on multiple pages - each parish should only appear once in your output
5. Look for parishes, churches, missions, chapels, and Catholic communities
6. Return ONLY valid JSON - no explanatory text before or after

EXPECTED OUTPUT FORMAT:
Return a JSON array with ALL parishes found across all pages:
[
  {{
    "Name": "Parish Name",
    "Status": "Parish/Mission/Chapel/Cathedral",
    "Deanery": "Deanery Name",
    "EST": "Established Year",
    "Street Address": "Street Address",
    "City": "City",
    "State": "State",
    "Zipcode": "Zipcode",
    "Phone Number": "Phone",
    "Website": "URL"
  }}
]

DEDUPLICATION RULES:
- If the same parish name appears multiple times, only include it once
- Use the most complete information available if duplicates exist
- Parish names like "St. Mary" in different cities are DIFFERENT parishes

Use null for missing values. Extract phone numbers, websites, and addresses carefully.

MULTI-PAGE CONTENT:
{content_excerpt}
"""

def _strip_markdown_fences(text):
    """Return `text` without a surrounding ```json ... ``` (or ``` ... ```) fence."""
    cleaned = text.strip()
    if cleaned.startswith("```json"):
        cleaned = cleaned[7:]
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _dedupe_parishes(parishes):
    """Drop non-dict entries and repeated (Name, City) pairs; return (unique, duplicates_removed)."""
    unique_parishes = []
    seen_parishes = set()
    duplicates_removed = 0

    for parish in parishes:
        if not isinstance(parish, dict):
            # One malformed entry must not discard the whole extraction
            print(f"    ⚠ Skipped non-object entry: {parish!r}")
            continue

        # A tuple key cannot collide the way a "Name-City" string can
        # (e.g. "A-B" + "C" versus "A" + "B-C").
        parish_key = (str(parish.get('Name', '')), str(parish.get('City', '')))
        if parish_key in seen_parishes:
            duplicates_removed += 1
            print(f"    🔄 Skipped duplicate: {parish.get('Name', 'Unknown')}")
            continue

        seen_parishes.add(parish_key)
        unique_parishes.append(parish)

    return unique_parishes, duplicates_removed


def process_url_with_enhanced_gemini_pagination(url, content):
    """Send scraped directory content to Gemini and parse the parishes it returns.

    Content containing "=== PAGE" markers uses the multi-page prompt; other
    content uses create_enhanced_prompt().

    Args:
        url: source URL, used in the prompt and in debug file names.
        content: scraped text to analyse.

    Returns:
        list of parish dicts, de-duplicated on (Name, City), possibly empty;
        or None when there is no content, GenAI is not configured, the model
        call fails, or the response is not a JSON array/object.
    """

    if not content:
        print(f"  ✗ No content to process for {url}")
        return None

    if not GENAI_API_KEY:
        print(f"  ✗ GenAI not configured - skipping LLM processing")
        return None

    try:
        print(f"  🤖 Processing multi-page content with Gemini... ({len(content):,} chars)")

        # Use the enhanced prompt for multi-page content
        if "=== PAGE" in content:
            prompt = create_enhanced_pagination_prompt(url, content)
        else:
            # Fall back to original prompt for single-page content
            prompt = create_enhanced_prompt(url, content)

        response_text = invoke_gemini_model(prompt_text=prompt, model_name="gemini-1.5-flash")

        if not response_text:
            print(f"  ✗ Gemini returned an empty response")
            return None

        print(f"  📝 Gemini response length: {len(response_text)} characters")

        # Save raw response for debugging
        if SAVE_DEBUG_FILES:
            domain = extract_domain(url).replace('.', '_')
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            response_file = f"debug_content/{domain}_gemini_multipage_response_{timestamp}.json"
            with open(response_file, 'w', encoding='utf-8') as f:
                f.write(response_text)
            print(f"  📄 Raw response saved: {response_file}")

        # Models often wrap JSON in a markdown code fence
        cleaned_response = _strip_markdown_fences(response_text)

        # Parse JSON
        try:
            parsed_data = json.loads(cleaned_response)
        except json.JSONDecodeError as e:
            print(f"  ✗ JSON parsing failed: {e}")
            print(f"  📝 Response preview: {cleaned_response[:200]}...")
            return None

        # Handle both single objects and arrays
        if isinstance(parsed_data, list):
            print(f"  ✓ Extracted {len(parsed_data)} parishes from multi-page content")

            unique_parishes, duplicates_removed = _dedupe_parishes(parsed_data)
            if duplicates_removed:
                print(f"  🧹 Removed {duplicates_removed} duplicates")

            return unique_parishes

        if isinstance(parsed_data, dict):
            print(f"  ✓ Extracted 1 parish: {parsed_data.get('Name', 'Unknown')}")
            return [parsed_data]  # Convert to array for consistent handling

        print(f"  ⚠ Unexpected data type: {type(parsed_data)}")
        return None

    except Exception as e:
        print(f"  ✗ Gemini processing failed: {e}")
        return None

def analyze_pagination_results(page_contents):
    """Print a size summary for scraped pages.

    Does nothing for an empty or None `page_contents`. Each entry is expected
    to carry the keys 'page', 'length' and 'url'.
    """
    if not page_contents:
        return

    print("\n📊 PAGINATION ANALYSIS:")
    print(f"  Total pages processed: {len(page_contents)}")

    # Overall size across every page
    total_chars = 0
    for entry in page_contents:
        total_chars += entry['length']
    print(f"  Total content length: {total_chars:,} characters")

    # One line per page
    for entry in page_contents:
        number, size, address = entry['page'], entry['length'], entry['url']
        print(f"    Page {number}: {size:,} chars - {address}")

# Modified main processing function to use pagination
def process_single_url_with_pagination(url, driver, attempt=1, max_attempts=2, max_pages=10):
    """Scrape one directory URL, extract its parishes with Gemini and save them.

    Args:
        url: parish directory URL.
        driver: Selenium WebDriver or None.
        attempt: 1-based number of the current attempt.
        max_attempts: attempts allowed when RETRY_FAILED_URLS is set.
        max_pages: page limit passed to the content extractor.

    Returns:
        dict with 'url', 'status', 'reason' and 'parishes_found'. Status is
        one of 'failed' (too little content), 'no_data' (Gemini extraction
        failed), 'directory_page' (Gemini answered but listed no parishes),
        'success', 'db_failed' or 'error'. Successful runs also carry
        'parishes_saved' and 'extraction_method'.
    """

    print(f"\n{'='*60}")
    print(f"🔄 Processing URL with Pagination (Attempt {attempt}/{max_attempts}): {url}")
    print(f"{'='*60}")

    try:
        # Step 1: Extract content with pagination support
        # NOTE(review): enhanced_content_extraction_with_pagination is not
        # defined in the cells shown here (the pagination cell defines
        # enhanced_content_extraction_robust) - confirm it exists elsewhere.
        content = enhanced_content_extraction_with_pagination(url, driver, max_pages)

        if not content or len(content.strip()) < 100:
            print(f"  ✗ Insufficient content extracted ({len(content) if content else 0} chars)")

            if attempt < max_attempts and RETRY_FAILED_URLS:
                print(f"  🔄 Retrying with different approach...")
                time.sleep(5)
                return process_single_url_with_pagination(url, driver, attempt + 1, max_attempts, max_pages)
            else:
                return {
                    'url': url,
                    'status': 'failed',
                    'reason': 'insufficient_content',
                    'parishes_found': 0
                }

        # Step 2: Process with Enhanced Gemini (pagination-aware)
        parish_data_list = process_url_with_enhanced_gemini_pagination(url, content)

        # None means the extraction itself failed. An empty list is a valid
        # answer and is handled separately below; testing `not ...` here
        # would swallow it and make the directory-page branch unreachable.
        if parish_data_list is None:
            print(f"  ✗ No parish data extracted by Gemini")
            return {
                'url': url,
                'status': 'no_data',
                'reason': 'gemini_extraction_failed',
                'parishes_found': 0
            }

        if not parish_data_list:
            print(f"  ℹ️ Page appears to be a directory/landing page with no specific parish data")
            return {
                'url': url,
                'status': 'directory_page',
                'reason': 'no_parish_details',
                'parishes_found': 0
            }

        # Step 3: Save to database using the exact column mapping
        success = safe_upsert_to_supabase_exact(parish_data_list, url)

        print(f"  🎯 Processing complete for {url}")
        print(f"     Found: {len(parish_data_list)} parishes")
        print(f"     Database: {'Success' if success else 'Failed'}")

        return {
            'url': url,
            'status': 'success' if success else 'db_failed',
            'reason': 'completed',
            'parishes_found': len(parish_data_list),
            'parishes_saved': success,
            'extraction_method': 'pagination'
        }

    except Exception as e:
        print(f"  ✗ Exception processing {url}: {e}")

        if attempt < max_attempts and RETRY_FAILED_URLS:
            print(f"  🔄 Retrying due to exception...")
            time.sleep(5)
            return process_single_url_with_pagination(url, driver, attempt + 1, max_attempts, max_pages)
        else:
            return {
                'url': url,
                'status': 'error',
                'reason': str(e),
                'parishes_found': 0
            }

# Load banner: confirms the cell ran and lists what it provides
print("Enhanced Gemini processing with pagination support loaded!")
print("New features:")
print("  🔢 Multi-page content processing")
print("  🧹 Automatic duplicate removal")
print("  📊 Enhanced pagination analysis")
print("  🔍 Improved parish extraction accuracy")

In [None]:
# Cell 6: Database operations with exact column mapping

def validate_parish_data(parish_data, source_url):
    """Validate and clean one extracted parish record before database insertion.

    Args:
        parish_data: Record to validate; anything that is not a dict is rejected.
        source_url: URL the record was extracted from.

    Returns:
        None when ``parish_data`` is not a dict. Otherwise a new dict holding
        every expected parish field (missing fields become None) plus the
        metadata keys 'source_url', 'domain' and 'processed_at'.
    """

    if not isinstance(parish_data, dict):
        return None

    # Ensure all required fields exist
    required_fields = ['Name', 'Status', 'Deanery', 'EST', 'Street Address',
                       'City', 'State', 'Zipcode', 'Phone Number', 'Website']

    validated_data = {}

    for field in required_fields:
        value = parish_data.get(field)
        if isinstance(value, str):
            # Trim surrounding whitespace and treat blank strings and any
            # casing of "null" as missing, so values such as " " or "Null"
            # are not mistaken for real data further down the pipeline.
            value = value.strip()
            if value.lower() in ("", "null"):
                value = None
        validated_data[field] = value

    # Add metadata
    validated_data['source_url'] = source_url
    validated_data['domain'] = extract_domain(source_url)
    validated_data['processed_at'] = datetime.now().isoformat()

    return validated_data

def prepare_for_supabase_exact(parish_data):
    """Build the row dict for the Supabase table from a parish record.

    Each output key is a table column name; each value is looked up in
    ``parish_data`` with ``.get`` and is None when the source key is absent.
    Two fields are renamed ('Zipcode' -> 'Zip Code', 'Website' -> 'Web');
    'EST' is not copied because the table has no such column, and ``id`` /
    ``created_at`` are left for Supabase to generate.
    """

    # (table column, key in the extracted record), in output order
    column_sources = (
        ('Name', 'Name'),
        ('Status', 'Status'),
        ('Deanery', 'Deanery'),
        ('Street Address', 'Street Address'),
        ('City', 'City'),
        ('State', 'State'),
        ('Zip Code', 'Zipcode'),
        ('Phone Number', 'Phone Number'),
        ('Web', 'Website'),
    )

    return {
        column: parish_data.get(source_key)
        for column, source_key in column_sources
    }

def safe_upsert_to_supabase_exact(parish_data_list, source_url):
    """Insert parish records into the Supabase 'Parishes' table, one by one.

    Despite the name, this performs a plain ``insert`` per record rather than
    an upsert. Each record is validated with ``validate_parish_data``, mapped
    to the table's column names with ``prepare_for_supabase_exact``, stripped
    of None values and inserted individually. A failure on one record is
    printed and does not stop the remaining records.

    Args:
        parish_data_list: List of extracted parish dicts.
        source_url: URL the records came from (passed through to validation).

    Returns:
        True if at least one record was inserted; False otherwise, including
        when the Supabase client is unavailable or the list is empty.
    """

    if not supabase:
        print(f"  ✗ Supabase not available - skipping database write")
        return False

    if not parish_data_list:
        print(f"  ⚠ No parish data to save")
        return False

    print(f"  📋 Using exact column mapping for {len(parish_data_list)} parishes")

    # A record with none of these populated is considered empty and skipped.
    key_fields = ('Name', 'Street Address', 'City', 'Phone Number')
    success_count = 0

    for position, parish_data in enumerate(parish_data_list, 1):
        try:
            # Validate data
            validated_data = validate_parish_data(parish_data, source_url)
            if not validated_data:
                print(f"    ⚠ Skipping invalid parish data #{position}")
                continue

            # Prepare for Supabase with exact column mapping
            supabase_data = prepare_for_supabase_exact(validated_data)

            # Skip if no meaningful data (all key fields are null)
            if all(supabase_data.get(field) is None for field in key_fields):
                print(f"    ⚠ Skipping parish #{position} - no meaningful data")
                continue

            # The mapped row always contains a 'Name' key (possibly None), so a
            # dict.get default would never apply; fall back with `or` instead
            # so log lines never read "Saved: None".
            parish_name = supabase_data.get('Name') or f'Parish_{position}'

            # Remove any None values to avoid issues
            clean_data = {k: v for k, v in supabase_data.items() if v is not None}

            # Simple insert (no upsert since we don't have a unique constraint)
            try:
                response = supabase.table('Parishes').insert(clean_data).execute()

                db_error_info = getattr(response, 'error', None)
                if db_error_info:
                    print(f"    ✗ Database error for {parish_name}: {db_error_info}")
                else:
                    print(f"    ✓ Saved: {parish_name}")
                    success_count += 1

            except Exception as db_error:
                print(f"    ✗ Database exception for {parish_name}: {db_error}")

        except Exception as e:
            print(f"    ✗ Processing error for parish #{position}: {e}")

    print(f"  📊 Successfully saved {success_count}/{len(parish_data_list)} parishes")
    return success_count > 0

# Announce that the mapping helpers are defined and summarise the column mapping
# (informational output only; nothing is exercised here).
for mapping_line in (
    "✅ Exact column mapping loaded!",
    "📋 Column mapping:",
    "   AI 'Zipcode' -> DB 'Zip Code'",
    "   AI 'Website' -> DB 'Web'",
    "   AI 'EST' -> Skipped (column doesn't exist)",
    "   All other fields map directly",
):
    print(mapping_line)

In [None]:
# ROBUST Cell 7: Main processing with advanced pagination

# Configuration
# Intended cap on pages fetched per site (per its name). Within this cell the
# value is only echoed in the startup banner.
# NOTE(review): confirm the extraction helpers defined elsewhere actually read it.
MAX_PAGES_PER_SITE = 15
# Pagination on/off switch (per its name); likewise only printed in this cell.
# TODO confirm it is honoured by the extraction helpers.
ENABLE_PAGINATION = True

def process_single_url_robust(url, driver, attempt=1, max_attempts=2):
    """ROBUST: Process a single URL with advanced pagination support.

    Extracts page content, runs the matching Gemini extraction helper
    (multi-page when the content contains "=== PAGE" markers, single-page
    otherwise) and saves the resulting parishes to the database. When content
    is insufficient or an exception occurs, the call retries itself after a
    5 second pause while ``attempt < max_attempts`` and ``RETRY_FAILED_URLS``
    is truthy.

    Args:
        url: Page to process.
        driver: WebDriver instance handed to the content extraction helper.
        attempt: 1-based number of the current attempt.
        max_attempts: Maximum number of attempts for this URL.

    Returns:
        A dict with 'url', 'status', 'reason' and 'parishes_found'. 'status'
        is one of 'failed', 'no_data', 'directory_page', 'success',
        'db_failed' or 'error'. Runs that reach the database step also carry
        'parishes_saved' and 'is_multi_page'.
    """

    print(f"\n{'='*60}")
    print(f"🔄 ROBUST Processing (Attempt {attempt}/{max_attempts}): {url}")
    print(f"{'='*60}")

    try:
        # Step 1: Extract content using the ROBUST function
        content = enhanced_content_extraction_robust(url, driver)

        if not content or len(content.strip()) < 100:
            print(f"  ✗ Insufficient content extracted ({len(content) if content else 0} chars)")

            if attempt < max_attempts and RETRY_FAILED_URLS:
                print(f"  🔄 Retrying with different approach...")
                time.sleep(5)
                return process_single_url_robust(url, driver, attempt + 1, max_attempts)
            else:
                return {
                    'url': url,
                    'status': 'failed',
                    'reason': 'insufficient_content',
                    'parishes_found': 0
                }

        # Step 2: Process with appropriate Gemini function
        # Page markers in the combined content signal a multi-page extraction.
        is_multi_page = "=== PAGE" in content
        if is_multi_page:
            print(f"  🔢 Multi-page content detected")
            parish_data_list = process_url_with_enhanced_gemini_pagination(url, content)
        else:
            print(f"  📄 Single-page content detected")
            parish_data_list = process_url_with_enhanced_gemini(url, content)

        # Distinguish "extraction failed" (None) from "extraction ran but found
        # nothing" (empty result). Testing plain falsiness here made the
        # directory-page branch below unreachable.
        # NOTE(review): assumes the Gemini helpers return None on failure and an
        # empty list when the page holds no parishes - confirm against them.
        if parish_data_list is None:
            print(f"  ✗ No parish data extracted by Gemini")
            return {
                'url': url,
                'status': 'no_data',
                'reason': 'gemini_extraction_failed',
                'parishes_found': 0
            }

        # Step 3: Save to database
        if not parish_data_list:
            print(f"  ℹ️ Page appears to be a directory/landing page with no specific parish data")
            return {
                'url': url,
                'status': 'directory_page',
                'reason': 'no_parish_details',
                'parishes_found': 0
            }

        # Use the exact mapping function
        success = safe_upsert_to_supabase_exact(parish_data_list, url)

        print(f"  🎯 Processing complete for {url}")
        print(f"     Found: {len(parish_data_list)} parishes")
        print(f"     Database: {'Success' if success else 'Failed'}")
        print(f"     Method: {'Multi-page' if is_multi_page else 'Single-page'}")

        return {
            'url': url,
            'status': 'success' if success else 'db_failed',
            'reason': 'completed',
            'parishes_found': len(parish_data_list),
            'parishes_saved': success,
            'is_multi_page': is_multi_page
        }

    except Exception as e:
        print(f"  ✗ Exception processing {url}: {e}")

        if attempt < max_attempts and RETRY_FAILED_URLS:
            print(f"  🔄 Retrying due to exception...")
            time.sleep(5)
            return process_single_url_robust(url, driver, attempt + 1, max_attempts)
        else:
            return {
                'url': url,
                'status': 'error',
                'reason': str(e),
                'parishes_found': 0
            }

# Initialize WebDriver
print(f"\n🚀 Starting ROBUST parish processing with advanced pagination...")
print(f"📊 Configuration:")
print(f"   Max pages per site: {MAX_PAGES_PER_SITE}")
print(f"   Pagination enabled: {ENABLE_PAGINATION}")
print(f"   Max URLs to process: {MAX_URLS_TO_PROCESS}")
print(f"   Debug mode: {SAVE_DEBUG_FILES}")

driver = setup_webdriver()

# Track results
results = []                # one result dict per processed URL, in order
total_parishes_found = 0    # sum of 'parishes_found' over all results
successful_urls = 0         # results whose status is 'success'
multi_page_count = 0        # results flagged 'is_multi_page'

try:
    for i, url in enumerate(urls_to_process, 1):
        print(f"\n\n📍 URL {i}/{len(urls_to_process)}")

        result = process_single_url_robust(url, driver)
        results.append(result)

        total_parishes_found += result.get('parishes_found', 0)
        if result.get('status') == 'success':
            successful_urls += 1

        if result.get('is_multi_page', False):
            multi_page_count += 1

        # Add delay between requests to be respectful
        if i < len(urls_to_process):
            print(f"\n⏳ Waiting 3 seconds before next URL...")
            time.sleep(3)

finally:
    # Clean up WebDriver even if processing was interrupted
    if driver:
        try:
            driver.quit()
            print("\n🧹 WebDriver closed")
        except Exception as quit_error:
            # Cleanup stays best-effort, but report the failure instead of
            # hiding it; a bare `except:` would also swallow
            # KeyboardInterrupt and SystemExit.
            print(f"\n⚠ WebDriver cleanup failed: {quit_error}")

# Print enhanced summary
print(f"\n\n{'='*60}")
print(f"📊 ROBUST PROCESSING SUMMARY")
print(f"{'='*60}")
print(f"URLs processed: {len(results)}")
print(f"Successful URLs: {successful_urls}")
print(f"Multi-page sites detected: {multi_page_count}")
print(f"Total parishes found: {total_parishes_found}")
if successful_urls > 0:
    # total_parishes_found also counts parishes from URLs whose database write
    # failed ('db_failed'), so average over the successful results only.
    parishes_from_successful_urls = sum(
        r.get('parishes_found', 0) for r in results if r.get('status') == 'success'
    )
    print(f"Average parishes per successful URL: {parishes_from_successful_urls/successful_urls:.1f}")

# Status -> marker shown in the per-URL listing; unknown statuses get '❓'.
STATUS_EMOJI_BY_STATUS = {
    'success': '✅',
    'directory_page': '📁',
    'no_data': '❌',
    'failed': '❌',
    'error': '💥',
    'db_failed': '⚠️'
}

print(f"\nDetailed results:")
for result in results:
    status_emoji = STATUS_EMOJI_BY_STATUS.get(result['status'], '❓')

    multi_page_indicator = " [MULTI-PAGE]" if result.get('is_multi_page', False) else ""

    print(f"  {status_emoji} {result['url']}{multi_page_indicator}")
    print(f"     Status: {result['status']} | Parishes: {result['parishes_found']} | Reason: {result['reason']}")

print(f"\n🎉 ROBUST processing complete!")
if SAVE_DEBUG_FILES:
    print(f"📄 Debug files saved in 'debug_content/' folder")

if multi_page_count > 0:
    print(f"🔢 Successfully processed {multi_page_count} multi-page sites")

print(f"{'='*60}")

print("\n✅ ROBUST enhancements:")
print("  🔍 Advanced pagination pattern detection")
print("  🔗 Query parameter URL construction (?page=2)")
print("  📄 Multiple link detection strategies")
print("  🛡️ Comprehensive error handling")
print("  📊 Smart content combination")

# Test function to verify pagination detection
def test_pagination_detection():
    """Smoke-test pagination detection against one of the configured URLs.

    Loads the page in a short-lived WebDriver session, parses it with
    BeautifulSoup and prints what ``detect_pagination_advanced`` reports.
    The function only prints and returns None. Errors while loading or
    analysing the page are reported as "Test failed" rather than raised.
    """
    print(f"\n🧪 Testing pagination detection...")

    if not urls_to_process:
        print(f"  Cannot test - no driver or URLs available")
        return

    # Prefer the second URL (Toledo Diocese in the original list) but fall back
    # to the first so a single-URL run does not raise IndexError.
    test_url = urls_to_process[1] if len(urls_to_process) > 1 else urls_to_process[0]

    # The module-level `driver` has already been quit by the main loop's
    # `finally` block when this cell-level call runs, so open a dedicated
    # session here and close it afterwards.
    test_driver = None
    try:
        test_driver = setup_webdriver()
        if not test_driver:
            # NOTE(review): assumes setup_webdriver may return a falsy value on
            # failure, as the main loop's `if driver:` guard suggests.
            print(f"  Cannot test - no driver or URLs available")
            return

        print(f"Testing with: {test_url}")

        test_driver.get(test_url)
        time.sleep(3)
        soup = BeautifulSoup(test_driver.page_source, 'html.parser')

        pagination_info = detect_pagination_advanced(soup, test_url)

        print(f"  Has pagination: {pagination_info['has_pagination']}")
        print(f"  Total pages: {pagination_info.get('total_pages')}")
        print(f"  Current page: {pagination_info.get('current_page')}")
        print(f"  Next URL: {pagination_info.get('next_url')}")

        if pagination_info.get('page_urls'):
            print(f"  Found {len(pagination_info['page_urls'])} page URLs:")
            for page_num, page_url in pagination_info['page_urls'][:5]:  # Show first 5
                print(f"    Page {page_num}: {page_url}")

    except Exception as e:
        print(f"  Test failed: {e}")
    finally:
        if test_driver:
            try:
                test_driver.quit()
            except Exception as quit_error:
                print(f"  ⚠ WebDriver cleanup failed: {quit_error}")

# Run the test
test_pagination_detection()

In [None]:
# # Cell 8: Fixed analysis and debugging tools

# def analyze_debug_files():
#     """Analyze saved debug files to understand extraction issues"""

#     if not SAVE_DEBUG_FILES or not os.path.exists('debug_content'):
#         print("No debug files to analyze")
#         return

#     debug_files = [f for f in os.listdir('debug_content') if f.endswith('.html')]

#     if not debug_files:
#         print("No HTML debug files found")
#         return

#     print(f"\n🔍 Analyzing {len(debug_files)} debug files...")

#     for file in debug_files:  # Analyze all files
#         file_path = f"debug_content/{file}"

#         try:
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 content = f.read()

#             # Basic analysis
#             soup = BeautifulSoup(content, 'html.parser')

#             # Count potential parish indicators
#             parish_keywords = ['parish', 'church', 'cathedral', 'mission', 'chapel', 'catholic']
#             phone_patterns = content.count('(') + content.count('-')
#             address_indicators = content.lower().count('street') + content.lower().count('avenue') + content.lower().count('road') + content.lower().count('drive')

#             keyword_count = sum(content.lower().count(keyword) for keyword in parish_keywords)

#             print(f"\n📄 {file}:")
#             print(f"   Content length: {len(content):,} chars")
#             print(f"   Parish keywords: {keyword_count}")
#             print(f"   Phone indicators: {phone_patterns}")
#             print(f"   Address indicators: {address_indicators}")
#             print(f"   Links found: {len(soup.find_all('a'))}")

#             # Get clean text content
#             for script in soup(["script", "style"]):
#                 script.decompose()
#             text_content = ' '.join([s.strip() for s in soup.stripped_strings if s.strip()])

#             print(f"   Clean text length: {len(text_content):,} chars")
#             print(f"   Sample text: {text_content[:200]}...")

#             # Look for specific parish data patterns
#             if 'phone' in text_content.lower() or 'address' in text_content.lower():
#                 print(f"   ✓ Contains potential contact info")
#             if any(keyword in text_content.lower() for keyword in parish_keywords):
#                 print(f"   ✓ Contains parish-related keywords")

#         except Exception as e:
#             print(f"   Error analyzing {file}: {e}")

# def show_gemini_responses():
#     """Show Gemini response files for debugging"""

#     if not SAVE_DEBUG_FILES or not os.path.exists('debug_content'):
#         print("No debug files available")
#         return

#     response_files = [f for f in os.listdir('debug_content') if 'gemini_response' in f]

#     if not response_files:
#         print("No Gemini response files found")
#         return

#     print(f"\n🤖 Found {len(response_files)} Gemini response files:")

#     for file in response_files:
#         file_path = f"debug_content/{file}"
#         try:
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 content = f.read()

#             print(f"\n📄 {file}:")
#             print(f"   Length: {len(content)} chars")
#             print(f"   Content: '{content}'")

#             if content.strip() == "[]":
#                 print(f"   ⚠ Empty array - Gemini found no parish data")
#             elif len(content.strip()) < 10:
#                 print(f"   ⚠ Very short response - possible error")

#         except Exception as e:
#             print(f"   Error reading {file}: {e}")

# def test_gemini_with_sample():
#     """Test Gemini with a sample parish text to see if it's working"""

#     if not GENAI_API_KEY:
#         print("❌ GenAI not configured")
#         return

#     sample_text = """
#     St. Mary's Catholic Church
#     123 Main Street
#     Anytown, TX 75001
#     Phone: (555) 123-4567
#     Website: www.stmarys.org

#     Our Lady of Guadalupe Parish
#     456 Oak Avenue
#     Somewhere, TX 75002
#     Phone: (555) 987-6543
#     Website: www.olg.org
#     """

#     print("🧪 Testing Gemini with sample parish data...")

#     try:
#         from llm_utils import invoke_gemini_model

#         prompt = f"""
# Extract parish information from this text. Return valid JSON only:

# {sample_text}

# Format as array of objects with keys: Name, Status, Deanery, EST, Street Address, City, State, Zipcode, Phone Number, Website
# """

#         response = invoke_gemini_model(prompt_text=prompt)
#         print(f"\n📝 Gemini response:")
#         print(f"Length: {len(response)} chars")
#         print(f"Content: {response}")

#         # Try to parse
#         try:
#             import json
#             parsed = json.loads(response.strip())
#             print(f"✓ Valid JSON parsed successfully")
#             print(f"✓ Found {len(parsed)} items")
#         except:
#             print(f"✗ Invalid JSON response")

#     except Exception as e:
#         print(f"❌ Error testing Gemini: {e}")

# def inspect_actual_content():
#     """Look at the actual content being sent to Gemini"""

#     if not SAVE_DEBUG_FILES or not os.path.exists('debug_content'):
#         print("No debug files available")
#         return

#     debug_files = [f for f in os.listdir('debug_content') if f.endswith('.html')]

#     for file in debug_files[:2]:  # Check first 2 files
#         file_path = f"debug_content/{file}"

#         try:
#             with open(file_path, 'r', encoding='utf-8') as f:
#                 content = f.read()

#             # Extract the same way our script does
#             soup = BeautifulSoup(content, 'html.parser')
#             for script in soup(["script", "style"]):
#                 script.decompose()

#             text_content = ' '.join([s.strip() for s in soup.stripped_strings if s.strip()])

#             print(f"\n📄 Content analysis for {file}:")
#             print(f"Raw HTML length: {len(content):,} chars")
#             print(f"Processed text length: {len(text_content):,} chars")
#             print(f"\nFirst 500 chars of processed text:")
#             print(f"'{text_content[:500]}'")
#             print(f"\nLast 500 chars of processed text:")
#             print(f"'{text_content[-500:]}'")

#             # Check if it looks like a parish directory
#             has_parish_words = any(word in text_content.lower() for word in ['parish', 'church', 'catholic'])
#             has_contact_info = any(word in text_content.lower() for word in ['phone', 'address', 'email'])

#             print(f"\nContent assessment:")
#             print(f"  Contains parish words: {has_parish_words}")
#             print(f"  Contains contact info: {has_contact_info}")

#             if not has_parish_words:
#                 print(f"  ⚠ This might not be a parish directory page")

#         except Exception as e:
#             print(f"Error inspecting {file}: {e}")

# def check_supabase_connection():
#     """Test Supabase connection and table structure"""

#     if not supabase:
#         print("❌ Supabase not configured")
#         return

#     try:
#         # Test basic connection
#         response = supabase.table('Parishes').select('*').limit(5).execute()
#         print(f"✅ Supabase connection working")
#         print(f"📊 Sample records in Parishes table: {len(response.data)}")

#         if response.data:
#             print(f"📋 Sample record structure:")
#             for key in response.data[0].keys():
#                 print(f"   - {key}")

#     except Exception as e:
#         print(f"❌ Supabase connection test failed: {e}")

# print("\n🛠️ Enhanced debug tools loaded!")
# print("Available functions:")
# print("  - analyze_debug_files() - Analyze scraped HTML content")
# print("  - show_gemini_responses() - View AI responses")
# print("  - test_gemini_with_sample() - Test Gemini with known parish data")
# print("  - inspect_actual_content() - See exactly what text is sent to Gemini")
# print("  - check_supabase_connection() - Test database connection")

# print("\n🔍 Let's analyze what happened in your last run:")
# analyze_debug_files()
# show_gemini_responses()

# print("\n🧪 Testing Gemini with sample data:")
# test_gemini_with_sample()