<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_Using_AgenticAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Cell 1: Import required code and install packages
# Installs third-party packages for this notebook; supabase, google-generativeai,
# selenium and webdriver-manager are imported in Cell 2. No versions are pinned,
# so each fresh runtime gets whatever pip resolves at that moment.
!pip install supabase google-generativeai psycopg2-binary tenacity selenium webdriver-manager
# Fetches llm_utils.py from the project repository; Cell 4 imports
# invoke_gemini_model from it.
!wget https://raw.githubusercontent.com/tomknightatl/USCCB/main/llm_utils.py

Collecting selenium
  Downloading selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.33.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading webdriver_manager-4.0.2-py2.py3

In [7]:
# Cell 2: Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import os
import time
import json
import random
from datetime import datetime
from urllib.parse import urlparse, urljoin
from google.colab import userdata
import google.generativeai as genai
from supabase import create_client, Client

# Import Selenium for JavaScript-heavy sites
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

print("All libraries imported successfully!")

All libraries imported successfully!


In [8]:
# Cell 3: Enhanced configuration and setup
# Defines the run parameters, then initializes the Supabase client and the
# Gemini API from Colab secrets. Each service is probed once; failures are
# printed rather than raised so the remaining cells can still be loaded.
print("=== ENHANCED PARISH DATABASE BUILDER ===")
print("--- User Configurable Parameters & Setup ---")

# --- Processing Configuration ---
MAX_URLS_TO_PROCESS = 3  # Start small for testing
USE_SELENIUM = True  # Enable JavaScript rendering
SAVE_DEBUG_FILES = True  # Save scraped content for debugging
RETRY_FAILED_URLS = True  # Retry failed URLs with different methods

# Create debug directory
# (save_debug_content and the Gemini response dump write into 'debug_content/'.)
if SAVE_DEBUG_FILES:
    os.makedirs('debug_content', exist_ok=True)
    print("Debug directory created for saving scraped content")

print(f"Processing will be limited to {MAX_URLS_TO_PROCESS} URLs.")
print(f"JavaScript rendering: {'Enabled' if USE_SELENIUM else 'Disabled'}")
print(f"Debug mode: {'Enabled' if SAVE_DEBUG_FILES else 'Disabled'}")

# --- Supabase Configuration ---
# Credentials come from Colab's secret store, never from the notebook source.
SUPABASE_URL = userdata.get('SUPABASE_URL')
SUPABASE_KEY = userdata.get('SUPABASE_KEY')

# Stays None when credentials are missing or client creation fails; later code
# checks `if supabase` / `if not supabase` before touching the database.
supabase: Client = None
if SUPABASE_URL and SUPABASE_KEY:
    try:
        supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
        print("✓ Supabase client initialized successfully")

        # Test connection and check table structure
        # The result is not inspected: the query only has to complete without
        # raising. A failure here is a warning and the client is kept.
        try:
            test_response = supabase.table('Parishes').select('*').limit(1).execute()
            print("✓ Parishes table accessible")
        except Exception as e:
            print(f"⚠ Warning: Could not access Parishes table: {e}")

    except Exception as e:
        print(f"✗ Error initializing Supabase client: {e}")
        supabase = None
else:
    print("✗ Supabase credentials not found in secrets")
    print("Required secrets: SUPABASE_URL, SUPABASE_KEY")

# --- GenAI Configuration ---
GENAI_API_KEY = userdata.get('GENAI_API_KEY_USCCB')

if GENAI_API_KEY:
    try:
        genai.configure(api_key=GENAI_API_KEY)
        # Test the API
        # One real request is sent to verify the key; the reply text is not checked.
        test_model = genai.GenerativeModel('gemini-1.5-flash')
        test_response = test_model.generate_content("Say 'API working'")
        print("✓ GenAI configured and tested successfully")
    except Exception as e:
        print(f"✗ Error configuring GenAI: {e}")
        # Reset so process_url_with_enhanced_gemini skips the LLM step.
        GENAI_API_KEY = None
else:
    print("✗ GenAI API Key not found (Secret: GENAI_API_KEY_USCCB)")

# --- Setup Selenium WebDriver ---
def setup_webdriver():
    """Build a headless Chrome WebDriver, or return None when unavailable.

    Returns None immediately if USE_SELENIUM is falsy. Otherwise the driver
    binary is resolved through ChromeDriverManager and Chrome is started with
    headless/container-friendly flags. Any exception during setup is printed
    as a warning and None is returned instead of raising.
    """
    if USE_SELENIUM:
        try:
            browser_options = Options()
            launch_flags = (
                '--headless',
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-gpu',
                '--window-size=1920,1080',
            )
            for flag in launch_flags:
                browser_options.add_argument(flag)

            browser = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=browser_options,
            )
            print("✓ Selenium WebDriver initialized")
            return browser
        except Exception as err:
            print(f"⚠ Warning: Could not initialize Selenium: {err}")
            return None

    return None

# --- Data Retrieval from Supabase ---
# Builds `urls_to_process`, the list iterated by the main processing loop.
urls_to_process = []
if supabase:
    try:
        print("\nFetching parish directory URLs...")
        # Filters out rows whose parish_directory_url is NULL or an empty string.
        response = supabase.table('DiocesesParishDirectory').select('parish_directory_url').not_.is_('parish_directory_url', 'null').not_.eq('parish_directory_url', '').execute()

        if response.data:
            # Second client-side guard against falsy URL values.
            fetched_urls = [item['parish_directory_url'] for item in response.data if item['parish_directory_url']]
            print(f"Found {len(fetched_urls)} URLs in database")

            if MAX_URLS_TO_PROCESS and len(fetched_urls) > MAX_URLS_TO_PROCESS:
                # No random seed is set in the visible cells, so the sampled
                # subset differs from run to run.
                urls_to_process = random.sample(fetched_urls, MAX_URLS_TO_PROCESS)
                print(f"Selected {len(urls_to_process)} URLs for processing")
            else:
                urls_to_process = fetched_urls
                print(f"Will process all {len(urls_to_process)} URLs")
        else:
            print("No parish directory URLs found")

    except Exception as e:
        print(f"Error fetching URLs: {e}")
        urls_to_process = []

# Fallback: hard-coded directory pages used when Supabase is unavailable,
# the query failed, or it returned no usable URLs.
if not urls_to_process:
    print("\n⚠ No URLs to process - using test URLs")
    urls_to_process = [
        "https://www.diopueblo.org/parishes",
        "https://www.rcbo.org/directories/parishes/",
        "https://archdiosf.org/directory-for-the-archdiocese"
    ]

print(f"\n📋 Ready to process {len(urls_to_process)} URLs")
print("--- Setup Complete ---\n")

=== ENHANCED PARISH DATABASE BUILDER ===
--- User Configurable Parameters & Setup ---
Debug directory created for saving scraped content
Processing will be limited to 3 URLs.
JavaScript rendering: Enabled
Debug mode: Enabled
✓ Supabase client initialized successfully
✓ GenAI configured and tested successfully

Fetching parish directory URLs...
Found 192 URLs in database
Selected 3 URLs for processing

📋 Ready to process 3 URLs
--- Setup Complete ---



In [9]:
# Cell 4: Enhanced web scraping and content extraction functions
from llm_utils import invoke_gemini_model

def extract_domain(url):
    """Return the network-location part (host, plus port if present) of ``url``."""
    parsed_url = urlparse(url)
    return parsed_url.netloc

def save_debug_content(url, content, method="unknown"):
    """Write scraped ``content`` to a timestamped file under debug_content/.

    Does nothing when SAVE_DEBUG_FILES is falsy. The file starts with three
    HTML comment lines (URL, method, timestamp) followed by the raw content.
    Failures are reported with a warning and never raised to the caller.
    """
    if SAVE_DEBUG_FILES:
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            # Dots are replaced so the host name reads as one filename token.
            host_slug = extract_domain(url).replace('.', '_')
            filename = f"debug_content/{host_slug}_{method}_{timestamp}.html"

            header_lines = (
                f"<!-- URL: {url} -->\n",
                f"<!-- Method: {method} -->\n",
                f"<!-- Timestamp: {timestamp} -->\n\n",
            )
            with open(filename, 'w', encoding='utf-8') as out_file:
                for header_line in header_lines:
                    out_file.write(header_line)
                out_file.write(content)

            print(f"  📄 Debug content saved: {filename}")
        except Exception as err:
            print(f"  ⚠ Could not save debug content: {err}")

def scrape_with_requests(url, timeout=10):
    """Fetch ``url`` with requests and reduce the HTML to visible text.

    Returns a ``(text, length, method)`` tuple: ``(text, len(text), "requests")``
    on success, or ``(None, 0, "requests_failed")`` when the request, the
    HTTP status check, or the parsing raises.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        reply = requests.get(url, headers=request_headers, timeout=timeout)
        reply.raise_for_status()

        document = BeautifulSoup(reply.content, 'html.parser')

        # Script and style bodies are code, not page text, so drop them first.
        for tag in document(["script", "style"]):
            tag.decompose()

        fragments = [piece.strip() for piece in document.stripped_strings if piece.strip()]
        text_content = ' '.join(fragments)

        # The untouched HTML (not the extracted text) is what gets archived.
        save_debug_content(url, reply.text, "requests")
    except Exception as err:
        print(f"  ✗ Requests failed: {err}")
        return None, 0, "requests_failed"

    return text_content, len(text_content), "requests"

def scrape_with_selenium(url, driver, timeout=15):
    """Scrape ``url`` through a Selenium WebDriver so script-rendered text is captured.

    Args:
        url: Page address to load.
        driver: WebDriver instance; a falsy value short-circuits the call.
        timeout: Seconds WebDriverWait may wait for a ``<body>`` element.

    Returns:
        A ``(text, length, method)`` tuple: ``(text, len(text), "selenium")`` on
        success, ``(None, 0, "selenium_unavailable")`` when no driver is given,
        and ``(None, 0, "selenium_failed")`` when loading or parsing raises.
    """
    if not driver:
        return None, 0, "selenium_unavailable"

    try:
        driver.get(url)

        # Fixed pause to give client-side scripts time to populate the page.
        time.sleep(3)

        # Best-effort wait for a <body> element; a failed wait is not fatal.
        # Only Exception is caught so KeyboardInterrupt / SystemExit still
        # stop the run instead of being silently swallowed.
        try:
            WebDriverWait(driver, timeout).until(
                lambda d: len(d.find_elements(By.TAG_NAME, "body")) > 0
            )
        except Exception as wait_error:
            print(f"  ⚠ Wait for page body did not complete: {wait_error}")

        # Get page source after JavaScript execution
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        text_content = ' '.join([s.strip() for s in soup.stripped_strings if s.strip()])

        save_debug_content(url, page_source, "selenium")

        return text_content, len(text_content), "selenium"

    except Exception as e:
        print(f"  ✗ Selenium failed: {e}")
        return None, 0, "selenium_failed"

def enhanced_content_extraction(url, driver=None, min_length=1000, max_chars=50000):
    """Extract page text from ``url``, falling back to Selenium when needed.

    The plain-requests scraper runs first because it is faster. Selenium is
    tried only when USE_SELENIUM is truthy and the requests result is missing
    or shorter than ``min_length`` characters. The longer result wins.

    Args:
        url: Page address to scrape.
        driver: Optional Selenium WebDriver passed to scrape_with_selenium.
        min_length: Text length below which the requests result is treated as
            too short and Selenium is also tried.
        max_chars: Maximum number of characters returned (keeps the payload
            sent to the LLM bounded).

    Returns:
        The extracted text truncated to ``max_chars``, or None if every
        method failed.
    """
    print(f"\n🔍 Extracting content from: {url}")

    best_content = None
    best_length = 0
    best_method = None

    # Method 1: Try requests first (faster)
    print("  📡 Trying requests method...")
    content, length, method = scrape_with_requests(url)

    if content and length > best_length:
        best_content, best_length, best_method = content, length, method
        print(f"  ✓ Requests success: {length} characters")

    # Method 2: Try Selenium if requests failed or content is too short
    if USE_SELENIUM and (not content or length < min_length):
        print("  🌐 Trying Selenium method...")
        content, length, method = scrape_with_selenium(url, driver)

        # Only replace the requests result when Selenium actually found more text.
        if content and length > best_length:
            best_content, best_length, best_method = content, length, method
            print(f"  ✓ Selenium success: {length} characters")

    if best_content:
        print(f"  🎯 Best method: {best_method} ({best_length} characters)")
        return best_content[:max_chars]  # Limit content length for API

    print(f"  ✗ All methods failed for {url}")
    return None

print("Enhanced scraping functions loaded!")

Enhanced scraping functions loaded!


In [10]:
# Cell 5: Enhanced Gemini processing with better prompts

def create_enhanced_prompt(url, content):
    """Build the Gemini extraction prompt for one scraped page.

    The prompt names the source ``url``, spells out the expected JSON shape
    (array for several parishes, object for one, ``[]`` for none) and appends
    at most the first 45,000 characters of ``content``.
    """
    # Cap the page text embedded in the prompt.
    page_excerpt = content[:45000]

    return f"""
You are an expert at extracting Catholic parish information from web content.

IMPORTANT INSTRUCTIONS:
1. The URL {url} contains a parish directory or parish listing page
2. Extract information about Catholic parishes from the provided content
3. If the page contains MULTIPLE parishes, extract data for ALL parishes found
4. If the page is a directory/landing page with NO specific parish details, return an empty array
5. Look for parishes, churches, missions, and Catholic communities
6. Return ONLY valid JSON - no explanatory text before or after

EXPECTED OUTPUT FORMAT:
For multiple parishes, return a JSON array:
[
  {{
    "Name": "Parish Name",
    "Status": "Parish/Mission/Chapel",
    "Deanery": "Deanery Name",
    "EST": "Established Year",
    "Street Address": "Street Address",
    "City": "City",
    "State": "State",
    "Zipcode": "Zipcode",
    "Phone Number": "Phone",
    "Website": "URL"
  }}
]

For a single parish, return a JSON object:
{{
  "Name": "Parish Name",
  "Status": "Parish/Mission/Chapel",
  "Deanery": "Deanery Name",
  "EST": "Established Year",
  "Street Address": "Street Address",
  "City": "City",
  "State": "State",
  "Zipcode": "Zipcode",
  "Phone Number": "Phone",
  "Website": "URL"
}}

If no parish information is found, return: []

Use null for missing values. Extract phone numbers, websites, and addresses carefully.

WEBPAGE CONTENT:
{page_excerpt}
"""

def _extract_json_text(response_text):
    """Return the JSON-looking portion of a model reply.

    Strips a leading/trailing markdown code fence (with or without a ``json``
    tag, in any letter case). If what remains does not start with ``[`` or
    ``{``, surrounding prose is trimmed down to the outermost bracketed span.
    """
    text = response_text.strip()

    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    text = text.strip()

    if text and text[0] not in "[{":
        candidates = [pos for pos in (text.find("["), text.find("{")) if pos != -1]
        if candidates:
            start = min(candidates)
            closing = "]" if text[start] == "[" else "}"
            end = text.rfind(closing)
            if end > start:
                text = text[start:end + 1]

    return text


def process_url_with_enhanced_gemini(url, content):
    """Send scraped ``content`` to Gemini and parse the reply into parish dicts.

    Args:
        url: Source page address (used in the prompt and debug file name).
        content: Extracted page text; a falsy value skips processing.

    Returns:
        A list of parsed items (a single JSON object is wrapped in a
        one-element list; ``[]`` means the model found no parishes), or None
        when there is no content, GenAI is not configured, the reply is
        empty, the reply is not valid JSON, or any step raises.
    """
    if not content:
        print(f"  ✗ No content to process for {url}")
        return None

    if not GENAI_API_KEY:
        print(f"  ✗ GenAI not configured - skipping LLM processing")
        return None

    try:
        print(f"  🤖 Processing with Gemini... ({len(content)} chars)")

        prompt = create_enhanced_prompt(url, content)
        # NOTE(review): presumably returns the reply text as a str - confirm
        # against llm_utils.invoke_gemini_model.
        response_text = invoke_gemini_model(prompt_text=prompt, model_name="gemini-1.5-flash")

        if not response_text:
            print("  ✗ Gemini returned an empty response")
            return None

        print(f"  📝 Gemini response length: {len(response_text)} characters")

        # Save raw response for debugging. This is best effort: a failed
        # write must not discard an otherwise usable model reply.
        if SAVE_DEBUG_FILES:
            try:
                domain = extract_domain(url).replace('.', '_')
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                response_file = f"debug_content/{domain}_gemini_response_{timestamp}.json"
                with open(response_file, 'w', encoding='utf-8') as f:
                    f.write(response_text)
                print(f"  📄 Raw response saved: {response_file}")
            except Exception as save_error:
                print(f"  ⚠ Could not save raw response: {save_error}")

        # Remove markdown code fences / surrounding prose
        cleaned_response = _extract_json_text(response_text)

        # Parse JSON
        try:
            parsed_data = json.loads(cleaned_response)
        except json.JSONDecodeError as e:
            print(f"  ✗ JSON parsing failed: {e}")
            print(f"  📝 Response preview: {cleaned_response[:200]}...")
            return None

        # Handle both single objects and arrays
        if isinstance(parsed_data, list):
            print(f"  ✓ Extracted {len(parsed_data)} parishes")
            return parsed_data
        if isinstance(parsed_data, dict):
            print(f"  ✓ Extracted 1 parish: {parsed_data.get('Name', 'Unknown')}")
            return [parsed_data]  # Convert to array for consistent handling

        print(f"  ⚠ Unexpected data type: {type(parsed_data)}")
        return None

    except Exception as e:
        print(f"  ✗ Gemini processing failed: {e}")
        return None

print("Enhanced Gemini processing functions loaded!")

Enhanced Gemini processing functions loaded!


In [11]:
# Cell 6: Enhanced database operations

def validate_parish_data(parish_data, source_url):
    """Normalize one extracted parish record before database insertion.

    Args:
        parish_data: Record produced by the LLM step; anything that is not a
            dict is rejected.
        source_url: Page the record was extracted from.

    Returns:
        A new dict containing exactly the ten expected parish fields (missing
        ones set to None) plus ``source_url``, ``domain`` and
        ``processed_at`` metadata, or None if ``parish_data`` is not a dict.
        String values are whitespace-trimmed; empty strings and the
        placeholders "null"/"none" (any letter case) become None.
    """
    if not isinstance(parish_data, dict):
        return None

    # Ensure all required fields exist
    required_fields = ['Name', 'Status', 'Deanery', 'EST', 'Street Address',
                       'City', 'State', 'Zipcode', 'Phone Number', 'Website']

    # Textual stand-ins for "no value" that models emit instead of JSON null.
    null_markers = {"", "null", "none"}

    validated_data = {}

    for field in required_fields:
        value = parish_data.get(field)
        if isinstance(value, str):
            # Trim first so whitespace-only values are also treated as missing.
            value = value.strip()
            if value.lower() in null_markers:
                value = None
        validated_data[field] = value

    # Add metadata
    validated_data['source_url'] = source_url
    validated_data['domain'] = extract_domain(source_url)
    validated_data['processed_at'] = datetime.now().isoformat()

    return validated_data

def prepare_for_supabase(parish_data):
    """Map a validated parish record onto the column names used for Supabase.

    Two keys are renamed ("Street Address" -> "StreetAddress" and
    "Phone Number" -> "PhoneNumber"); all others keep their name. Keys absent
    from ``parish_data`` come out as None.
    """
    # (destination column, key in the validated record), in output order.
    column_sources = (
        ('Name', 'Name'),
        ('Status', 'Status'),
        ('Deanery', 'Deanery'),
        ('EST', 'EST'),
        ('StreetAddress', 'Street Address'),
        ('City', 'City'),
        ('State', 'State'),
        ('Zipcode', 'Zipcode'),
        ('PhoneNumber', 'Phone Number'),
        ('Website', 'Website'),
        ('source_url', 'source_url'),
        ('domain', 'domain'),
        ('processed_at', 'processed_at'),
    )
    return {column: parish_data.get(source_key) for column, source_key in column_sources}

def safe_upsert_to_supabase(parish_data_list, source_url):
    """Validate and write each parish record to the 'Parishes' table.

    Every record is validated, converted to the Supabase column layout and
    upserted on ``unique_id``. If the upsert raises, a plain insert without
    ``unique_id`` is attempted as a fallback. Errors are printed per record
    and never raised.

    Args:
        parish_data_list: Records produced by the LLM step.
        source_url: Page the records were extracted from; combined with the
            record's position in the list to form ``unique_id``.

    Returns:
        True if at least one record was saved, otherwise False (including
        when Supabase is not configured or the list is empty).
    """
    if not supabase:
        print(f"  ✗ Supabase not available - skipping database write")
        return False

    if not parish_data_list:
        print(f"  ⚠ No parish data to save")
        return False

    success_count = 0
    # A record with none of these populated carries no usable information.
    key_fields = ['Name', 'StreetAddress', 'City', 'PhoneNumber']

    for i, parish_data in enumerate(parish_data_list):
        try:
            # Validate data
            validated_data = validate_parish_data(parish_data, source_url)
            if not validated_data:
                print(f"    ⚠ Skipping invalid parish data #{i+1}")
                continue

            # Prepare for Supabase
            supabase_data = prepare_for_supabase(validated_data)

            # Skip if no meaningful data (all key fields are null)
            if all(supabase_data.get(field) is None for field in key_fields):
                print(f"    ⚠ Skipping parish #{i+1} - no meaningful data")
                continue

            # 'Name' is always present as a key (possibly None), so a .get()
            # default would never apply; `or` supplies the log label instead.
            parish_name = supabase_data.get('Name') or 'Unknown'

            # Create a unique identifier for upsert
            unique_id = f"{source_url}_{i}"
            supabase_data['unique_id'] = unique_id

            # Attempt upsert
            try:
                response = supabase.table('Parishes').upsert(
                    supabase_data,
                    on_conflict='unique_id'
                ).execute()

                if hasattr(response, 'error') and response.error:
                    print(f"    ✗ Database error for {parish_name}: {response.error}")
                else:
                    print(f"    ✓ Saved: {parish_name}")
                    success_count += 1

            except Exception as db_error:
                print(f"    ✗ Database exception for {parish_name}: {db_error}")

                # Try alternative approach - insert without upsert
                try:
                    # Remove unique_id and try simple insert
                    insert_data = {k: v for k, v in supabase_data.items() if k != 'unique_id'}
                    response = supabase.table('Parishes').insert(insert_data).execute()

                    if not (hasattr(response, 'error') and response.error):
                        print(f"    ✓ Saved via insert: {parish_name}")
                        success_count += 1
                    else:
                        print(f"    ✗ Insert also failed: {response.error}")

                except Exception as insert_error:
                    print(f"    ✗ Insert exception: {insert_error}")

        except Exception as e:
            print(f"    ✗ Processing error for parish #{i+1}: {e}")

    print(f"  📊 Successfully saved {success_count}/{len(parish_data_list)} parishes")
    return success_count > 0

print("Enhanced database operations loaded!")

Enhanced database operations loaded!


In [12]:
# Cell 7: Main processing loop with comprehensive error handling

def process_single_url(url, driver, attempt=1, max_attempts=2):
    """Run the scrape -> LLM -> database pipeline for one URL.

    Args:
        url: Parish directory page to process.
        driver: Selenium WebDriver (or None) passed to the content extractor.
        attempt: 1-based number of the current try.
        max_attempts: Maximum number of tries when RETRY_FAILED_URLS is set.

    Returns:
        A result dict with ``url``, ``status``, ``reason`` and
        ``parishes_found``. ``status`` is one of:
          - 'failed'          too little content was scraped
          - 'no_data'         the LLM step returned None (error / unparseable)
          - 'directory_page'  the LLM step returned an empty list
          - 'success' / 'db_failed'  records were found; saving did / did not work
            (these results also carry ``parishes_saved``)
          - 'error'           an unexpected exception, after retries
    """
    print(f"\n{'='*60}")
    print(f"🔄 Processing URL (Attempt {attempt}/{max_attempts}): {url}")
    print(f"{'='*60}")

    try:
        # Step 1: Extract content
        content = enhanced_content_extraction(url, driver)

        if not content or len(content.strip()) < 100:
            print(f"  ✗ Insufficient content extracted ({len(content) if content else 0} chars)")

            if attempt < max_attempts and RETRY_FAILED_URLS:
                print(f"  🔄 Retrying with different approach...")
                time.sleep(5)  # Wait before retry
                return process_single_url(url, driver, attempt + 1, max_attempts)
            else:
                return {
                    'url': url,
                    'status': 'failed',
                    'reason': 'insufficient_content',
                    'parishes_found': 0
                }

        # Step 2: Process with Gemini
        parish_data_list = process_url_with_enhanced_gemini(url, content)

        # None means the LLM step itself failed. An empty list is a valid
        # answer ("no parishes on this page") and is handled separately below,
        # so the test must be `is None` rather than a truthiness check.
        if parish_data_list is None:
            print(f"  ✗ No parish data extracted by Gemini")
            return {
                'url': url,
                'status': 'no_data',
                'reason': 'gemini_extraction_failed',
                'parishes_found': 0
            }

        if len(parish_data_list) == 0:
            print(f"  ℹ️ Page appears to be a directory/landing page with no specific parish data")
            return {
                'url': url,
                'status': 'directory_page',
                'reason': 'no_parish_details',
                'parishes_found': 0
            }

        # Step 3: Save to database
        success = safe_upsert_to_supabase(parish_data_list, url)

        print(f"  🎯 Processing complete for {url}")
        print(f"     Found: {len(parish_data_list)} parishes")
        print(f"     Database: {'Success' if success else 'Failed'}")

        return {
            'url': url,
            'status': 'success' if success else 'db_failed',
            'reason': 'completed',
            'parishes_found': len(parish_data_list),
            'parishes_saved': success
        }

    except Exception as e:
        print(f"  ✗ Exception processing {url}: {e}")

        if attempt < max_attempts and RETRY_FAILED_URLS:
            print(f"  🔄 Retrying due to exception...")
            time.sleep(5)
            return process_single_url(url, driver, attempt + 1, max_attempts)
        else:
            return {
                'url': url,
                'status': 'error',
                'reason': str(e),
                'parishes_found': 0
            }

# Initialize WebDriver
# One driver is shared across all URLs and closed in the `finally` below.
print("\n🚀 Starting enhanced parish processing...")
driver = setup_webdriver()

# Track results
results = []
total_parishes_found = 0
successful_urls = 0

try:
    for i, url in enumerate(urls_to_process, 1):
        print(f"\n\n📍 URL {i}/{len(urls_to_process)}")

        result = process_single_url(url, driver)
        results.append(result)

        total_parishes_found += result.get('parishes_found', 0)
        if result.get('status') == 'success':
            successful_urls += 1

        # Add delay between requests to be respectful
        if i < len(urls_to_process):
            print(f"\n⏳ Waiting 3 seconds before next URL...")
            time.sleep(3)

finally:
    # Clean up WebDriver
    if driver:
        try:
            driver.quit()
            print("\n🧹 WebDriver closed")
        except Exception as quit_error:
            # Cleanup stays best-effort, but the failure is reported and
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f"\n⚠ Could not close WebDriver cleanly: {quit_error}")

# Print summary
print(f"\n\n{'='*60}")
print(f"📊 PROCESSING SUMMARY")
print(f"{'='*60}")
print(f"URLs processed: {len(results)}")
print(f"Successful URLs: {successful_urls}")
print(f"Total parishes found: {total_parishes_found}")
print(f"\nDetailed results:")

for result in results:
    # Unknown status values fall back to a question mark.
    status_emoji = {
        'success': '✅',
        'directory_page': '📁',
        'no_data': '❌',
        'failed': '❌',
        'error': '💥',
        'db_failed': '⚠️'
    }.get(result['status'], '❓')

    print(f"  {status_emoji} {result['url']}")
    print(f"     Status: {result['status']} | Parishes: {result['parishes_found']} | Reason: {result['reason']}")

print(f"\n🎉 Enhanced processing complete!")
if SAVE_DEBUG_FILES:
    print(f"📄 Debug files saved in 'debug_content/' folder")
print(f"{'='*60}")


🚀 Starting enhanced parish processing...
Stacktrace:
#0 0x5cd36b80d4e3 <unknown>
#1 0x5cd36b53cc76 <unknown>
#2 0x5cd36b563757 <unknown>
#3 0x5cd36b562029 <unknown>
#4 0x5cd36b5a0ccc <unknown>
#5 0x5cd36b5a047f <unknown>
#6 0x5cd36b597de3 <unknown>
#7 0x5cd36b56d2dd <unknown>
#8 0x5cd36b56e34e <unknown>
#9 0x5cd36b7cd3e4 <unknown>
#10 0x5cd36b7d13d7 <unknown>
#11 0x5cd36b7dbb20 <unknown>
#12 0x5cd36b7d2023 <unknown>
#13 0x5cd36b7a01aa <unknown>
#14 0x5cd36b7f66b8 <unknown>
#15 0x5cd36b7f6847 <unknown>
#16 0x5cd36b806243 <unknown>
#17 0x7ae71bee5ac3 <unknown>



📍 URL 1/3

🔄 Processing URL (Attempt 1/2): https://www.dioceseoftyler.org/parishes/

🔍 Extracting content from: https://www.dioceseoftyler.org/parishes/
  📡 Trying requests method...
  📄 Debug content saved: debug_content/www_dioceseoftyler_org_requests_20250527_201331.html
  ✓ Requests success: 1279 characters
  🎯 Best method: requests (1279 characters)
  🤖 Processing with Gemini... (1279 chars)
  📝 Gemini response length: 3 c

In [13]:
# Cell 8: Optional - Analysis and debugging tools

def analyze_debug_files():
    """Print rough content statistics for up to three saved HTML debug files.

    For each file this reports the total length, case-insensitive counts of
    parish-related and address-related words, the number of '(' characters
    (a crude phone-number hint), the number of <a> tags and a short text
    sample. Prints a notice and returns when debugging is disabled or no
    HTML files exist. Per-file errors are printed and do not stop the loop.
    """
    if not SAVE_DEBUG_FILES or not os.path.exists('debug_content'):
        print("No debug files to analyze")
        return

    # Sorted so the same files are picked on every run; os.listdir returns
    # entries in arbitrary order.
    debug_files = sorted(f for f in os.listdir('debug_content') if f.endswith('.html'))

    if not debug_files:
        print("No HTML debug files found")
        return

    print(f"\n🔍 Analyzing {len(debug_files)} debug files...")

    parish_keywords = ['parish', 'church', 'cathedral', 'mission', 'chapel']
    address_words = ['street', 'avenue', 'road']

    for file in debug_files[:3]:  # Analyze first 3 files
        file_path = f"debug_content/{file}"

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Basic analysis
            soup = BeautifulSoup(content, 'html.parser')

            # Lower-case once so both word counts are case-insensitive
            # ("Street" and "AVENUE" count just like "street").
            lowered = content.lower()
            keyword_count = sum(lowered.count(keyword) for keyword in parish_keywords)
            address_indicators = sum(lowered.count(word) for word in address_words)
            phone_patterns = content.count('(')

            print(f"\n📄 {file}:")
            print(f"   Content length: {len(content):,} chars")
            print(f"   Parish keywords: {keyword_count}")
            print(f"   Phone indicators: {phone_patterns}")
            print(f"   Address indicators: {address_indicators}")
            print(f"   Links found: {len(soup.find_all('a'))}")

            # Sample text
            print(f"   Sample text: {soup.get_text()[:100]}...")

        except Exception as e:
            print(f"   Error analyzing {file}: {e}")

def test_single_url(test_url):
    """Test processing on a single URL for debugging"""

    print(f"\n🧪 Testing single URL: {test_url}")

    driver = setup_webdriver()

    try:
        result = process_single_url(test_url, driver)
        print(f"\n🧪 Test result: {result}")
        return result
    finally:
        if driver:
            driver.quit()

def check_supabase_connection():
    """Probe the 'Parishes' table and print what a sample row looks like.

    Fetches up to five rows, reports how many came back and lists the column
    names of the first one. Prints a notice if Supabase is not configured and
    prints (does not raise) any error from the query.
    """
    if not supabase:
        print("❌ Supabase not configured")
        return

    try:
        # Test basic connection
        probe = supabase.table('Parishes').select('*').limit(5).execute()
        print(f"✅ Supabase connection working")
        print(f"📊 Sample records in Parishes table: {len(probe.data)}")

        if not probe.data:
            return

        print(f"📋 Sample record structure:")
        first_row = probe.data[0]
        for column_name in first_row.keys():
            print(f"   - {column_name}")

    except Exception as err:
        print(f"❌ Supabase connection test failed: {err}")

def show_gemini_responses():
    """Print the length and a 200-character preview of saved Gemini replies.

    Looks in debug_content/ for file names containing 'gemini_response' and
    shows at most the first three of them. Prints a notice and returns when
    debugging is disabled, the folder is missing, or no such files exist.
    """
    debugging_available = SAVE_DEBUG_FILES and os.path.exists('debug_content')
    if not debugging_available:
        print("No debug files available")
        return

    response_files = []
    for entry in os.listdir('debug_content'):
        if 'gemini_response' in entry:
            response_files.append(entry)

    if len(response_files) == 0:
        print("No Gemini response files found")
        return

    print(f"\n🤖 Found {len(response_files)} Gemini response files:")

    shown = response_files[:3]  # Show first 3
    for file in shown:
        file_path = f"debug_content/{file}"
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_reply = handle.read()

            print(f"\n📄 {file}:")
            print(f"   Length: {len(raw_reply)} chars")
            print(f"   Preview: {raw_reply[:200]}...")

        except Exception as err:
            print(f"   Error reading {file}: {err}")

def quick_test_urls():
    """Scrape two known directory pages and print how much text came back.

    Only the content-extraction step is exercised (no LLM call, no database
    write). A WebDriver is created for the run and quit afterwards.
    """
    sample_pages = (
        "https://www.diopueblo.org/parishes",
        "https://www.rcbo.org/directories/parishes/",
    )

    browser = setup_webdriver()

    try:
        for page_url in sample_pages:
            print(f"\n🔍 Quick test: {page_url}")
            extracted = enhanced_content_extraction(page_url, browser)

            if not extracted:
                print(f"   ✗ No content extracted")
                continue

            print(f"   ✓ Extracted {len(extracted)} characters")
            print(f"   Sample: {extracted[:300]}...")

    finally:
        # setup_webdriver may have returned None.
        if browser:
            browser.quit()

# List the helper functions defined earlier in this cell. None of them runs
# automatically; they are meant to be called by hand while debugging.
print("\n🛠️ Debug tools loaded!")
print("Available functions:")
print("  - analyze_debug_files() - Analyze scraped HTML content")
print("  - test_single_url('your_url_here') - Test a specific URL")
print("  - check_supabase_connection() - Test database connection")
print("  - show_gemini_responses() - View AI responses")
print("  - quick_test_urls() - Quick content extraction test")

# Uncomment any of these to run analysis:
# (test_single_url takes a URL argument, so it is not listed here.)
# analyze_debug_files()
# check_supabase_connection()
# show_gemini_responses()
# quick_test_urls()


🛠️ Debug tools loaded!
Available functions:
  - analyze_debug_files() - Analyze scraped HTML content
  - test_single_url('your_url_here') - Test a specific URL
  - check_supabase_connection() - Test database connection
  - show_gemini_responses() - View AI responses
  - quick_test_urls() - Quick content extraction test
