<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Find_Parish_Directory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install necessary libraries & Setup API Keys

# This cell installs all required Python packages for the notebook.
!pip install selenium webdriver-manager google-generativeai google-api-python-client tenacity

# Standard library imports
import sqlite3
import re
import os
import time 

# Third-party library imports
import requests # For simple HTTP requests (though less used now with Selenium)
from bs4 import BeautifulSoup # For parsing HTML
from google.colab import userdata # For securely accessing API keys in Colab

# Selenium imports for web automation and dynamic content loading
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, WebDriverException

# Google GenAI imports (for Gemini model)
import google.generativeai as genai
from google.api_core.exceptions import DeadlineExceeded, ServiceUnavailable, ResourceExhausted, InternalServerError, GoogleAPIError

# Google API Client imports (for Custom Search API)
from googleapiclient.errors import HttpError
# To use the live Google Custom Search API, uncomment the following import in this cell 
# AND in Cell 4.6 where `build` is called.
# from googleapiclient.discovery import build 

# Tenacity library for robust retry mechanisms
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type, RetryError

print("--- API Key Configuration ---")
# --- GenAI API Key Setup ---
# To use live GenAI calls: 
# 1. Ensure your GENAI_API_KEY_USCCB is stored in Colab Secrets.
# 2. Comment out the line `GENAI_API_KEY = None` below.
# 3. In Cells 4.5 and 4.6, set the `use_mock` flags to `False`.
GENAI_API_KEY_FROM_USERDATA = userdata.get('GENAI_API_KEY_USCCB')
GENAI_API_KEY = None # FORCE MOCK BY DEFAULT FOR THIS NOTEBOOK VERSION
if GENAI_API_KEY_FROM_USERDATA and GENAI_API_KEY_FROM_USERDATA not in ["YOUR_API_KEY_PLACEHOLDER", "SET_YOUR_KEY_HERE"]:
    print("GenAI API Key found in userdata. (Live configuration is ready but mock is forced by default).")
    # GENAI_API_KEY = GENAI_API_KEY_FROM_USERDATA # Uncomment this line to use the key from userdata
    if GENAI_API_KEY: # This will only be true if the line above is uncommented
        try:
            genai.configure(api_key=GENAI_API_KEY)
            print("GenAI configured successfully for LIVE calls.")
        except Exception as e:
            print(f"Error configuring GenAI with key: {e}. GenAI features will be mocked.")
            GENAI_API_KEY = None
    else:
        print("GenAI API Key from userdata is available, but GENAI_API_KEY is set to None. Mocking GenAI.")
else:
    print("GenAI API Key not found in userdata or is a placeholder. GenAI features will be mocked.")
    GENAI_API_KEY = None # Ensure it's None if not found/placeholder

# --- Search Engine API Key Setup ---
# To use live Google Custom Search API calls:
# 1. Ensure your SEARCH_API_KEY_USCCB and SEARCH_CX_USCCB are in Colab Secrets.
# 2. Comment out `SEARCH_API_KEY = None` and `SEARCH_CX = None` below.
# 3. In Cell 4.6 (`search_for_directory_link`), set `use_mock_search` to `False`.
# 4. Uncomment the `from googleapiclient.discovery import build` line in this cell and in Cell 4.6.
SEARCH_API_KEY_FROM_USERDATA = userdata.get('SEARCH_API_KEY_USCCB')
SEARCH_CX_FROM_USERDATA = userdata.get('SEARCH_CX_USCCB')
SEARCH_API_KEY = None # FORCE MOCK BY DEFAULT
SEARCH_CX = None      # FORCE MOCK BY DEFAULT

if SEARCH_API_KEY_FROM_USERDATA and SEARCH_API_KEY_FROM_USERDATA not in ["YOUR_API_KEY_PLACEHOLDER", "SET_YOUR_KEY_HERE"] and \
   SEARCH_CX_FROM_USERDATA and SEARCH_CX_FROM_USERDATA not in ["YOUR_CX_PLACEHOLDER", "SET_YOUR_CX_HERE"]:
    print("Search Engine API Key and CX found in userdata. (Live configuration is ready but mock is forced by default).")
    # SEARCH_API_KEY = SEARCH_API_KEY_FROM_USERDATA # Uncomment to use
    # SEARCH_CX = SEARCH_CX_FROM_USERDATA          # Uncomment to use
    if not (SEARCH_API_KEY and SEARCH_CX): # If still None after potential uncommenting
        print("Search API Key/CX from userdata is available, but they are set to None. Mocking Search.")
else:
    print("Search Engine API Key/CX not found in userdata or is placeholder. Search engine calls will be mocked.")
    SEARCH_API_KEY = None # Ensure None if not found
    SEARCH_CX = None
print("--- End API Key Configuration ---")

# --- Selenium WebDriver Setup ---
chrome_options = Options()
chrome_options.add_argument("--headless") # Ensure headless operation
chrome_options.add_argument("--no-sandbox") # Standard for Colab/Docker environments
chrome_options.add_argument("--disable-dev-shm-usage") # Standard for Colab/Docker environments
chrome_options.add_argument("--disable-gpu") # Often helpful in headless environments
chrome_options.add_argument("window-size=1920,1080") # Define window size

driver = None # Global WebDriver instance

def setup_driver():
    """Initializes and returns the Selenium WebDriver instance."""
    global driver
    if driver is None:
        try:
            print("Setting up Chrome WebDriver...")
            # ChromeDriver is automatically managed by webdriver_manager
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
            print("WebDriver setup successfully.")
        except Exception as e:
            print(f"Error setting up WebDriver: {e}")
            print("Ensure Chrome is installed if not using a pre-built environment like Colab.")
            driver = None
    return driver

def close_driver():
    """Closes the Selenium WebDriver instance if it's active."""
    global driver
    if driver:
        print("Closing WebDriver...")
        driver.quit()
        driver = None
        print("WebDriver closed.")

In [None]:
# Cell 2: Clone GitHub repository and configure Git

# This cell clones the GitHub repository if it doesn't exist, 
# or pulls the latest changes if it does. It also configures Git user info.

GITHUB_REPO = 'USCCB' # Name of the repository
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB') # Your GitHub username from Colab Secrets
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')      # Your GitHub Personal Access Token from Colab Secrets

# Construct the repository URL with credentials for private repositories (if applicable)
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

if not os.path.exists(GITHUB_REPO):
    print(f"Cloning repository {GITHUB_REPO}...")
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO) # Change current directory to the repository root
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main # Pull the latest changes from the main branch

# Configure Git local settings for this environment (optional, but good practice for commits)
!git config --global user.email "colab@example.com" # Replace with your email if desired
!git config --global user.name "Colab User"      # Replace with your name if desired

In [None]:
# Cell 3: Fetch Dioceses Info from SQLite database

# This cell connects to the SQLite database (data.db) and fetches a list of dioceses
# that do not yet have a parish directory URL recorded.

import sqlite3

dioceses_to_scan = [] # Initialize an empty list to store diocese info
try:
    # Check if the database file exists before attempting to connect
    if not os.path.exists('data.db'):
        print("WARNING: data.db not found. No dioceses will be fetched for scanning.")
        # In a real scenario, data.db should be populated by other notebooks or processes.
    else:
        conn_db = sqlite3.connect('data.db')
        cursor_db = conn_db.cursor()
        
        # SQL query to select diocesan websites and names where a parish directory URL is missing.
        query = """
        SELECT d.Website, d.Name 
        FROM Dioceses d
        LEFT JOIN DiocesesParishDirectory dpd ON d.Website = dpd.diocese_url
        WHERE dpd.parish_directory_url IS NULL OR dpd.parish_directory_url = ''
        """
        cursor_db.execute(query)
        # Store results as a list of dictionaries for easier access to URL and name
        dioceses_to_scan = [{'url': row[0], 'name': row[1]} for row in cursor_db.fetchall()]
        print(f"Fetched {len(dioceses_to_scan)} dioceses from the database for scanning.")
except sqlite3.Error as e:
    print(f"Database error in Cell 3: {e}")
finally:
    if 'conn_db' in locals() and conn_db: # Ensure connection was opened before trying to close
        conn_db.close()

In [None]:
# Cell 4: Function to find candidate parish listing URLs from page content

from urllib.parse import urljoin, urlparse # For handling relative and absolute URLs
import re # For regular expression matching in URL paths

def get_surrounding_text(element, max_length=200):
    """Extracts text from the parent element of a given link, limited in length.
    This provides context for the link.
    """
    if element and element.parent:
        parent_text = element.parent.get_text(separator=' ', strip=True)
        # Truncate if too long to keep prompts for GenAI concise
        return parent_text[:max_length] + ('...' if len(parent_text) > max_length else '')
    return ''

def find_candidate_urls(soup, base_url):
    """Scans a BeautifulSoup soup object for potential parish directory links.
    It uses a combination of keyword matching in link text/surrounding text 
    and regex patterns for URL paths.
    Returns a list of candidate link dictionaries.
    """
    candidate_links = []
    processed_hrefs = set() # To avoid adding duplicate URLs

    # Keywords likely to appear in link text or surrounding text for parish directories
    parish_link_keywords = [
        'Churches', 'Directory of Parishes', 'Parishes', 'parishfinder', 'Parish Finder', 
        'Find a Parish', 'Locations', 'Our Parishes', 'Parish Listings', 'Find a Church', 
        'Church Directory', 'Faith Communities', 'Find Mass Times', 'Our Churches', 
        'Search Parishes', 'Parish Map', 'Mass Schedule', 'Sacraments', 'Worship'
    ]
    # Regex patterns for URL paths that often indicate a parish directory
    url_patterns = [
        r'parishes', r'directory', r'locations', r'churches', 
        r'parish-finder', r'findachurch', r'parishsearch', r'parishdirectory',
        r'find-a-church', r'church-directory', r'parish-listings', r'parish-map',
        r'mass-times', r'sacraments', r'search', r'worship', r'finder'
    ]

    all_links_tags = soup.find_all('a', href=True) # Find all <a> tags with an href attribute

    for link_tag in all_links_tags:
        href = link_tag['href']
        # Skip empty, anchor, JavaScript, or mailto links
        if not href or href.startswith('#') or href.lower().startswith('javascript:') or href.lower().startswith('mailto:'):
            continue 
        
        abs_href = urljoin(base_url, href) # Resolve relative URLs to absolute
        if not abs_href.startswith('http'): # Ensure it's a web link
            continue
        if abs_href in processed_hrefs: # Avoid re-processing the same URL
            continue

        link_text = link_tag.get_text(strip=True)
        surrounding_text = get_surrounding_text(link_tag)
        parsed_href_path = urlparse(abs_href).path.lower() # Get the path component of the URL

        # Check for matches based on keywords in text or URL patterns
        text_match = any(keyword.lower() in link_text.lower() or keyword.lower() in surrounding_text.lower() for keyword in parish_link_keywords)
        pattern_match = any(re.search(pattern, parsed_href_path, re.IGNORECASE) for pattern in url_patterns)

        if text_match or pattern_match:
            candidate_links.append({
                'text': link_text,
                'href': abs_href,
                'surrounding_text': surrounding_text
            })
            processed_hrefs.add(abs_href)
            
    return candidate_links

In [None]:
# Cell 4.5: GenAI Powered Link Analyzer (for direct page content)

# Define exceptions on which GenAI calls should be retried
RETRYABLE_GENAI_EXCEPTIONS = (
    DeadlineExceeded, ServiceUnavailable, ResourceExhausted, 
    InternalServerError, GoogleAPIError 
)

@retry(
    stop=stop_after_attempt(3), # Retry up to 3 times
    wait=wait_exponential(multiplier=1, min=2, max=10), # Exponential backoff: 2s, 4s, 8s...
    retry=retry_if_exception_type(RETRYABLE_GENAI_EXCEPTIONS),
    reraise=True # Reraise the last exception if all retries fail
)
def _invoke_genai_model_with_retry(prompt):
    """Internal helper to invoke the GenAI model with retry logic."""
    # print("    Attempting GenAI call...") # Uncomment for debugging retries
    model = genai.GenerativeModel('gemini-pro') # Or your preferred model
    return model.generate_content(prompt)

def analyze_links_with_genai(candidate_links, diocese_name=None):
    """Analyzes candidate links using GenAI (or mock) to find the best parish directory URL."""
    best_link_found = None
    highest_score = -1

    # --- Mock vs. Live Control for GenAI (Direct Page Analysis) ---
    # For live GenAI: GENAI_API_KEY must be valid in Cell 1, AND use_mock must be False.
    use_mock = True # <<< SET TO False TO ATTEMPT LIVE GENAI CALLS (requires valid API key in Cell 1)
    if not GENAI_API_KEY: use_mock = True # Always mock if key is not configured
    if not use_mock: print(f"Attempting LIVE GenAI analysis for {len(candidate_links)} direct page links for {diocese_name or 'Unknown Diocese'}.")
    # else: print(f"Using MOCKED GenAI analysis for {len(candidate_links)} direct page links for {diocese_name or 'Unknown Diocese'}.")
    # ---

    if use_mock:
        mock_keywords = ['parish', 'church', 'directory', 'location', 'finder', 'search', 'map', 'listing', 'sacrament', 'mass', 'worship']
        for link_info in candidate_links:
            current_score = 0
            text_to_check = (link_info['text'] + ' ' + link_info['href'] + ' ' + link_info['surrounding_text']).lower()
            for kw in mock_keywords:
                if kw in text_to_check: current_score += 3 
            if diocese_name and diocese_name.lower() in text_to_check: current_score +=1
            current_score = min(current_score, 10) # Cap score at 10
            if current_score >= 7 and current_score > highest_score: # Threshold of 7
                highest_score = current_score
                best_link_found = link_info['href']
        return best_link_found

    # --- Actual GenAI API Call Logic (executes if use_mock is False) ---
    for link_info in candidate_links:
        prompt = f"""Given the following information about a link from the {diocese_name or 'a diocesan'} website:
        Link Text: "{link_info['text']}"
        Link URL: "{link_info['href']}"
        Surrounding Text: "{link_info['surrounding_text']}"
        Does this link likely lead to a parish directory, a list of churches, or a way to find parishes? 
        Respond with a confidence score from 0 (not likely) to 10 (very likely) and a brief justification. 
        Format as: Score: [score], Justification: [text]"""
        try:
            response = _invoke_genai_model_with_retry(prompt)
            response_text = response.text
            # print(f"    GenAI Raw Response (Direct Link): {response_text}") # For debugging
            score_match = re.search(r"Score: (\d+)", response_text, re.IGNORECASE)
            if score_match:
                score = int(score_match.group(1))
                if score >= 7 and score > highest_score:
                    highest_score = score
                    best_link_found = link_info['href']
            # else: print(f"    Could not parse score from GenAI (Direct Link) for {link_info['href']}: {response_text}")
        except RetryError as e:
            print(f"    GenAI API call (Direct Link) failed after multiple retries for {link_info['href']}: {e}")
        except Exception as e:
            print(f"    Error calling GenAI (Direct Link) for {link_info['href']}: {e}. No score assigned.")
    return best_link_found

In [None]:
# Cell 4.6: Search Engine Fallback Functions & GenAI Snippet Analysis

# Ensure 'build' is imported if using live search. It's commented in Cell 1 by default.
from googleapiclient.discovery import build 

def is_retryable_http_error(exception):
    """Custom retry condition for HttpError: only retry on 5xx or 429 (rate limit)."""
    if isinstance(exception, HttpError):
        return exception.resp.status >= 500 or exception.resp.status == 429
    return False

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(is_retryable_http_error), # Use custom condition for HttpError
    reraise=True
)
def _invoke_search_api_with_retry(service, query, cx_id):
    """Internal helper to invoke the Google Custom Search API with retry logic."""
    # print(f"    Attempting Search API call for query: {query}") # Uncomment for debugging retries
    return service.cse().list(q=query, cx=cx_id, num=3).execute() # Fetch top 3 results per query

def analyze_search_snippet_with_genai(search_results, diocese_name):
    """Analyzes search result snippets using GenAI (or mock) to find the best parish directory URL."""
    best_link_from_snippet = None
    highest_score = -1

    # --- Mock vs. Live Control for GenAI (Snippet Analysis) ---
    use_mock_genai_for_snippet = True # <<< SET TO False TO ATTEMPT LIVE GENAI CALLS (requires valid API key)
    if not GENAI_API_KEY: use_mock_genai_for_snippet = True # Always mock if key not configured
    if not use_mock_genai_for_snippet: print(f"Attempting LIVE GenAI analysis for {len(search_results)} snippets for {diocese_name}.")
    # else: print(f"Using MOCKED GenAI analysis for {len(search_results)} snippets for {diocese_name}.")
    # ---

    if use_mock_genai_for_snippet:
        mock_keywords = ['parish', 'church', 'directory', 'location', 'finder', 'search', 'map', 'listing', 'mass times']
        for result in search_results:
            current_score = 0
            text_to_check = (result.get('title', '') + ' ' + result.get('snippet', '') + ' ' + result.get('link', '')).lower()
            for kw in mock_keywords: 
                if kw in text_to_check: current_score += 3
            if diocese_name and diocese_name.lower() in text_to_check: current_score += 1
            current_score = min(current_score, 10)
            if current_score >= 7 and current_score > highest_score: # Threshold of 7
                highest_score = current_score
                best_link_from_snippet = result.get('link')
        return best_link_from_snippet

    # --- Actual GenAI API Call Logic for Snippets (executes if use_mock_genai_for_snippet is False) ---
    for result in search_results:
        title = result.get('title', '')
        snippet = result.get('snippet', '')
        link = result.get('link', '')
        prompt = f"""Given the following search result from {diocese_name}'s website:
        Title: "{title}"
        Snippet: "{snippet}"
        URL: "{link}"
        Does this link likely lead to a parish directory, church locator, or list of churches? 
        Respond with a confidence score from 0 (not likely) to 10 (very likely) and a brief justification. 
        Format as: Score: [score], Justification: [text]"""
        try:
            # Uses the same _invoke_genai_model_with_retry as direct page analysis
            response = _invoke_genai_model_with_retry(prompt) 
            response_text = response.text
            # print(f"    GenAI Raw Response (Snippet): {response_text}") # For debugging
            score_match = re.search(r"Score: (\d+)", response_text, re.IGNORECASE)
            if score_match:
                score = int(score_match.group(1))
                if score >= 7 and score > highest_score:
                    highest_score = score
                    best_link_from_snippet = link
            # else: print(f"    Could not parse score from GenAI (Snippet) for {link}: {response_text}")
        except RetryError as e:
            print(f"    GenAI API call (Snippet) for {link} failed after multiple retries: {e}")
        except Exception as e:
            print(f"    Error calling GenAI for snippet analysis of {link}: {e}")
    return best_link_from_snippet

def search_for_directory_link(diocese_name, diocese_website_url):
    """Uses Google Custom Search (or mock) to find potential directory links, then analyzes snippets."""
    # print(f"Executing search engine fallback for {diocese_name} ({diocese_website_url})") # Verbose

    # --- Mock vs. Live Control for Search Engine ---
    use_mock_search = True # <<< SET TO False TO ATTEMPT LIVE SEARCH ENGINE CALLS (requires valid API keys)
    if not (SEARCH_API_KEY and SEARCH_CX): use_mock_search = True # Always mock if keys not configured
    if not use_mock_search: print(f"Attempting LIVE Google Custom Search for {diocese_name}.")
    # else: print(f"Using MOCKED Google Custom Search for {diocese_name}.")
    # ---

    if use_mock_search:
        mock_results = [
            {'link': f"{diocese_website_url}/parishes", 'title': f"Parishes - {diocese_name}", 'snippet': f"List of parishes in the Diocese of {diocese_name}. Find a parish near you."},
            {'link': f"{diocese_website_url}/directory", 'title': f"Directory - {diocese_name}", 'snippet': f"Official directory of churches and schools for {diocese_name}."},
            {'link': f"{diocese_website_url}/find-a-church", 'title': f"Find a Church - {diocese_name}", 'snippet': f"Search for a Catholic church in {diocese_name}. Mass times and locations."}
        ]
        # Simulate `site:` search by filtering mock results to the diocese's website
        filtered_mock_results = [res for res in mock_results if res['link'].startswith(diocese_website_url)]
        return analyze_search_snippet_with_genai(filtered_mock_results, diocese_name)

    # --- Actual Google Custom Search API Call Logic (executes if use_mock_search is False) ---
    try:
        # `build` is imported at the top of this cell for clarity when live calls are made.
        service = build("customsearch", "v1", developerKey=SEARCH_API_KEY)
        # Construct multiple queries to increase chances of finding the directory
        queries = [
            f"parish directory site:{diocese_website_url}",
            f"list of churches site:{diocese_website_url}",
            f"find a parish site:{diocese_website_url}",
            f"{diocese_name} parish directory" # Broader query without site restriction as a last resort
        ]
        search_results_items = []
        unique_links = set() # To avoid duplicate results from different queries

        for q in queries:
            if len(search_results_items) >= 5: break # Limit total API calls/results
            print(f"    Executing search query: {q}")
            # Use the retry-enabled helper for the API call
            res_items = _invoke_search_api_with_retry(service, q, SEARCH_CX).get('items', [])
            for item in res_items:
                link = item.get('link')
                if link and link not in unique_links:
                    search_results_items.append(item)
                    unique_links.add(link)
            time.sleep(0.2) # Brief pause between queries to be polite to the API
        
        if not search_results_items:
            print(f"    Search engine returned no results for {diocese_name}.")
            return None
            
        # Format results for the snippet analyzer
        formatted_results = [{'link': item.get('link'), 'title': item.get('title'), 'snippet': item.get('snippet')} for item in search_results_items]
        return analyze_search_snippet_with_genai(formatted_results, diocese_name)
    except RetryError as e:
        print(f"    Search API call failed after multiple retries for {diocese_name}: {e}")
        return None
    except Exception as e:
        print(f"    Error during search engine call for {diocese_name}: {e}")
        return None

In [None]:
# Cell 5: Process URLs, Apply Analysis Stages, and Write Results to Database

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type((TimeoutException, WebDriverException)),
    reraise=True
)
def get_page_with_retry(driver_instance, url):
    """Wraps driver.get() with retry logic."""
    # print(f"    Attempting to load page: {url}") # Uncomment for debugging retries
    driver_instance.get(url)

if 'dioceses_to_scan' in locals() and dioceses_to_scan:
    conn_db = sqlite3.connect('data.db')
    cursor_db = conn_db.cursor()
    # Define table schema with PRIMARY KEY on diocese_url for INSERT OR REPLACE behavior
    cursor_db.execute('''CREATE TABLE IF NOT EXISTS DiocesesParishDirectory
                      (diocese_url TEXT PRIMARY KEY, 
                       parish_directory_url TEXT, 
                       found TEXT,  -- Status: Success, Not Found, Error details
                       found_method TEXT)''') -- Method used: e.g., genai_direct, search_engine_genai
    conn_db.commit()

    driver_instance = setup_driver() # Initialize the WebDriver
    if driver_instance:
        print(f"Processing {len(dioceses_to_scan)} dioceses with Selenium...")
        for diocese_info in dioceses_to_scan:
            current_url = diocese_info['url']
            diocese_name = diocese_info['name']
            print(f"--- Processing: {current_url} ({diocese_name}) ---")
            
            parish_dir_url_found = None
            status_text = "Not Found" # Default status if no URL is found
            method = "not_found_all_stages" # Default method if all stages fail

            try:
                # Stage 1: Load page with Selenium (with retries)
                get_page_with_retry(driver_instance, current_url)
                time.sleep(0.5) # Brief pause for any JS rendering after page load
                page_source = driver_instance.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                
                # Stage 2: Find candidate links from direct page content
                candidate_links = find_candidate_urls(soup, current_url)

                if candidate_links:
                    # Stage 3: Analyze direct page candidates with GenAI (or mock)
                    # print(f"    Found {len(candidate_links)} candidates from direct page. Analyzing...") # Verbose
                    parish_dir_url_found = analyze_links_with_genai(candidate_links, diocese_name)
                    if parish_dir_url_found:
                        method = "genai_direct_page_analysis"
                        status_text = "Success"
                    # else: print(f"    GenAI (direct page) did not find a suitable URL for {current_url}.") # Verbose
                # else: print(f"    No candidate links found by direct page scan for {current_url}.") # Verbose

                # Stage 4: If not found, try search engine fallback
                if not parish_dir_url_found:
                    # print(f"    Direct page analysis failed for {current_url}. Trying search engine fallback...") # Verbose
                    parish_dir_url_found = search_for_directory_link(diocese_name, current_url)
                    if parish_dir_url_found:
                        method = "search_engine_snippet_genai"
                        status_text = "Success"
                    # else: print(f"    Search engine fallback also failed for {current_url}.") # Verbose
                
                # Log final result for this diocese
                if parish_dir_url_found:
                     print(f"    Result: Parish Directory URL for {current_url}: {parish_dir_url_found} (Method: {method})")
                else:
                     # Method will be 'not_found_all_stages' if it reached here without finding a URL
                     print(f"    Result: No Parish Directory URL definitively found for {current_url} (Final method: {method})")
                
                cursor_db.execute("INSERT OR REPLACE INTO DiocesesParishDirectory VALUES (?, ?, ?, ?)",
                               (current_url, parish_dir_url_found, status_text, method))

            except RetryError as e: # Catch retry errors specifically for page load from get_page_with_retry
                error_message = str(e).replace('"', "''") 
                print(f"    Result: Page load failed after multiple retries for {current_url}: {error_message[:100]}")
                status_text = f"Error: Page load failed - {error_message[:60]}" # Truncate for DB
                method = "error_page_load_failed"
                cursor_db.execute("INSERT OR REPLACE INTO DiocesesParishDirectory VALUES (?, ?, ?, ?)",
                               (current_url, None, status_text, method))
            except Exception as e: # Catch any other exceptions during processing of a diocese
                error_message = str(e).replace('"', "''")
                print(f"    Result: General error processing {current_url}: {error_message[:100]}")
                status_text = f"Error: {error_message[:100]}" # Truncate for DB
                method = "error_processing_general"
                cursor_db.execute("INSERT OR REPLACE INTO DiocesesParishDirectory VALUES (?, ?, ?, ?)",
                               (current_url, None, status_text, method))
            conn_db.commit() # Commit result for each diocese

        close_driver() # Close WebDriver after processing all dioceses
    else:
        print("Selenium WebDriver not available. Skipping URL processing.")
    
    if 'conn_db' in locals() and conn_db: # Ensure connection is closed
        conn_db.close()
        print("\nDatabase connection closed after processing.")
else:
    print("No dioceses to scan (dioceses_to_scan is empty or not defined). Ensure Cell 3 ran correctly and data.db is populated.")

In [None]:
# Cell 6: Verify the data in the SQLite database

print("--- Verification Cell Output ---")
try:
    conn = sqlite3.connect('data.db')
    cursor = conn.cursor()
    print("\nDisplaying first 5 rows from DiocesesParishDirectory (if any):")
    cursor.execute("SELECT * FROM DiocesesParishDirectory LIMIT 5")
    rows = cursor.fetchall()
    if rows:
        for row in rows:
            print(row)
    else:
        print("No data found in DiocesesParishDirectory table.")

    print("\nDisplaying counts by found_method:")
    cursor.execute("SELECT found_method, COUNT(*) FROM DiocesesParishDirectory GROUP BY found_method")
    rows_count = cursor.fetchall()
    if rows_count:
        for row_count in rows_count:
            print(row_count)
    else:
        print("No data to aggregate by found_method (DiocesesParishDirectory table might be empty).")
except sqlite3.Error as e:
    print(f"Database error during verification: {e}")
finally:
    if 'conn' in locals() and conn:
        conn.close()
    print("\nDatabase connection for verification closed")

In [None]:
# Cell 7: Commit changes and push to GitHub

# This cell is for committing the notebook and data.db (if changed) to the GitHub repository.
# Ensure that this notebook file ('Find_Parish_Directory.ipynb') is correctly named here.
!git add data.db Find_Parish_Directory.ipynb

# Using the comprehensive commit message drafted in the final review.
!git commit -m "feat: Enhance parish directory finding with GenAI, search fallback, and retries
#
# This commit significantly refactors and enhances the Find_Parish_Directory.ipynb 
# notebook to improve its ability to locate parish directory URLs on diocesan websites.
#
# Key features and improvements:
# 1. Selenium Integration: Uses Selenium WebDriver to fetch dynamic page content.
# 2. Advanced Candidate Link Finding: Employs expanded keywords and URL patterns.
# 3. GenAI-Powered Link Analysis: Introduces GenAI (mocked by default) to evaluate 
#    candidate links from direct page content and search snippets.
# 4. Search Engine Fallback: Adds Google Custom Search (mocked by default) if direct 
#    analysis fails.
# 5. Robust Error Handling & Retries: Integrates Tenacity for retries on page loads 
#    and API calls (GenAI, Search).
# 6. Standardized Database Logging: `DiocesesParishDirectory` table now includes 
#    a `found_method` column with clear, standardized values.
# 7. Configuration and Usability: Clear comments for API key setup, defaults to mocked 
#    APIs for runnability. All cell outputs and execution counts cleared."

# Push the changes to the main branch of the remote repository.
!git push origin main