<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Adoration_and_Reconciliation_Measurement_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import re
from urllib.parse import urljoin, urlparse
import time

In [None]:
# Cell 2 (Updated): Define helper functions for web scraping and data extraction

import logging
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import time

# Root logger setup: INFO and above, every record stamped with its time and
# level, delivered to both the scraping.log file and a default stream handler.
logging.basicConfig(
    handlers=[logging.FileHandler("scraping.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_soup(url, retries=3, backoff_factor=1.0):
    """
    Download ``url`` and parse the response body into a BeautifulSoup tree.

    At most ``retries`` GET requests are issued. After a failed attempt that
    is not the last one, the function sleeps for
    ``backoff_factor * 2 ** (attempt - 1)`` seconds before trying again.

    Returns the parsed document, or None once every attempt has failed
    (or when ``retries`` is less than 1, in which case nothing is requested).
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.3')
    headers = {
        'User-Agent': user_agent,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    for attempt in range(1, retries + 1):
        logging.info(f"Attempt {attempt}: Fetching URL: {url}")
        try:
            response = requests.get(url, headers=headers, timeout=20)
            logging.info(f"Received status code: {response.status_code}")
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            logging.warning(f"Attempt {attempt} failed with error: {e}")

        # Only reached after a failed request: give up or back off and retry.
        if attempt == retries:
            logging.error(f"All {retries} attempts failed for URL: {url}")
            return None
        sleep_time = backoff_factor * (2 ** (attempt - 1))
        logging.info(f"Retrying in {sleep_time} seconds...")
        time.sleep(sleep_time)

    return None

def _child_text(parent, class_name):
    """Return the stripped text of the first <div class=class_name> under ``parent``, or "" if absent."""
    if parent is None:
        return ""
    node = parent.find('div', class_=class_name)
    return node.get_text(strip=True) if node is not None else ""


def _format_address(address_div):
    """Join the street lines and the "City, ST ZIP" line of an address <div> into one string."""
    if address_div is None:
        return ""

    city_div = address_div.find('div', class_='city')
    # _child_text tolerates a missing city <div>, so these are always bound
    # and can never leak values from a previously processed diocese.
    city = _child_text(city_div, 'field--name-field-da-city')
    state = _child_text(city_div, 'field--name-field-da-state-abbreviation')
    zip_code = _child_text(city_div, 'field--name-field-da-zip-code')

    state_zip = " ".join(part for part in (state, zip_code) if part)
    locality = ", ".join(part for part in (city, state_zip) if part)

    lines = [
        _child_text(address_div, 'address_1'),
        _child_text(address_div, 'address_2'),
        locality,
    ]
    return ", ".join(line for line in lines if line)


def extract_dioceses(usccb_url):
    """
    Extracts dioceses information from the USCCB website.

    Args:
        usccb_url: URL of the page listing all dioceses.

    Returns:
        A list of dictionaries, one per ``da-wrap`` entry found inside a
        ``views-row`` container, with the keys:
          - 'Name': diocese name, or "N/A" when no title element exists.
          - 'Address': street lines and "City, ST ZIP" joined by ", "
            (missing pieces are omitted; "" when there is no address).
          - 'Website': href of the site link, or "N/A" when absent.
        An empty list is returned when the page cannot be fetched or an
        unexpected error occurs (the error is logged).
    """
    try:
        logging.info(f"Extracting dioceses from {usccb_url}")
        soup = get_soup(usccb_url)
        if not soup:
            logging.error("Failed to retrieve the dioceses page.")
            return []

        dioceses = []
        for container in soup.find_all('div', class_='views-row'):
            da_wrap = container.find('div', class_='da-wrap')
            if da_wrap is None:
                continue

            name_div = da_wrap.find('div', class_='da-title')
            if name_div is not None:
                # The title element also carries a 'back to top' link; keep
                # only the text that precedes it.
                diocese_name = name_div.get_text(strip=True).split('back')[0].strip()
            else:
                diocese_name = "N/A"

            site_div = da_wrap.find('div', class_='site')
            site_link = site_div.find('a') if site_div is not None else None
            # .get() avoids a KeyError (which would discard every diocese via
            # the outer handler) when the anchor has no href attribute.
            website = (site_link.get('href') if site_link is not None else None) or "N/A"

            dioceses.append({
                'Name': diocese_name,
                'Address': _format_address(da_wrap.find('div', class_='da-address')),
                'Website': website
            })

        logging.info(f"Extracted {len(dioceses)} dioceses.")
        return dioceses
    except Exception as e:
        logging.error(f"An error occurred while extracting dioceses: {e}")
        return []

def extract_parish_links(diocese_url):
    """
    Collect candidate parish links reachable from a diocese's homepage.

    Every homepage link whose href contains 'parishes', 'directory' or
    'listing' is treated as a possible directory page. Each such page is
    fetched and every anchor on it is recorded as a parish, together with the
    text of the next <p> element following the anchor (used as the address).

    Returns a list of dictionaries with the keys 'Parish Name',
    'Parish Website' and 'Address', de-duplicated on (name, website).
    Returns an empty list if the homepage cannot be fetched or an error occurs.
    """
    try:
        logging.info(f"Extracting parishes from diocese URL: {diocese_url}")
        soup = get_soup(diocese_url)
        if not soup:
            return []

        parish_page_patterns = ['parishes', 'directory', 'listing']

        # Gather the distinct absolute URLs of likely directory pages.
        candidate_pages = set()
        for anchor in soup.find_all('a', href=True):
            target = anchor['href']
            lowered_target = target.lower()
            for pattern in parish_page_patterns:
                if pattern in lowered_target:
                    candidate_pages.add(urljoin(diocese_url, target))
                    break
        parish_pages = list(candidate_pages)

        logging.info(f"Found {len(parish_pages)} potential parish directory pages.")

        # Keyed on (name, website): a repeated key keeps its first position
        # but takes the most recently seen record.
        unique_parishes = {}
        for page in parish_pages:
            parish_soup = get_soup(page)
            if not parish_soup:
                continue
            for link in parish_soup.find_all('a', href=True):
                parish_name = link.get_text(strip=True)
                parish_url = urljoin(page, link['href'])
                # The address is assumed to sit in the next <p> after the link.
                address_tag = link.find_next('p')
                address = address_tag.get_text(strip=True) if address_tag else ""
                unique_parishes[(parish_name, parish_url)] = {
                    'Parish Name': parish_name,
                    'Parish Website': parish_url,
                    'Address': address
                }

        logging.info(f"Extracted {len(unique_parishes)} unique parishes.")
        return list(unique_parishes.values())
    except Exception as e:
        logging.error(f"An error occurred while extracting parishes from {diocese_url}: {e}")
        return []

# Characters accepted as part of a schedule value. ':' is included so that
# clock times such as "9:00" are captured whole, and the value must start
# with a letter or digit so it is never empty or pure punctuation.
_HOURS_VALUE = r'([a-z0-9][a-z0-9\s,.:\-]*)'

# keyword, optional label ("hours", "schedule", ...), optional ':' or '-',
# then the value. Whitespace is consumed *before* the optional label so the
# label is skipped instead of being captured as the value.
_ADORATION_PATTERN = re.compile(
    r'adoration\s*(?:hours|schedule|celebrated at)?\s*[:\-]?\s*' + _HOURS_VALUE,
    re.IGNORECASE,
)
_RECONCILIATION_PATTERN = re.compile(
    r'reconciliation\s*(?:hours|schedule|available at)?\s*[:\-]?\s*' + _HOURS_VALUE,
    re.IGNORECASE,
)


def _first_hours(pattern, text):
    """Return the stripped value captured by the first match of ``pattern`` in ``text``, or None."""
    match = pattern.search(text)
    return match.group(1).strip() if match else None


def extract_parish_info(parish_url):
    """
    Extracts hours of adoration and reconciliation from a parish's website.

    The page text is lowercased and searched for the first occurrence of
    "adoration" / "reconciliation" followed by a run of letters, digits,
    whitespace and ``, . : -`` characters. This is a heuristic: the captured
    text is whatever follows the keyword and is not validated as a time.

    Args:
        parish_url: URL of the parish page to scan.

    Returns:
        A dictionary with the keys 'Adoration Hours' and
        'Reconciliation Hours' (each a string, or None when nothing matched).
        An empty dictionary is returned when the page cannot be fetched or an
        unexpected error occurs (the error is logged).
    """
    try:
        logging.info(f"Extracting hours from parish URL: {parish_url}")
        soup = get_soup(parish_url)
        if not soup:
            return {}

        text = soup.get_text(separator=' ', strip=True).lower()

        adoration_hours = _first_hours(_ADORATION_PATTERN, text)
        if adoration_hours is not None:
            logging.info(f"Found Adoration Hours: {adoration_hours}")

        reconciliation_hours = _first_hours(_RECONCILIATION_PATTERN, text)
        if reconciliation_hours is not None:
            logging.info(f"Found Reconciliation Hours: {reconciliation_hours}")

        return {
            'Adoration Hours': adoration_hours,
            'Reconciliation Hours': reconciliation_hours
        }
    except Exception as e:
        logging.error(f"An error occurred while extracting parish info from {parish_url}: {e}")
        return {}

In [None]:
# Cell 2a: Test the enhanced get_soup function

test_url = "https://www.usccb.org/about/bishops-and-dioceses/all-dioceses"
soup = get_soup(test_url)

# Smoke test: report whether the dioceses listing could be downloaded and parsed.
if not soup:
    print("Failed to fetch the dioceses page. Please check your connection or the URL.")
else:
    logging.info("Successfully fetched and parsed the dioceses page.")
    print("Dioceses page fetched successfully.")

Dioceses page fetched successfully.


In [None]:
# Cell 3 (Updated): Scrape the list of dioceses from the USCCB website with the updated function

usccb_dioceses_url = "https://www.usccb.org/about/bishops-and-dioceses/all-dioceses"

# Scrape the listing and load the resulting records into a DataFrame.
dioceses = extract_dioceses(usccb_dioceses_url)
dioceses_df = pd.DataFrame(data=dioceses)

# Preview the first ten rows to confirm the scrape produced sensible data.
dioceses_df.head(n=10)

Unnamed: 0,Name,Address,Website
0,Archdiocese of Mobile,"400 Government Street, Mobile, AL 36602",https://mobarch.org/
1,Diocese of Birmingham,"2121 3rd Avenue North, P.O. Box 12047, Birming...",http://www.bhmdiocese.org/
2,Archdiocese of Anchorage-Juneau,"225 Cordova Street, Anchorage, AK 99501-2409",http://www.aoaj.org
3,Diocese of Fairbanks,"1316 Peger Road, Fairbanks, AK 99709-5199",http://www.cbna.info/
4,Holy Protection of Mary Byzantine Catholic Epa...,"8105 North 16th Street, Phoenix, AZ 85020",http://www.eparchyofphoenix.org/
5,Diocese of Phoenix,"400 East Monroe Street, Phoenix, AZ 85004-2336",http://www.diocesephoenix.org/
6,Diocese of Tucson,"P.O. Box 31, Tucson, AZ 85702",http://www.diocesetucson.org/
7,Diocese of Little Rock,"2500 N. Tyler Street, Little Rock, AR 72207",http://www.dolr.org/
8,Armenian Catholic Eparchy of Our Lady of Nareg...,"1510 East Mountain St, Glendale, CA 91207",http://www.ourladyofnareg.org/
9,Chaldean Catholic Eparchy of St. Peter the Apo...,"1627 Jamacha Way, El Cajon, CA 92019",https://www.stpeterdiocese.org/


In [None]:
# Cell 4: Initialize SQLite database and create tables

# Connect to the SQLite database (the file is created on first use)
conn = sqlite3.connect('catholic_dioceses_parishes.db')
cursor = conn.cursor()

# Create dioceses table
cursor.execute('''
    CREATE TABLE IF NOT EXISTS Dioceses (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        Name TEXT,
        Address TEXT,
        Website TEXT
    )
''')

# Create parishes table.
# The FOREIGN KEY clause must name the child column in *this* table
# (Diocese_ID); a qualified name such as Dioceses.id is a syntax error in
# SQLite and prevents the table from being created at all.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS Parishes (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        Diocese_ID INTEGER,
        Parish_Name TEXT,
        Parish_Website TEXT,
        Address TEXT,
        Adoration_Hours TEXT,
        Reconciliation_Hours TEXT,
        FOREIGN KEY(Diocese_ID) REFERENCES Dioceses(id)
    )
''')

conn.commit()

In [None]:
# Cell 5: Populate Dioceses table

# Bulk-insert one Dioceses row per DataFrame row, then commit once.
insert_diocese_sql = '''
        INSERT INTO Dioceses (Name, Address, Website)
        VALUES (?, ?, ?)
    '''
cursor.executemany(
    insert_diocese_sql,
    # Lazily produced so an empty DataFrame (no columns) inserts nothing.
    ((row['Name'], row['Address'], row['Website']) for _, row in dioceses_df.iterrows()),
)
conn.commit()

In [None]:
# Cell 6: Scrape parishes within each diocese and populate Parishes table

# Retrieve dioceses from the database
cursor.execute('SELECT id, Name, Website FROM Dioceses')
dioceses_db = cursor.fetchall()

for diocese_id, diocese_name, diocese_url in dioceses_db:
    print(f"Processing Diocese: {diocese_name}")
    # extract_dioceses stores the literal "N/A" when a diocese has no site
    # link. That string is truthy, so it must be checked explicitly or the
    # scraper would try (and retry) to fetch "N/A" as a URL.
    if not diocese_url or diocese_url == "N/A":
        print(f"No website found for {diocese_name}. Skipping.")
        continue

    parishes = extract_parish_links(diocese_url)
    print(f"Found {len(parishes)} parishes in {diocese_name}.")

    # Insert parishes without address and hours for now
    cursor.executemany('''
        INSERT INTO Parishes (Diocese_ID, Parish_Name, Parish_Website)
        VALUES (?, ?, ?)
    ''', [
        (diocese_id, parish.get('Parish Name'), parish.get('Parish Website'))
        for parish in parishes
    ])
    conn.commit()

    # To avoid overwhelming servers, wait for a short period
    time.sleep(1)

In [None]:
# Cell 7a: Find the parish directory page for each diocese

import re
import sqlite3
from urllib.parse import urljoin

def find_parish_directory(diocese_url):
    """
    Locate a likely parish-listing page on a diocese website.

    The homepage is fetched and its links are scanned in document order. The
    first link whose visible text or href contains one of the directory terms
    ('parish', 'parishes', 'churches', 'locations', 'directory') wins.

    Returns that link as an absolute URL, or None when the page cannot be
    fetched, no link matches, or an error occurs.
    """
    try:
        soup = get_soup(diocese_url)
        if not soup:
            return None

        # Terms that commonly appear in links to a parish directory
        directory_terms = ['parish', 'parishes', 'churches', 'locations', 'directory']

        def mentions_directory(label, target):
            # True when any term occurs in the link text or in its href.
            for term in directory_terms:
                if term in label or term in target.lower():
                    return True
            return False

        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.text.lower()
            if not mentions_directory(text, href):
                continue
            # urljoin resolves relative hrefs against the diocese URL
            full_url = urljoin(diocese_url, href)
            logging.info(f"Found potential parish directory: {full_url}")
            return full_url

        logging.warning(f"Could not find a parish directory page on {diocese_url}")
        return None
    except Exception as e:
        logging.error(f"Error while searching for parish directory on {diocese_url}: {e}")
        return None

# Establish database connection
conn = sqlite3.connect('catholic_dioceses_parishes.db')
try:
    cursor = conn.cursor()

    # Process each diocese
    cursor.execute('SELECT Name, Website FROM Dioceses')
    dioceses_db = cursor.fetchall()

    for diocese_name, diocese_url in dioceses_db:
        print(f"\nProcessing Diocese: {diocese_name}")

        # "N/A" is the placeholder extract_dioceses stores when a diocese has
        # no site link; skip the fetch (and its retries) for such entries.
        if diocese_url and diocese_url != "N/A":
            directory_url = find_parish_directory(diocese_url)
        else:
            directory_url = None

        if directory_url:
            print(f"Found parish directory at: {directory_url}")
        else:
            print("Could not find parish directory. You may need to search manually.")

        # Ask user if they want to proceed
        user_input = input("Do you want to proceed to the next diocese? (yes/no): ").lower().strip()
        if user_input != 'yes':
            print("Stopping the process.")
            break

    print("Diocese processing completed.")
finally:
    # Close the database connection even if the query or the loop raised
    conn.close()

OperationalError: no such table: Dioceses

In [None]:
# Cell 7b: Extract parishes from the directory page

def extract_parishes(directory_url):
    """
    Extracts parish information from the directory page.

    A link is treated as a parish when its visible text contains 'church',
    'parish' or 'cathedral' (case-insensitive). This is a generic heuristic
    and may need adjustment for specific diocese websites.

    Args:
        directory_url: URL of the page that lists the parishes.

    Returns:
        A list of dictionaries with the keys 'Parish Name', 'Parish Website'
        (absolute URL) and 'Address' (always '' here). Entries are unique on
        (name, website) and keep the order of first appearance. An empty list
        is returned when the page cannot be fetched or an error occurs.
    """
    try:
        soup = get_soup(directory_url)
        if not soup:
            return []

        name_terms = ('church', 'parish', 'cathedral')
        parishes = []
        # The same link frequently appears several times on a page (menu,
        # body, footer); remember what was already recorded so each parish
        # is returned once, as extract_parish_links also does.
        seen = set()
        for link in soup.find_all('a', href=True):
            text = link.text.strip()
            if not text:
                continue
            lowered = text.lower()
            if not any(term in lowered for term in name_terms):
                continue

            parish_url = urljoin(directory_url, link['href'])
            key = (text, parish_url)
            if key in seen:
                continue
            seen.add(key)

            parishes.append({
                'Parish Name': text,
                'Parish Website': parish_url,
                'Address': ''  # We might need to visit each parish page to get the address
            })

        return parishes
    except Exception as e:
        logging.error(f"Error extracting parishes from {directory_url}: {e}")
        return []

# Process each diocese and extract parishes.
# This cell opens its own connection: the connection used by the preceding
# cell is closed when that cell finishes, so its cursor can no longer be used.
conn = sqlite3.connect('catholic_dioceses_parishes.db')
try:
    cursor = conn.cursor()
    # Fetch the id as well so parishes can be linked to exactly one diocese
    # row, even when two dioceses share a name.
    cursor.execute('SELECT id, Name, Website FROM Dioceses')
    dioceses_db = cursor.fetchall()

    for diocese_id, diocese_name, diocese_url in dioceses_db:
        print(f"\nProcessing Diocese: {diocese_name}")

        # "N/A" is the placeholder extract_dioceses stores when a diocese has
        # no site link; skip the fetch (and its retries) for such entries.
        if diocese_url and diocese_url != "N/A":
            directory_url = find_parish_directory(diocese_url)
        else:
            directory_url = None

        if directory_url:
            print(f"Found parish directory at: {directory_url}")
            parishes = extract_parishes(directory_url)
            print(f"Extracted {len(parishes)} parishes.")

            # Display sample of parishes
            print("\nSample of parishes (up to 5):")
            for parish in parishes[:5]:
                print(f"- {parish['Parish Name']}: {parish['Parish Website']}")

            # Ask user if they want to save this data
            save_data = input("Do you want to save this parish data? (yes/no): ").lower().strip()
            if save_data == 'yes':
                rows = [
                    (diocese_id, parish['Parish Name'], parish['Parish Website'], parish['Address'])
                    for parish in parishes
                ]
                try:
                    cursor.executemany('''
                        INSERT INTO Parishes (Diocese_ID, Parish_Name, Parish_Website, Address)
                        VALUES (?, ?, ?, ?)
                    ''', rows)
                    conn.commit()
                    print("Parish data saved successfully.")
                except sqlite3.Error as exc:
                    # Discard the partial batch so a later commit cannot
                    # persist half of this diocese's parishes.
                    conn.rollback()
                    logging.error(f'Error saving parish data for {diocese_name}: {exc}')
                    print("Error saving data. See log for details.")
        else:
            print("Could not find parish directory. Skipping parish extraction for this diocese.")

        # Ask user if they want to proceed
        user_input = input("Do you want to proceed to the next diocese? (yes/no): ").lower().strip()
        if user_input != 'yes':
            print("Stopping the process.")
            break

    print("Diocese and parish processing completed.")
finally:
    conn.close()

In [None]:
# Cell 8: Finalize and close the database connection

# Optional: Export the database to a DataFrame.
# A dedicated connection is opened for the export because the connections
# used by earlier cells are closed by the time this cell runs.
conn = sqlite3.connect('catholic_dioceses_parishes.db')
try:
    parishes_final_df = pd.read_sql_query('''
        SELECT Parishes.Parish_Name, Parishes.Parish_Website, Dioceses.Name as Diocese,
               Parishes.Address, Parishes.Adoration_Hours, Parishes.Reconciliation_Hours
        FROM Parishes
        JOIN Dioceses ON Parishes.Diocese_ID = Dioceses.id
    ''', conn)
finally:
    # Close the database connection whether or not the query succeeded
    conn.close()

# Display a sample of the final data (kept as the cell's last expression so
# the notebook actually renders it)
parishes_final_df.head()