<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Adoration_and_Reconciliation_Measurement_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Cell 1: Import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import re
from urllib.parse import urljoin, urlparse
import time
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("scraping.log"),
        logging.StreamHandler()
    ]
)

In [11]:
# Cell 2: Define helper functions

def get_soup(url, retries=3, backoff_factor=1.0):
    """
    Fetches the content at the given URL and returns a BeautifulSoup object.
    Implements retries with exponential backoff in case of request failures.
    """
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/58.0.3029.110 Safari/537.3'),
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    for attempt in range(1, retries + 1):
        try:
            logging.info(f"Attempt {attempt}: Fetching URL: {url}")
            response = requests.get(url, headers=headers, timeout=20)
            logging.info(f"Received status code: {response.status_code}")
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            logging.warning(f"Attempt {attempt} failed with error: {e}")
            if attempt == retries:
                logging.error(f"All {retries} attempts failed for URL: {url}")
                return None
            sleep_time = backoff_factor * (2 ** (attempt - 1))
            logging.info(f"Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)

def extract_dioceses(soup):
    """
    Extracts dioceses information from the parsed HTML.
    Returns a list of dictionaries with diocese details.
    """
    dioceses = []
    diocese_containers = soup.find_all('div', class_='views-row')

    logging.info(f"Found {len(diocese_containers)} potential diocese containers")

    for i, container in enumerate(diocese_containers):
        logging.info(f"Processing container {i+1}")

        da_wrap = container.find('div', class_='da-wrap')
        if not da_wrap:
            logging.warning(f"No da-wrap found in container {i+1}")
            continue

        name_div = da_wrap.find('div', class_='da-title')
        diocese_name = name_div.get_text(strip=True) if name_div else "N/A"
        logging.info(f"Diocese name: {diocese_name}")

        address_div = da_wrap.find('div', class_='da-address')
        address_parts = []
        if address_div:
            for div in address_div.find_all('div', recursive=False):
                text = div.get_text(strip=True)
                if text:
                    address_parts.append(text)

        address = ", ".join(address_parts)
        logging.info(f"Address: {address}")

        website_div = da_wrap.find('div', class_='site')
        website_url = website_div.find('a')['href'] if website_div and website_div.find('a') else "N/A"
        logging.info(f"Website: {website_url}")

        dioceses.append({
            'Name': diocese_name,
            'Address': address,
            'Website': website_url
        })

    return dioceses

In [12]:
# Cell 3: Fetch and parse the HTML content from URL

url = "https://www.usccb.org/about/bishops-and-dioceses/all-dioceses"
soup = get_soup(url)

if soup:
    print("Successfully fetched and parsed the dioceses page.")
    # Print the first 1000 characters of the HTML to check its structure
    print("First 1000 characters of the HTML:")
    print(soup.prettify()[:1000])
else:
    print("Failed to fetch the dioceses page. Please check your connection or the URL.")
    exit()

Successfully fetched and parsed the dioceses page.
First 1000 characters of the HTML:
<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
 <head>
  <!-- Google Tag Manager -->
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-5JHPTP');
  </script>
  <!-- End Google Tag Manager -->
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"NRJS-91f65b8eb2c03037f7e",applicationID:"578652560"};;/*! For license information please see nr-loader-rum-1.266.0.min.js.LICENSE.txt */
(()=>{var e,t,r={8122:(e,t,r)=>{"use strict";r.d(t,{a:()=>i});var n=r(944);

In [13]:
# Cell 4: Extract dioceses information

dioceses = extract_dioceses(soup)
print(f"Extracted information for {len(dioceses)} dioceses.")

if len(dioceses) == 0:
    print("No dioceses were extracted. Printing the structure of the page:")
    print(soup.prettify())

Extracted information for 196 dioceses.


In [14]:
# Cell 5: Create a DataFrame and display results

dioceses_df = pd.DataFrame(dioceses)
print(dioceses_df.head())

                                                Name  \
0                              Archdiocese of Mobile   
1                              Diocese of Birmingham   
2                    Archdiocese of Anchorage-Juneau   
3                               Diocese of Fairbanks   
4  Holy Protection of Mary Byzantine Catholic Epa...   

                                             Address  \
0  400 Government Street, Mobile,AL36602, https:/...   
1  2121 3rd Avenue North, P.O. Box 12047, Birming...   
2  225 Cordova Street, Anchorage,AK99501-2409, ht...   
3  1316 Peger Road, Fairbanks,AK99709-5199, http:...   
4  8105 North 16th Street, Phoenix,AZ85020, http:...   

                            Website  
0              https://mobarch.org/  
1        http://www.bhmdiocese.org/  
2               http://www.aoaj.org  
3             http://www.cbna.info/  
4  http://www.eparchyofphoenix.org/  


In [15]:
# Cell 6: Save to CSV

dioceses_df.to_csv('dioceses.csv', index=False)
print("CSV file 'dioceses.csv' has been created.")

CSV file 'dioceses.csv' has been created.


In [16]:
# Cell 7: Initialize SQLite database and create table

conn = sqlite3.connect('catholic_dioceses.db')
cursor = conn.cursor()

cursor.execute('''
    CREATE TABLE IF NOT EXISTS Dioceses (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        Name TEXT,
        Address TEXT,
        Website TEXT
    )
''')

conn.commit()


In [17]:
# Cell 8: Populate Dioceses table

for _, row in dioceses_df.iterrows():
    cursor.execute('''
        INSERT INTO Dioceses (Name, Address, Website)
        VALUES (?, ?, ?)
    ''', (row['Name'], row['Address'], row['Website']))

conn.commit()
print("Data has been inserted into the SQLite database.")

# Cell 9: Query and display data from the database

cursor.execute('SELECT * FROM Dioceses LIMIT 5')
results = cursor.fetchall()

print("First 5 entries from the database:")
for row in results:
    print(row)

# Close the database connection
conn.close()

print("Database connection closed.")


Data has been inserted into the SQLite database.
First 5 entries from the database:
(1, 'Archdiocese of Mobile', '400 Government Street, Mobile,AL36602, https://mobarch.org/', 'https://mobarch.org/')
(2, 'Diocese of Birmingham', '2121 3rd Avenue North, P.O. Box 12047, Birmingham,AL35202-2047, http://www.bhmdiocese.org/', 'http://www.bhmdiocese.org/')
(3, 'Archdiocese of Anchorage-Juneau', '225 Cordova Street, Anchorage,AK99501-2409, http://www.aoaj.org', 'http://www.aoaj.org')
(4, 'Diocese of Fairbanks', '1316 Peger Road, Fairbanks,AK99709-5199, http://www.cbna.info/', 'http://www.cbna.info/')
(5, 'Holy Protection of Mary Byzantine Catholic Eparchy of Phoenix', '8105 North 16th Street, Phoenix,AZ85020, http://www.eparchyofphoenix.org/', 'http://www.eparchyofphoenix.org/')
Database connection closed.


In [19]:
# Cell 9: Find the parish directory page for each diocese

import re
import sqlite3
from urllib.parse import urljoin

def find_parish_directory(diocese_url):
    """
    Searches the diocese website to find the page that lists parishes.
    Returns the URL of the parish directory page if found, None otherwise.
    """
    try:
        soup = get_soup(diocese_url)
        if not soup:
            return None

        # List of common terms that might indicate a parish directory
        directory_terms = ['parish', 'parishes', 'churches', 'locations', 'directory']

        # Search for links containing these terms
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.text.lower()
            if any(term in text or term in href.lower() for term in directory_terms):
                # Construct full URL if it's a relative path
                full_url = urljoin(diocese_url, href)
                logging.info(f"Found potential parish directory: {full_url}")
                return full_url

        logging.warning(f"Could not find a parish directory page on {diocese_url}")
        return None
    except Exception as e:
        logging.error(f"Error while searching for parish directory on {diocese_url}: {e}")
        return None

# Establish database connection
conn = sqlite3.connect('catholic_dioceses.db')
cursor = conn.cursor()

# Process each diocese
cursor.execute('SELECT Name, Website FROM Dioceses')
dioceses_db = cursor.fetchall()

for diocese in dioceses_db:
    diocese_name, diocese_url = diocese
    print(f"\nProcessing Diocese: {diocese_name}")

    directory_url = find_parish_directory(diocese_url)
    if directory_url:
        print(f"Found parish directory at: {directory_url}")
    else:
        print("Could not find parish directory. You may need to search manually.")

    # Ask user if they want to proceed
    user_input = input("Do you want to proceed to the next diocese? (yes/no): ").lower().strip()
    if user_input != 'yes':
        print("Stopping the process.")
        break

print("Diocese processing completed.")

# Close the database connection
conn.close()




Processing Diocese: Archdiocese of Mobile


ERROR:root:All 3 attempts failed for URL: https://mobarch.org/


Could not find parish directory. You may need to search manually.
Do you want to proceed to the next diocese? (yes/no): yes





Processing Diocese: Diocese of Birmingham


ERROR:root:All 3 attempts failed for URL: http://www.bhmdiocese.org/


Could not find parish directory. You may need to search manually.
Do you want to proceed to the next diocese? (yes/no): yes





Processing Diocese: Archdiocese of Anchorage-Juneau


ERROR:root:All 3 attempts failed for URL: http://www.aoaj.org


Could not find parish directory. You may need to search manually.


KeyboardInterrupt: Interrupted by user

In [None]:
# Cell 10: Extract parishes from the directory page

def extract_parishes(directory_url):
    """
    Extracts parish information from the directory page.
    Returns a list of dictionaries containing parish details.
    """
    try:
        soup = get_soup(directory_url)
        if not soup:
            return []

        parishes = []
        # This is a generic approach and might need adjustment for specific diocese websites
        for link in soup.find_all('a', href=True):
            href = link['href']
            text = link.text.strip()
            if text and any(term in text.lower() for term in ['church', 'parish', 'cathedral']):
                parish_url = urljoin(directory_url, href)
                parishes.append({
                    'Parish Name': text,
                    'Parish Website': parish_url,
                    'Address': ''  # We might need to visit each parish page to get the address
                })

        return parishes
    except Exception as e:
        logging.error(f"Error extracting parishes from {directory_url}: {e}")
        return []

# Process each diocese and extract parishes
cursor.execute('SELECT Name, Website FROM Dioceses')
dioceses_db = cursor.fetchall()

for diocese in dioceses_db:
    diocese_name, diocese_url = diocese
    print(f"\nProcessing Diocese: {diocese_name}")

    directory_url = find_parish_directory(diocese_url)
    if directory_url:
        print(f"Found parish directory at: {directory_url}")
        parishes = extract_parishes(directory_url)
        print(f"Extracted {len(parishes)} parishes.")

        # Display sample of parishes
        print("\nSample of parishes (up to 5):")
        for parish in parishes[:5]:
            print(f"- {parish['Parish Name']}: {parish['Parish Website']}")

        # Ask user if they want to save this data
        save_data = input("Do you want to save this parish data? (yes/no): ").lower().strip()
        if save_data == 'yes':
            try:
                cursor.executemany('''
                    INSERT INTO Parishes (Diocese_ID, Parish_Name, Parish_Website, Address)
                    SELECT Dioceses.rowid, ?, ?, ?
                    FROM Dioceses
                    WHERE Dioceses.Name = ?
                ''', [(parish['Parish Name'], parish['Parish Website'], parish['Address'], diocese_name) for parish in parishes])
                conn.commit()
                print("Parish data saved successfully.")
            except Exception as exc:
                logging.error(f'Error saving parish data for {diocese_name}: {exc}')
                print(f"Error saving data. See log for details.")
    else:
        print("Could not find parish directory. Skipping parish extraction for this diocese.")

    # Ask user if they want to proceed
    user_input = input("Do you want to proceed to the next diocese? (yes/no): ").lower().strip()
    if user_input != 'yes':
        print("Stopping the process.")
        break

print("Diocese and parish processing completed.")
conn.close()

In [None]:
# Cell 11: Finalize and close the database connection

# Optional: Export the database to a DataFrame
parishes_final_df = pd.read_sql_query('''
    SELECT Parishes.Parish_Name, Parishes.Parish_Website, Dioceses.Name as Diocese,
           Parishes.Address, Parishes.Adoration_Hours, Parishes.Reconciliation_Hours
    FROM Parishes
    JOIN Dioceses ON Parishes.Diocese_ID = Dioceses.id
''', conn)

# Display a sample of the final data
parishes_final_df.head()

# Close the database connection
conn.close()