<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Find_Adoration_and_Reconciliation_information_for_a_Parish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import sqlite3
import pandas as pd

In [2]:
# Cell 2
def get_sitemap_urls(url):
    try:
        response = requests.get(urljoin(url, '/sitemap.xml'))
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            return [loc.text for loc in soup.find_all('loc')]
    except:
        pass
    return []

In [3]:
# Cell 3
def search_for_keywords(url, keywords):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text().lower()
            return any(keyword.lower() in text for keyword in keywords)
    except:
        pass
    return False

In [4]:
# Cell 4
def extract_time_info(url, keyword):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()

            # Look for patterns like "X hours per week" or "X hours per month"
            time_pattern = re.compile(r'(\d+)\s*hours?\s*per\s*(week|month)', re.IGNORECASE)
            match = time_pattern.search(text)

            if match:
                hours = int(match.group(1))
                period = match.group(2).lower()
                return f"{hours} hours per {period}"

            # If no clear pattern is found, return the paragraph containing the keyword
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                if keyword.lower() in p.text.lower():
                    return p.text.strip()
    except:
        pass
    return "Information not found"

In [5]:
# Cell 5
def scrape_parish_data(url):
    sitemap_urls = get_sitemap_urls(url)
    all_urls = [url] + sitemap_urls

    reconciliation_found = False
    adoration_found = False
    reconciliation_info = ""
    adoration_info = ""

    for page_url in all_urls:
        if not reconciliation_found and search_for_keywords(page_url, ['Reconciliation', 'Confession']):
            reconciliation_found = True
            reconciliation_info = extract_time_info(page_url, 'Reconciliation')

        if not adoration_found and search_for_keywords(page_url, ['Adoration']):
            adoration_found = True
            adoration_info = extract_time_info(page_url, 'Adoration')

        if reconciliation_found and adoration_found:
            break

    return {
        'url': url,
        'offers_reconciliation': reconciliation_found,
        'reconciliation_info': reconciliation_info,
        'offers_adoration': adoration_found,
        'adoration_info': adoration_info
    }

In [6]:
# Cell 6
parish_urls = [
    'https://allsaintsdunwoody.org/',
    'https://sacredheartatlanta.org/',
    'https://cathedralctk.com/',
    'https://www.christourhopeatl.org/'
]

results = []
for url in parish_urls:
    print(f"Scraping {url}...")
    result = scrape_parish_data(url)
    result['parish_name'] = url.split('//')[1].split('.')[0]
    results.append(result)
    print(f"Completed scraping {url}")

Scraping https://allsaintsdunwoody.org/...
Completed scraping https://allsaintsdunwoody.org/
Scraping https://sacredheartatlanta.org/...
Completed scraping https://sacredheartatlanta.org/
Scraping https://cathedralctk.com/...
Completed scraping https://cathedralctk.com/
Scraping https://www.christourhopeatl.org/...
Completed scraping https://www.christourhopeatl.org/


In [7]:
# Cell 7
df = pd.DataFrame(results)
print(df)

                                 url  offers_reconciliation  \
0     https://allsaintsdunwoody.org/                   True   
1    https://sacredheartatlanta.org/                  False   
2          https://cathedralctk.com/                   True   
3  https://www.christourhopeatl.org/                  False   

                                 reconciliation_info  offers_adoration  \
0  Penance/Reconciliation\nConfessions are heard ...              True   
1                                                                False   
2                              Information not found              True   
3                                                                False   

          adoration_info         parish_name  
0  Information not found   allsaintsdunwoody  
1                         sacredheartatlanta  
2  Information not found        cathedralctk  
3                                        www  


In [8]:
# Cell 8
conn = sqlite3.connect('parish_data.db')
df.to_sql('parishes', conn, if_exists='replace', index=False)
conn.close()

print("Data saved to parish_data.db")

Data saved to parish_data.db


In [9]:
# Cell 9
# Verify data in the database
conn = sqlite3.connect('parish_data.db')
df_from_db = pd.read_sql_query("SELECT * FROM parishes", conn)
conn.close()

print(df_from_db)

                                 url  offers_reconciliation  \
0     https://allsaintsdunwoody.org/                      1   
1    https://sacredheartatlanta.org/                      0   
2          https://cathedralctk.com/                      1   
3  https://www.christourhopeatl.org/                      0   

                                 reconciliation_info  offers_adoration  \
0  Penance/Reconciliation\nConfessions are heard ...                 1   
1                                                                    0   
2                              Information not found                 1   
3                                                                    0   

          adoration_info         parish_name  
0  Information not found   allsaintsdunwoody  
1                         sacredheartatlanta  
2  Information not found        cathedralctk  
3                                        www  
