<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Find_Adoration_and_Reconciliation_information_for_a_Parish.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
# Cell 1
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import sqlite3
import pandas as pd

In [34]:
# Cell 2
def get_sitemap_urls(url):
    try:
        response = requests.get(urljoin(url, '/sitemap.xml'))
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'xml')
            return [loc.text for loc in soup.find_all('loc')]
    except:
        pass
    return []

In [35]:
# Cell 3
def search_for_keywords(url, keywords):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text().lower()
            return any(keyword.lower() in text for keyword in keywords)
    except:
        pass
    return False

In [36]:
# Cell 4
def extract_time_info(url, keyword):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()

            # Look for patterns like "X hours per week" or "X hours per month"
            time_pattern = re.compile(r'(\d+)\s*hours?\s*per\s*(week|month)', re.IGNORECASE)
            match = time_pattern.search(text)

            if match:
                hours = int(match.group(1))
                period = match.group(2).lower()
                return f"{hours} hours per {period}"

            # If no clear pattern is found, return the paragraph containing the keyword
            paragraphs = soup.find_all('p')
            for p in paragraphs:
                if keyword.lower() in p.text.lower():
                    return p.text.strip()
    except:
        pass
    return "Information not found"

In [48]:
# Cell 5 (Changed)
def scrape_parish_data(url):
    sitemap_urls = get_sitemap_urls(url)
    all_urls = [url] + sitemap_urls
    print(f"Found {len(all_urls)} URLs on Sitemap page:")
    for page_url  in all_urls:
        print(page_url)

    reconciliation_found = False
    adoration_found = False
    reconciliation_info = ""
    adoration_info = ""
    reconciliation_page = ""
    adoration_page = ""

    for page_url in all_urls:
        print(f"Checking {page_url}...")
        if not reconciliation_found and search_for_keywords(page_url, ['Reconciliation', 'Confession']):
            reconciliation_found = True
            reconciliation_info = extract_time_info(page_url, 'Reconciliation')
            reconciliation_page = page_url

        if not adoration_found and search_for_keywords(page_url, ['Adoration']):
            adoration_found = True
            adoration_info = extract_time_info(page_url, 'Adoration')
            adoration_page = page_url

        if reconciliation_found and adoration_found:
            break

    return {
        'url': url,
        'offers_reconciliation': reconciliation_found,
        'reconciliation_info': reconciliation_info,
        'reconciliation_page': reconciliation_page,
        'offers_adoration': adoration_found,
        'adoration_info': adoration_info,
        'adoration_page': adoration_page
    }


In [49]:
# Cell 6
parish_urls = [
    'https://allsaintsdunwoody.org/',
#    'https://sacredheartatlanta.org/',
#    'https://cathedralctk.com/',
    'https://www.christourhopeatl.org/'
]

results = []
for url in parish_urls:
    print(f"Scraping {url}sitemap...")
    result = scrape_parish_data(url)
    result['parish_name'] = url.split('//')[1].split('.')[0]
    results.append(result)
    print(f"Completed scraping {url}")

Scraping https://allsaintsdunwoody.org/sitemap...
Found 9 URLs on Sitemap page:
https://allsaintsdunwoody.org/
https://allsaintsdunwoody.org/wp-sitemap-posts-post-1.xml
https://allsaintsdunwoody.org/wp-sitemap-posts-page-1.xml
https://allsaintsdunwoody.org/wp-sitemap-posts-calendar-1.xml
https://allsaintsdunwoody.org/wp-sitemap-posts-envira-1.xml
https://allsaintsdunwoody.org/wp-sitemap-taxonomies-category-1.xml
https://allsaintsdunwoody.org/wp-sitemap-taxonomies-calendar_category-1.xml
https://allsaintsdunwoody.org/wp-sitemap-taxonomies-media_category-1.xml
https://allsaintsdunwoody.org/wp-sitemap-users-1.xml
Checking https://allsaintsdunwoody.org/...
Completed scraping https://allsaintsdunwoody.org/
Scraping https://www.christourhopeatl.org/sitemap...
Found 1 URLs on Sitemap page:
https://www.christourhopeatl.org/
Checking https://www.christourhopeatl.org/...
Completed scraping https://www.christourhopeatl.org/


In [39]:
# Cell 7
df = pd.DataFrame(results)
print(df)

                                                 url  offers_reconciliation  \
0  https://allsaintsdunwoody.org/wp-sitemap-users...                   True   
1                    https://sacredheartatlanta.org/                  False   
2  https://cathedralctk.com/wp-sitemap-taxonomies...                   True   
3                  https://www.christourhopeatl.org/                  False   

                                 reconciliation_info  \
0  Penance/Reconciliation\nConfessions are heard ...   
1                                                      
2                              Information not found   
3                                                      

              reconciliation_page  offers_adoration         adoration_info  \
0  https://allsaintsdunwoody.org/              True  Information not found   
1                                             False                          
2       https://cathedralctk.com/              True  Information not found   
3          

In [40]:
# Cell 8
conn = sqlite3.connect('parish_data.db')
df.to_sql('parishes', conn, if_exists='replace', index=False)
conn.close()

print("Data saved to parish_data.db")

Data saved to parish_data.db


In [41]:
# Cell 9
# Verify data in the database
conn = sqlite3.connect('parish_data.db')
df_from_db = pd.read_sql_query("SELECT * FROM parishes", conn)
conn.close()

print(df_from_db)

                                                 url  offers_reconciliation  \
0  https://allsaintsdunwoody.org/wp-sitemap-users...                      1   
1                    https://sacredheartatlanta.org/                      0   
2  https://cathedralctk.com/wp-sitemap-taxonomies...                      1   
3                  https://www.christourhopeatl.org/                      0   

                                 reconciliation_info  \
0  Penance/Reconciliation\nConfessions are heard ...   
1                                                      
2                              Information not found   
3                                                      

              reconciliation_page  offers_adoration         adoration_info  \
0  https://allsaintsdunwoody.org/                 1  Information not found   
1                                                 0                          
2       https://cathedralctk.com/                 1  Information not found   
3          