<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Find_Parish_Directory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import necessary libraries

import requests
from bs4 import BeautifulSoup
import sqlite3
import re
import os
from google.colab import userdata

In [None]:
# Cell 2: Clone GitHub repository and configure Git
#
# Clones the repository on first run (or pulls `main` if a directory with the
# repository's name already exists), changes the working directory into it,
# and sets the global Git identity used for later commits.

# GitHub credentials
# The username and personal access token are read through google.colab's
# `userdata.get` (presumably Colab secrets — confirm they are defined for
# this notebook before running).
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
# NOTE: the personal access token is embedded in this URL, so avoid printing
# REPO_URL or sharing any output that echoes it.
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
# NOTE(review): this is a relative path check and both branches below call
# os.chdir(GITHUB_REPO). Re-running this cell in the same session therefore
# looks for 'USCCB' *inside* the repository directory — confirm whether a
# nested clone on re-run is acceptable.
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
# `--global` applies this identity to every repository for the current user,
# not just this clone.
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

In [None]:
# Cell 3: Fetch URLs from SQLite database and parse content

import requests
from bs4 import BeautifulSoup
import sqlite3

# Read the diocese website URLs from the local SQLite database.
# The connection is only needed for this one query, so it is released in a
# `finally` block: a failing query (e.g. the Dioceses table is missing) no
# longer leaves data.db open for the rest of the notebook session.
conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()

    # Fetch first few URLs from the Dioceses table
    cursor.execute("SELECT Website FROM Dioceses LIMIT 3")
    urls = [row[0] for row in cursor.fetchall()]
finally:
    # Close the database connection, whether or not the query succeeded
    conn.close()

print(f"Fetched {len(urls)} URLs from the database")

# List to store parsed content
parsed_contents = []

# Fetch and parse content for each URL
# for url in urls:
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         parsed_contents.append((url, soup))
#         print(f"Successfully fetched and parsed: {url}")
#     except Exception as e:
#         print(f"Error fetching or parsing {url}: {str(e)}")
#         parsed_contents.append((url, None))

# print(f"\nProcessed {len(parsed_contents)} URLs")

In [None]:
# Cell 4: Improved function to find parish listing URL

import re
from urllib.parse import urljoin

def find_parish_url(soup, base_url):
    """Return the absolute URL of the page most likely to list parishes.

    The page is searched with progressively weaker heuristics and the first
    hit wins:

    1. a link whose visible text contains one of the keywords,
    2. a link whose ``href`` contains one of the keywords,
    3. a button-styled element (class matching ``btn``/``button``) whose text
       contains a keyword and which carries an ``href``,
    4. any link whose ``href`` contains ``/parish`` or ``/church``.

    Links that cannot lead to a fetchable page (empty ``href``, in-page
    ``#`` anchors, ``javascript:``, ``mailto:``, ``tel:``) are ignored, so a
    menu toggle such as ``<a href="#">Parishes</a>`` no longer shadows the
    real directory link further down the page.

    Args:
        soup: Parsed page exposing BeautifulSoup's ``find_all`` interface,
            or ``None`` when the page could not be fetched/parsed.
        base_url: URL the page was loaded from; relative links are resolved
            against it with ``urljoin``.

    Returns:
        The resolved URL as a string, or ``None`` if ``soup`` is ``None`` or
        no candidate link is found.
    """
    if soup is None:
        return None

    # List of potential keywords and phrases (matched case-insensitively as
    # substrings, so e.g. 'map' also matches 'sitemap').
    keywords = (
        'parishes', 'parish', 'churches', 'locations', 'directory', 'find a parish',
        'our parishes', 'parish finder', 'mass times', 'parish list',
        'find a church', 'locator', 'parish search', 'map',
    )

    # href prefixes that never point at a page we could download
    non_page_prefixes = ('#', 'javascript:', 'mailto:', 'tel:')

    def contains_keyword(text):
        lowered = (text or '').lower()
        return any(keyword in lowered for keyword in keywords)

    def is_page_link(href):
        candidate = (href or '').strip().lower()
        return bool(candidate) and not candidate.startswith(non_page_prefixes)

    def resolve(href):
        return urljoin(base_url, href.strip())

    # Collect the usable links once; both keyword passes iterate over them.
    links = [a for a in soup.find_all('a', href=True) if is_page_link(a['href'])]

    # Pass 1: link text is the strongest signal, so it is checked first
    for a in links:
        if contains_keyword(a.text):
            return resolve(a['href'])

    # Pass 2: keyword in the link target itself
    for a in links:
        if contains_keyword(a['href']):
            return resolve(a['href'])

    # A separate nav/menu pass is unnecessary: every link inside a menu is
    # also in `links` and has already been tested with the same predicate.

    # Pass 3: button-styled elements (covers <button href="..."> markup,
    # which the <a href> passes above do not see)
    for button in soup.find_all(['button', 'a'], class_=re.compile('(btn|button)', re.I)):
        href = button.get('href')
        if is_page_link(href) and contains_keyword(button.text):
            return resolve(href)

    # Pass 4: last resort — any link containing '/parish' or '/church'
    # ('/church' is not covered by the keyword list above)
    for a in soup.find_all('a', href=re.compile('/(parish|church)', re.I)):
        if is_page_link(a['href']):
            return resolve(a['href'])

    # If nothing found, return None
    return None

# Example usage:
# parish_url = find_parish_url(soup, base_url)
# if parish_url:
#     print(f"Found parish URL: {parish_url}")
# else:
#     print("No parish URL found")

In [None]:
# Cell 5: Iterate through URLs and find URLs for parish directories, maps, etc.

# For every diocese website in `urls`: locate its parish directory page,
# scrape the parish entries found there, and store them in the
# DiocesesParishDirectory table of data.db.

# Seconds to wait on each HTTP request, so one unresponsive site cannot
# stall the whole run.
REQUEST_TIMEOUT = 30

conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()

    # Create table if not exists
    cursor.execute('''CREATE TABLE IF NOT EXISTS DiocesesParishDirectory
                      (diocese TEXT, name TEXT, address TEXT, url TEXT)''')

    for url in urls:
        print(f"Processing: {url}")
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # find_parish_url needs the page's own URL to resolve relative
            # links; it returns an absolute URL built with urljoin, so no
            # manual prefixing is required here.
            directory_url = find_parish_url(soup, url)
            if not directory_url:
                print(f"No parish listing found for {url}")
                continue

            directory_response = requests.get(directory_url, timeout=REQUEST_TIMEOUT)
            directory_response.raise_for_status()
            directory_soup = BeautifulSoup(directory_response.content, 'html.parser')

            # Heuristic: each parish is assumed to sit in a <div> whose
            # class contains 'parish' — sites with other markup yield zero rows.
            parishes = directory_soup.find_all('div', class_=re.compile('parish'))

            for parish in parishes:
                name_tag = parish.find('h2')
                name = name_tag.text.strip() if name_tag else "Name not found"

                address_tag = parish.find('p', class_=re.compile('address'))
                address = address_tag.text.strip() if address_tag else "Address not found"

                # Kept separate from directory_url so the per-parish link
                # never overwrites the directory page's URL.
                link_tag = parish.find('a', href=re.compile('http'))
                parish_link = link_tag['href'] if link_tag else "URL not found"

                # Insert into the table created above (and read by the
                # verification cell), not a non-existent `parishes` table.
                cursor.execute(
                    "INSERT INTO DiocesesParishDirectory VALUES (?, ?, ?, ?)",
                    (url, name, address, parish_link),
                )

            print(f"Processed {len(parishes)} parishes for {url}")
        except Exception as e:
            # Best-effort scrape: report the failure and move on to the
            # next diocese instead of aborting the run.
            print(f"Error processing {url}: {str(e)}")

    conn.commit()
finally:
    conn.close()
    print("\nDatabase connection closed")

In [None]:
# Cell 6: Verify the data in the SQLite database

# Print up to five stored rows as a quick sanity check of the scrape.
# The connection is closed in `finally` so a failing query (e.g. the table
# does not exist yet) does not leave data.db open.
conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()

    cursor.execute("SELECT * FROM DiocesesParishDirectory LIMIT 5")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    conn.close()
    print("\nDatabase connection closed")

In [None]:
# Cell 7: Commit changes and push to GitHub
#
# These commands run in the notebook's current working directory, which the
# clone/configure cell sets to the repository via os.chdir(GITHUB_REPO).

# Add changes to git
# Only the SQLite database file is staged; other modified files are left alone.
!git add data.db

# Commit changes
!git commit -m "Updated parishes data for multiple dioceses"

# Push changes to GitHub
# Pushes to the `main` branch of the `origin` remote.
!git push origin main