<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Find_Parish_Directory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import necessary libraries

import requests
from bs4 import BeautifulSoup
import sqlite3
import re
import os
from google.colab import userdata

In [None]:
# Cell 2: Clone GitHub repository and configure Git

# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

In [None]:
# Cell 3: Fetch URLs from SQLite database for dioceses without parish directory URLs

import sqlite3

# SQL query to join Dioceses and DiocesesParishDirectory tables
# and select dioceses without parish directory URLs.
# Both tables must already exist in data.db for this query to run.
query = """
SELECT d.Website
FROM Dioceses d
LEFT JOIN DiocesesParishDirectory dpd ON d.Website = dpd.diocese_url
WHERE dpd.parish_directory_url IS NULL OR dpd.parish_directory_url = ''
"""

conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()
    cursor.execute(query)
    # The LEFT JOIN yields one row per matching DiocesesParishDirectory row, so
    # a diocese with several rows there appears several times. Keep the first
    # occurrence of each website (dict preserves insertion order) and skip
    # NULL/empty websites, which cannot be fetched anyway.
    urls = list(dict.fromkeys(row[0] for row in cursor.fetchall() if row[0]))
finally:
    # Close the database connection even when the query raises
    conn.close()

print(f"Fetched {len(urls)} URLs from the database for dioceses without parish directory URLs")

# List to store parsed content
parsed_contents = []

# Fetch and parse content for each URL
# (Commented out to prevent actual web requests during demonstration)
# for url in urls:
#     try:
#         response = requests.get(url)
#         soup = BeautifulSoup(response.content, 'html.parser')
#         parsed_contents.append((url, soup))
#         print(f"Successfully fetched and parsed: {url}")
#     except Exception as e:
#         print(f"Error fetching or parsing {url}: {str(e)}")
#         parsed_contents.append((url, None))

# print(f"\nProcessed {len(parsed_contents)} URLs")

In [None]:
# Cell 4: Improved function to find parish listing URL with multiple link names

from urllib.parse import urljoin
import re

def find_parish_url(soup, base_url):
    """Return the absolute URL of the page's parish-directory link, if any.

    Anchors are matched against a fixed list of link names in two passes:
    first an exact (whitespace-stripped, case-sensitive) match of the link
    text, tried name by name in list order; then a case-insensitive substring
    match, tried link by link in document order.

    Args:
        soup: Parsed page to search; must provide ``find_all('a', href=True)``
            returning tags that support ``tag['href']`` and ``tag.get_text()``
            (a BeautifulSoup document does).
        base_url: URL the page was loaded from; relative hrefs are resolved
            against it.

    Returns:
        The absolute URL of the first matching link, or None if nothing
        matches.
    """
    # List of possible link names for parish directories
    parish_link_names = ['Churches', 'Directory of Parishes', 'Parishes', 'parishfinder', 'Parish Finder', 'Find a Parish', 'Locations']
    lowered_names = [name.lower() for name in parish_link_names]

    # Targets that cannot be a directory page: fragment-only links point back
    # at the page being searched (typical for dropdown toggles), and the
    # others are not fetchable web pages at all.
    unusable_prefixes = ('#', 'javascript:', 'mailto:', 'tel:')

    # Collect (link text, href) once for every anchor with a usable href, so a
    # matching-but-unusable anchor can no longer hide a later usable one.
    candidates = []
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        if href and not href.lower().startswith(unusable_prefixes):
            candidates.append((link.get_text().strip(), href))

    # Pass 1: exact match on the link text, honouring the order of the names
    for name in parish_link_names:
        for text, href in candidates:
            if text == name:
                return urljoin(base_url, href)

    # Pass 2: partial, case-insensitive match in document order.
    # A separate search inside navigation menus is not needed afterwards:
    # every anchor inside a menu is also in `candidates`, so any partial match
    # there has already been found by this pass.
    for text, href in candidates:
        lowered_text = text.lower()
        if any(name in lowered_text for name in lowered_names):
            return urljoin(base_url, href)

    # If nothing found, return None
    return None

# Example usage:
# url = "https://www.stpeterdiocese.org/"
# response = requests.get(url)
# soup = BeautifulSoup(response.content, 'html.parser')
# parish_url = find_parish_url(soup, url)
# if parish_url:
#     print(f"Found parish directory URL: {parish_url}")
# else:
#     print("No parish directory URL found")

In [None]:
# Temporary cell: removes the DiocesesParishDirectory table when it is present.
# Only needed until the table data structure is finalized.
import sqlite3

# Statement that removes the table; harmless when the table is absent
drop_table_query = "DROP TABLE IF EXISTS DiocesesParishDirectory;"

# Open the SQLite3 database file
connection = sqlite3.connect('data.db')

try:
    # Run the drop through a cursor and make it permanent
    cursor = connection.cursor()
    cursor.execute(drop_table_query)
    connection.commit()
    print("Table DiocesesParishDirectory deleted successfully.")
except sqlite3.Error as error:
    # Report database errors instead of aborting the notebook run
    print(f"Error while deleting table: {error}")
finally:
    # The connection is released whether or not the drop succeeded
    connection.close()

In [None]:
# Cell 5: Write the results to the database.

# Seconds to wait on a diocese website before giving up on it; without a
# timeout a single unresponsive server would stall the whole run.
REQUEST_TIMEOUT = 30

conn = sqlite3.connect('data.db')
try:
    cursor = conn.cursor()

    # Create table if not exists
    cursor.execute('''CREATE TABLE IF NOT EXISTS DiocesesParishDirectory
                      (diocese_url TEXT, parish_directory_url TEXT, found TEXT)''')

    for url in urls:
        print(f"Processing: {url}")
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.content, 'html.parser')

            parish_directory_url = find_parish_url(soup, url)

            if parish_directory_url:
                print(f"Found parish directory URL: {parish_directory_url}")
                found = "Success"
            else:
                print(f"No parish directory URL found for {url}")
                # Log the unsuccessful attempt in the database
                found = f"No parish directory URL found for {url}"

            # The table has no PRIMARY KEY or UNIQUE constraint, so
            # "INSERT OR REPLACE" would never replace anything and every run
            # would append another row for the same diocese. Remove any
            # earlier row explicitly before inserting the new result.
            cursor.execute("DELETE FROM DiocesesParishDirectory WHERE diocese_url = ?",
                           (url,))
            cursor.execute("INSERT INTO DiocesesParishDirectory VALUES (?, ?, ?)",
                           (url, parish_directory_url, found))

            # Commit per diocese so an interrupted run keeps finished results
            conn.commit()

        except Exception as e:
            # Best effort: report the failure and continue with the next site
            print(f"Error processing {url}: {str(e)}")

    conn.commit()
finally:
    # Release the connection even if the run is interrupted
    conn.close()
    print("\nDatabase connection closed")

In [None]:
# Cell 6: Verify the data in the SQLite database

# Open the database and pull at most five stored rows as a sanity check
conn = sqlite3.connect('data.db')
cursor = conn.cursor()
rows = cursor.execute("SELECT * FROM DiocesesParishDirectory LIMIT 5").fetchall()

# Show each sampled row on its own line
for row in rows:
    print(row)

# Done reading; release the connection
conn.close()
print("\nDatabase connection closed")

In [None]:
# Cell 7: Commit changes and push to GitHub
# Each step is an IPython shell escape, so git runs in the notebook's current
# working directory (Cell 2 changes into the cloned repository).
# Add changes to git: only the SQLite database file data.db is staged
!git add data.db

# Commit changes
# NOTE(review): presumably git exits without committing when data.db is unchanged — confirm
!git commit -m "Updated Parish Directory URLs for multiple dioceses in Find_Parish_Directory.ipynb"

# Push changes to GitHub (branch main of the remote named origin)
!git push origin main