<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_From_Map.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Install libraries
import os
from google.colab import userdata

# Define the filename for the Chrome installer
chrome_installer_filename = "google-chrome-stable_current_amd64.deb"

# Check if the file already exists before downloading
if not os.path.exists(chrome_installer_filename):
    # Download the Google Chrome installer if it doesn't exist
    !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

# Install the downloaded package
!dpkg -i {chrome_installer_filename}

# Fix any dependency issues
!apt-get -f install -y

# Install webdriver-manager
!pip install webdriver-manager

# Install Selenium
!pip install selenium

In [None]:
# Cell 2: Clone GitHub repository and configure Git

# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

In [None]:
# Cell 2.1: Display initial database status
# This is after cloning the repo and before database operations.
# The CWD should be the repo root due to os.chdir in the previous cell.
import sys

# display_database_status is otherwise first imported in Cell 3.1, which runs
# after this cell; import it here so a top-to-bottom run does not hit NameError.
if '.' not in sys.path:
    sys.path.insert(0, '.')
from db_utils import display_database_status

print("--- Displaying Initial Database Status (Build_Parishes_Database_From_Map.ipynb) ---")
display_database_status('data.db')

In [None]:
# Cell 3
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import sqlite3
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException


In [None]:
# Cell 3.1: Import display_database_status from db_utils.py
import sys
import os

# db_utils.py is assumed to sit in the repo root (the current directory), so
# make sure the current directory is searched for modules.
if sys.path.count('.') == 0:
    sys.path.insert(0, '.')

try:
    from db_utils import display_database_status
except ImportError as e:
    # Report the failure instead of raising so the rest of the cell output is readable.
    print(f"Error importing display_database_status: {e}")
    print("Make sure db_utils.py is in the same directory or sys.path is configured correctly.")
else:
    print("Successfully imported display_database_status from db_utils.py")

In [None]:
# Cell 4: Set up Selenium with Chrome
options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',  # Run in headless mode
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    options.add_argument(chrome_flag)

# webdriver-manager installs a chromedriver and returns its path for the Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=service)

In [None]:
# Cell 9.1: Display final database status with details for parishes table
# NOTE(review): this cell is labelled 9.1 / "Final" but sits before Cell 5, so a
# top-to-bottom run executes it before the scrape and insert cells. It presumably
# belongs after Cell 9 — confirm and move it there.
print("--- Displaying Final Database Status (Build_Parishes_Database_From_Map.ipynb) ---")
display_database_status('data.db', show_details=True, tables_to_show=['parishes'])

In [None]:
# Cell 5: Fetch the web page
url = "https://archatl.com/parishes/find-a-parish/"
driver.get(url)

# Block (for up to 30 seconds) until at least one parish title element is
# present in the DOM, i.e. the listing has started to render.
title_locator = (By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located(title_locator))

# Function to scroll to bottom of page
def scroll_to_bottom(browser=None, pause_seconds=2, max_rounds=None):
    """Scroll the page down repeatedly until its height stops growing.

    Each round scrolls to the current bottom of the document, waits
    ``pause_seconds`` so newly requested content can render, and then compares
    the document height with the previous one. Scrolling stops as soon as a
    round does not change the height.

    Args:
        browser: Object exposing ``execute_script(js)``. Defaults to the
            notebook-level ``driver`` when omitted.
        pause_seconds: Seconds to sleep after each scroll (default 2, the
            value that used to be hard-coded).
        max_rounds: Optional upper bound on the number of scroll rounds.
            ``None`` (default) keeps scrolling until the height is stable,
            which never terminates on a page that grows forever.
    """
    if browser is None:
        browser = driver

    rounds_done = 0
    last_height = browser.execute_script("return document.body.scrollHeight")
    while max_rounds is None or rounds_done < max_rounds:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_seconds)
        rounds_done += 1
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll: nothing more was loaded.
            break
        last_height = new_height

# Scroll to bottom to load all content: keeps scrolling until the document
# height stops changing between rounds.
scroll_to_bottom()

# Give it a little more time for all elements to load before the next cell
# reads the parish title elements.
time.sleep(5)

In [None]:
# Cell 6: Extract parish information
# Each title is expected to look like "<name>, <city>". The text after the LAST
# comma is taken as the city, so commas inside the name are preserved.
parishes = []
skipped_blank = 0

try:
    elements = driver.find_elements(By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')

    for element in elements:
        try:
            parish_info = element.text.strip()
        except StaleElementReferenceException:
            # The element was replaced in the DOM after it was located.
            print("Stale element encountered, skipping...")
            continue

        if not parish_info:
            # An element with no text carries no parish; do not store ('', 'N/A').
            skipped_blank += 1
            continue

        name_part, comma, city_part = parish_info.rpartition(',')
        if comma:
            name = name_part.strip()
            city = city_part.strip()
            print(f"Parsed: Name: {name}, City: {city}")
        else:
            # Handle entries without a city
            name = parish_info
            city = "N/A"
            print(f"Parsed (No City): Name: {name}, City: {city}")
        parishes.append((name, city))
finally:
    # Close the browser even if extraction fails part-way, so the Chrome
    # process is not left running.
    driver.quit()

if skipped_blank:
    print(f"Skipped {skipped_blank} element(s) with empty text")
print(f"\nTotal parishes found: {len(parishes)}")


In [None]:
# Cell 7: Create SQLite database and table
# Schema for the parishes table; created only when it does not already exist.
CREATE_PARISHES_TABLE_SQL = '''
    CREATE TABLE IF NOT EXISTS parishes (
        id INTEGER PRIMARY KEY,
        name TEXT,
        city TEXT
    )
'''

# Open (or create) the database file in the current working directory.
conn = sqlite3.connect('data.db')
cursor = conn.cursor()
cursor.execute(CREATE_PARISHES_TABLE_SQL)


In [None]:
# Cell 7.1: Insert data into the database
# Insert each (name, city) pair only if that exact pair is not already stored.
# A plain INSERT would append a full duplicate set of rows every time the
# notebook is run against an existing data.db.
changes_before_insert = conn.total_changes
cursor.executemany(
    '''
    INSERT INTO parishes (name, city)
    SELECT ?, ?
    WHERE NOT EXISTS (
        SELECT 1 FROM parishes WHERE name = ? AND city = ?
    )
    ''',
    # Each pair is bound twice: once for the inserted values, once for the
    # existence check.
    [(name, city, name, city) for name, city in parishes],
)
conn.commit()

inserted_count = conn.total_changes - changes_before_insert
print(f"Inserted {inserted_count} new parishes ({len(parishes) - inserted_count} already present)")

In [None]:
# Cell 8: Verify the data in the database
rows = cursor.execute('SELECT * FROM parishes').fetchall()

# Print only the first 20 rows for brevity
preview_limit = 20
for position in range(min(preview_limit, len(rows))):
    print(rows[position])

print(f"\nTotal parishes in database: {len(rows)}")

In [None]:
# Cell 9: Close the database connection
# Read the number of rows this connection actually inserted/updated/deleted
# before closing it. Reporting len(parishes) would claim rows were added even
# if the insert cell was skipped or wrote fewer rows than were scraped.
rows_written = conn.total_changes
conn.close()

print(f"Total parishes added to the database: {rows_written}")

In [None]:
# Cell 10: Commit changes and push to GitHub
# These commands run in the current working directory, which Cell 2 set to the
# cloned repository root.

# Stage the SQLite database file written by the earlier cells
!git add data.db

# Commit changes
!git commit -m "Added data in data.db using Build_Parishes_Database_From_Map.ipynb"

# Push changes to GitHub
!git push origin main

In [None]:
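# Expected output for https://archatl.com/parishes/find-a-parish/ (sample rows):
# NOTE(review): these rows include address and website fields, which this notebook
# does not insert (it writes only name and city) — confirm the sample is current.
# [(1, 'All Saints', 'Dunwoody', '2443 Mount Vernon Road, Dunwoody, GA 30338', 'http://www.allsaintscatholic.org'),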
#  (2, 'Annunciation', 'Milledgeville', '171 South Jefferson Street, Milledgeville, GA 31061', 'https://www.annunciationmilledgeville.org/'),
#  (3, 'Ascension', 'Brookhaven', '2989 Lavista Road, Brookhaven, GA 30329', 'http://www.ascca.net'),
#  (4, 'Asian Pacific Ministry', 'Doraville', '2699 Shallowford Road, Doraville, GA 30360', 'N/A'),
#  (5, 'Basilica of the Sacred Heart of Jesus', 'Atlanta', '353 Peachtree Street NE, Atlanta, GA 30308', 'http://www.sacredheartatlanta.org')]