<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1
import os

# Define the filename for the Chrome installer
chrome_installer_filename = "google-chrome-stable_current_amd64.deb"

# Check if the file already exists before downloading
if not os.path.exists(chrome_installer_filename):
    # Download the Google Chrome installer if it doesn't exist
    !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

# Install the downloaded package
!dpkg -i {chrome_installer_filename}

# Fix any dependency issues
!apt-get -f install -y

# Install webdriver-manager
!pip install webdriver-manager

# Install Selenium
!pip install selenium

--2024-09-21 20:15:42--  https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
Resolving dl.google.com (dl.google.com)... 172.217.204.93, 172.217.204.91, 172.217.204.190, ...
Connecting to dl.google.com (dl.google.com)|172.217.204.93|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 111913828 (107M) [application/x-debian-package]
Saving to: ‘google-chrome-stable_current_amd64.deb’


2024-09-21 20:15:43 (178 MB/s) - ‘google-chrome-stable_current_amd64.deb’ saved [111913828/111913828]

Selecting previously unselected package google-chrome-stable.
(Reading database ... 123599 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (129.0.6668.58-1) ...
[1mdpkg:[0m dependency problems prevent configuration of google-chrome-stable:
 google-chrome-stable depends on libvulkan1; however:
  Package libvulkan1 is not installed.

[1mdpkg:[0m error processing package go

In [2]:
# Cell 2
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import sqlite3
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException


In [17]:
# Set up Selenium with Chrome
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

In [18]:
# Cell 3
# Fetch the web page
url = "https://archatl.com/parishes/find-a-parish/"
driver.get(url)

# Wait for the content to load
wait = WebDriverWait(driver, 30)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')))

# Function to scroll to bottom of page
def scroll_to_bottom():
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

# Scroll to bottom to load all content
scroll_to_bottom()

# Give it a little more time for all elements to load
time.sleep(5)

In [19]:
# Cell 4
# Extract parish information
parishes = []
elements = driver.find_elements(By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')

for element in elements:
    try:
        parish_info = element.text.strip()
        parts = parish_info.split(',')
        if len(parts) >= 2:
            name = ','.join(parts[:-1]).strip()  # Join all parts except the last one
            city = parts[-1].strip()
            parishes.append((name, city))
            print(f"Parsed: Name: {name}, City: {city}")
        else:
            # Handle entries without a city
            name = parish_info
            city = "N/A"
            parishes.append((name, city))
            print(f"Parsed (No City): Name: {name}, City: {city}")
    except StaleElementReferenceException:
        print("Stale element encountered, skipping...")
        continue

print(f"\nTotal parishes found: {len(parishes)}")

# Close the browser
driver.quit()


Parsed: Name: Queen of Angels, City: Thomson
Parsed: Name: Heritage Center, City: Sharon
Parsed: Name: Church of the Purification of the Blessed Virgin Mary, City: Crawfordville
Parsed: Name: Saint Joseph, City: Washington
Parsed: Name: Saint Mary Mission, City: Elberton
Parsed: Name: Sacred Heart of Jesus, City: Hartwell
Parsed: Name: Sacred Heart, City: Milledgeville
Parsed: Name: Christ Our King and Savior, Greensboro, City: GA
Parsed: Name: Saint James Catholic Church, City: Madison
Parsed: Name: Saint Aelred, City: Bishop

Total parishes found: 10


In [20]:
# Cell 5
# Create SQLite database and table
conn = sqlite3.connect('parishes.db')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS parishes (
        id INTEGER PRIMARY KEY,
        name TEXT,
        city TEXT
    )
''')


<sqlite3.Cursor at 0x7a8e250c0ac0>

In [21]:
# Cell 6
# Insert data into the database
cursor.executemany('INSERT INTO parishes (name, city) VALUES (?, ?)', parishes)
conn.commit()

In [22]:
# Cell 7
# Verify the data in the database
cursor.execute('SELECT * FROM parishes')
rows = cursor.fetchall()
for row in rows[:20]:  # Print first 20 for brevity
    print(row)

print(f"\nTotal parishes in database: {len(rows)}")

(1, 'Queen of Angels', 'Thomson')
(2, 'Heritage Center', 'Sharon')
(3, 'Church of the Purification of the Blessed Virgin Mary', 'Crawfordville')
(4, 'Saint Joseph', 'Washington')
(5, 'Saint Mary Mission', 'Elberton')
(6, 'Sacred Heart of Jesus', 'Hartwell')
(7, 'Sacred Heart', 'Milledgeville')
(8, 'Christ Our King and Savior, Greensboro', 'GA')
(9, 'Saint James Catholic Church', 'Madison')
(10, 'Saint Aelred', 'Bishop')
(11, 'Queen of Angels', 'Thomson')
(12, 'Heritage Center', 'Sharon')
(13, 'Church of the Purification of the Blessed Virgin Mary', 'Crawfordville')
(14, 'Saint Joseph', 'Washington')
(15, 'Saint Mary Mission', 'Elberton')
(16, 'Sacred Heart of Jesus', 'Hartwell')
(17, 'Sacred Heart', 'Milledgeville')
(18, 'Christ Our King and Savior, Greensboro', 'GA')
(19, 'Saint James Catholic Church', 'Madison')
(20, 'Saint Aelred', 'Bishop')

Total parishes in database: 30


In [23]:
# Cell 8
# Close the database connection
conn.close()

print(f"Total parishes added to the database: {len(parishes)}")

Total parishes added to the database: 10


In [10]:
# Expected Output for https://archatl.com/parishes/find-a-parish/:
# # [(1, 'All Saints', 'Dunwoody', '2443 Mount Vernon Road, Dunwoody, GA 30338', 'http://www.allsaintscatholic.org'),
#  (2, 'Annunciation', 'Milledgeville', '171 South Jefferson Street, Milledgeville, GA 31061', 'https://www.annunciationmilledgeville.org/'),
#  (3, 'Ascension', 'Brookhaven', '2989 Lavista Road, Brookhaven, GA 30329', 'http://www.ascca.net'),
#  (4, 'Asian Pacific Ministry', 'Doraville', '2699 Shallowford Road, Doraville, GA 30360', 'N/A'),
#  (5, 'Basilica of the Sacred Heart of Jesus', 'Atlanta', '353 Peachtree Street NE, Atlanta, GA 30308', 'http://www.sacredheartatlanta.org')]