<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

# Define the filename for the Chrome installer
chrome_installer_filename = "google-chrome-stable_current_amd64.deb"

# Check if the file already exists before downloading
if not os.path.exists(chrome_installer_filename):
    # Download the Google Chrome installer if it doesn't exist
    !wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb

# Install the downloaded package
!dpkg -i {chrome_installer_filename}

# Fix any dependency issues
!apt-get -f install -y

# Install webdriver-manager
!pip install webdriver-manager

# Install Selenium
!pip install selenium

# Build a headless Chrome WebDriver; these imports only resolve once the
# packages installed earlier in this cell are available.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
# Command-line switches passed to Chrome: no visible window, plus two
# switches presumably needed for the hosted/container runtime (confirm).
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() returns the path handed to Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 123754 files and directories currently installed.)
Preparing to unpack google-chrome-stable_current_amd64.deb ...
Unpacking google-chrome-stable (129.0.6668.58-1) over (129.0.6668.58-1) ...
Setting up google-chrome-stable (129.0.6668.58-1) ...
Processing triggers for man-db (2.10.2-1) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
0 upgraded, 0 newly installed, 0 to remove and 4

In [2]:
# Cell 2
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import sqlite3


In [3]:
# Cell 3
# Open the parish finder page in the browser created during setup
url = "https://archatl.com/parishes/find-a-parish/"
driver.get(url)

# Block for up to 10 seconds until at least one parish title span is in the DOM
title_locator = (By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located(title_locator))

# Fixed extra pause so the rest of the listing has time to render
time.sleep(10)

In [4]:
# Cell 4
# Extract parish information

def split_parish_title(title):
    """Split a '<name>, <city>' listing title on its last comma.

    Args:
        title: The text of one listing entry.

    Returns:
        A (name, city) tuple with surrounding whitespace stripped, or None
        when the title contains no comma and so has no separable city.
        Only the final comma splits, so commas inside the name are kept.
    """
    name, separator, city = title.rpartition(',')
    if not separator:
        return None
    return name.strip(), city.strip()


parishes = []
elements = driver.find_elements(By.CSS_SELECTOR, 'span[data-bind="html:$data.title"]')

for element in elements:
    parish_info = element.text.strip()
    parsed = split_parish_title(parish_info)
    if parsed is None:
        # Entries without a comma are reported and left out of the results
        print(f"Skipping invalid entry: {parish_info}")
        continue
    name, city = parsed
    parishes.append((name, city))
    print(f"Parsed: Name: {name}, City: {city}")

print(f"\nTotal parishes found: {len(parishes)}")

# The browser is intentionally left open here so the scraping cells can be
# re-run; the last code cell of the notebook quits the driver.

Parsed: Name: Saint Helena Catholic Church, City: Clayton
Parsed: Name: Sacred Heart of Jesus, City: Hartwell
Parsed: Name: Saint Mary, City: Toccoa
Parsed: Name: Saint Mark, City: Clarkesville
Parsed: Name: Saint Mary Mission, City: Elberton
Parsed: Name: Saint Francis of Assisi, City: Blairsville
Skipping invalid entry: Capilla Santo Domingo
Parsed: Name: Saint Paul the Apostle, City: Cleveland
Parsed: Name: Saint Joseph, City: Washington
Parsed: Name: Queen of Angels, City: Thomson

Total parishes found: 9


In [5]:
# Cell 5
# Open (or create) the SQLite file and make sure the parishes table exists
conn = sqlite3.connect('parishes.db')
cursor = conn.cursor()

# IF NOT EXISTS lets this cell run against a database file that already has the table
create_parishes_table_sql = '''
    CREATE TABLE IF NOT EXISTS parishes (
        id INTEGER PRIMARY KEY,
        name TEXT,
        city TEXT
    )
'''
cursor.execute(create_parishes_table_sql)


<sqlite3.Cursor at 0x7fed02a00dc0>

In [6]:
# Cell 6
# Insert data into the database.
# A (name, city) pair is written only when no identical row is stored yet.
# The table is created with IF NOT EXISTS, so an earlier run's rows may still
# be present; a plain INSERT would duplicate every parish on each re-run.
cursor.executemany(
    '''
    INSERT INTO parishes (name, city)
    SELECT :name, :city
    WHERE NOT EXISTS (
        SELECT 1 FROM parishes WHERE name = :name AND city = :city
    )
    ''',
    [{'name': name, 'city': city} for name, city in parishes],
)
conn.commit()

In [7]:
# Cell 7
# Spot-check the table by reading every row back
rows = cursor.execute('SELECT * FROM parishes').fetchall()

# Show only a short preview to keep the cell output small
preview_limit = 10
for record in rows[:preview_limit]:
    print(record)

(1, 'Saint Helena Catholic Church', 'Clayton')
(2, 'Sacred Heart of Jesus', 'Hartwell')
(3, 'Saint Mary', 'Toccoa')
(4, 'Saint Mark', 'Clarkesville')
(5, 'Saint Mary Mission', 'Elberton')
(6, 'Saint Francis of Assisi', 'Blairsville')
(7, 'Saint Paul the Apostle', 'Cleveland')
(8, 'Saint Joseph', 'Washington')
(9, 'Queen of Angels', 'Thomson')


In [8]:
# Cell 8
# Close the database connection and report how many rows it wrote.
# total_changes counts rows inserted, updated or deleted since the connection
# was opened, so it reflects what was actually stored rather than how many
# entries were scraped. It has to be read before close().
rows_written = conn.total_changes
conn.close()

print(f"Total parishes added to the database: {rows_written}")

Total parishes added to the database: 9


In [9]:
# Cell 9
# Quit the driver after use: ends the Chrome WebDriver session that was
# created in the setup cell at the top of the notebook.
driver.quit()



In [None]:
# Expected output for https://archatl.com/parishes/find-a-parish/:
# [(1, 'All Saints', 'Dunwoody', '2443 Mount Vernon Road, Dunwoody, GA 30338', 'http://www.allsaintscatholic.org'),
#  (2, 'Annunciation', 'Milledgeville', '171 South Jefferson Street, Milledgeville, GA 31061', 'https://www.annunciationmilledgeville.org/'),
#  (3, 'Ascension', 'Brookhaven', '2989 Lavista Road, Brookhaven, GA 30329', 'http://www.ascca.net'),
#  (4, 'Asian Pacific Ministry', 'Doraville', '2699 Shallowford Road, Doraville, GA 30360', 'N/A'),
#  (5, 'Basilica of the Sacred Heart of Jesus', 'Atlanta', '353 Peachtree Street NE, Atlanta, GA 30308', 'http://www.sacredheartatlanta.org')]