<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build%20Dioceses%20Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1.1: Make db_utils importable and load display_database_status from it
import sys
import os

# db_utils.py is looked up relative to the current working directory, so this
# only succeeds when the notebook is run from the directory that contains it
# (assumed to be the repository root).
if sys.path.count('.') == 0:  # avoid stacking duplicate '.' entries on re-runs
    sys.path.insert(0, '.')

try:
    from db_utils import display_database_status
except ImportError as e:
    # Report the failure and carry on; cells that call the helper will only
    # work once this import has succeeded.
    print(f"Error importing display_database_status: {e}")
    print("Make sure db_utils.py is in the same directory or sys.path is configured correctly.")
else:
    print("Successfully imported display_database_status from db_utils.py")

In [None]:
# Cell 1: Import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import re
from urllib.parse import urljoin, urlparse
import time
import logging

# Configure logging: INFO and above goes to scraping.log and to the cell output.
# force=True replaces whatever handlers are already attached to the root logger.
# Without it basicConfig() silently does nothing when the root logger is already
# configured, e.g. when this cell is re-run or when the runtime installed its
# own handler, and the INFO messages emitted by the scraping helpers are lost.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit encoding so non-ASCII names are logged the same way on
        # every platform.
        logging.FileHandler("scraping.log", encoding="utf-8"),
        logging.StreamHandler()
    ],
    force=True,
)

In [None]:
# Cell 2: Clone GitHub repository and configure Git
import os
from google.colab import userdata

# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

In [None]:
# Cell 2.1: Display initial database status
# This is after cloning the repo and before database operations.
# The CWD should be the repo root due to os.chdir in the previous cell.
print("--- Displaying Initial Database Status ---")
if 'display_database_status' not in globals():
    # The import in Cell 1.1 is attempted before the repository is cloned, so
    # it can fail on a fresh runtime and leave the helper undefined. Retry now
    # that the CWD is the repo root. The absolute path is added because a
    # relative '.' entry may already have been resolved against the previous
    # working directory.
    import os
    import sys
    if os.getcwd() not in sys.path:
        sys.path.insert(0, os.getcwd())
    from db_utils import display_database_status
display_database_status('data.db')

In [None]:
# Cell 3: Define helper functions

def get_soup(url, retries=3, backoff_factor=1.0):
    """
    Download ``url`` and return it parsed as a BeautifulSoup object.

    Up to ``retries`` GET requests are made with browser-like headers and a
    20 second timeout. After a failed attempt that is not the last one, the
    function sleeps ``backoff_factor * 2 ** (attempt - 1)`` seconds before
    trying again. Returns ``None`` when every attempt raised a
    ``requests.RequestException`` (HTTP error statuses included), or when
    ``retries`` allows no attempt at all.
    """
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.3')
    headers = {
        'User-Agent': user_agent,
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    for attempt in range(1, retries + 1):
        is_last_attempt = attempt == retries

        try:
            logging.info(f"Attempt {attempt}: Fetching URL: {url}")
            response = requests.get(url, headers=headers, timeout=20)
            logging.info(f"Received status code: {response.status_code}")
            # Turn 4xx/5xx answers into exceptions so they are retried too
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            logging.warning(f"Attempt {attempt} failed with error: {e}")

        # Only reached after a failed attempt
        if is_last_attempt:
            logging.error(f"All {retries} attempts failed for URL: {url}")
            return None

        # Exponential backoff: the wait doubles with every failed attempt
        sleep_time = backoff_factor * (2 ** (attempt - 1))
        logging.info(f"Retrying in {sleep_time} seconds...")
        time.sleep(sleep_time)

    return None

def extract_dioceses(soup):
    """
    Extracts dioceses information from the parsed HTML.

    Every ``div.views-row`` that contains a ``div.da-wrap`` yields one record:

    * ``Name``: text of ``div.da-title``, or "N/A" when that element is missing.
    * ``Address``: the non-empty texts of the direct child ``div`` elements of
      ``div.da-address`` joined with ", " (empty string when there are none).
    * ``Website``: ``href`` of the first link inside ``div.site``, or "N/A"
      when there is no such link or the link has no (or an empty) ``href``.

    Args:
        soup: Parsed page (BeautifulSoup object). ``None``, as returned by a
            failed fetch, is accepted and yields an empty list.

    Returns:
        A list of dictionaries with the keys 'Name', 'Address' and 'Website'.
    """
    if soup is None:
        logging.warning("No parsed page available; no dioceses extracted")
        return []

    dioceses = []
    diocese_containers = soup.find_all('div', class_='views-row')

    logging.info(f"Found {len(diocese_containers)} potential diocese containers")

    for i, container in enumerate(diocese_containers, start=1):
        logging.info(f"Processing container {i}")

        da_wrap = container.find('div', class_='da-wrap')
        if da_wrap is None:
            logging.warning(f"No da-wrap found in container {i}")
            continue

        name_div = da_wrap.find('div', class_='da-title')
        diocese_name = name_div.get_text(strip=True) if name_div is not None else "N/A"
        logging.info(f"Diocese name: {diocese_name}")

        # Only direct child divs are address lines; blank ones are dropped
        address_parts = []
        address_div = da_wrap.find('div', class_='da-address')
        if address_div is not None:
            line_texts = (
                div.get_text(strip=True)
                for div in address_div.find_all('div', recursive=False)
            )
            address_parts = [text for text in line_texts if text]

        address = ", ".join(address_parts)
        logging.info(f"Address: {address}")

        website_url = "N/A"
        website_div = da_wrap.find('div', class_='site')
        if website_div is not None:
            link = website_div.find('a')
            if link is not None:
                # .get() rather than ['href']: one anchor without an href
                # must not abort the extraction of all other dioceses.
                website_url = link.get('href') or "N/A"
        logging.info(f"Website: {website_url}")

        dioceses.append({
            'Name': diocese_name,
            'Address': address,
            'Website': website_url
        })

    return dioceses

In [None]:
# Cell 9.1: Show the database status, with details limited to the Dioceses table
# NOTE(review): this cell sits before the cells that fetch the page and create
# and populate the Dioceses table, so a top-to-bottom run reports the state
# from before this notebook's inserts. Confirm whether it belongs after the
# population cell.
print("--- Displaying Final Database Status (with details for Dioceses table) ---")
display_database_status(
    'data.db',
    tables_to_show=['Dioceses'],
    show_details=True,
)

In [None]:
# Cell 4: Fetch and parse the HTML content from URL

url = "https://www.usccb.org/about/bishops-and-dioceses/all-dioceses"
soup = get_soup(url)

if soup is None:
    # get_soup returns None once all of its attempts have failed. Raise instead
    # of calling exit(): exit() is a helper meant for the interactive shell,
    # whereas an exception halts execution at this cell with an explicit error
    # so the following cells never run against a missing page.
    raise RuntimeError(
        "Failed to fetch the dioceses page. Please check your connection or the URL."
    )

print("Successfully fetched and parsed the dioceses page.")
# Print the first 1000 characters of the HTML to check its structure
print("First 1000 characters of the HTML:")
print(soup.prettify()[:1000])

In [None]:
# Cell 5: Run the extraction over the parsed page and report how many records came back

dioceses = extract_dioceses(soup)
print(f"Extracted information for {len(dioceses)} dioceses.")

if not dioceses:
    # Nothing matched the expected markup: dump the whole page so its
    # structure can be inspected.
    print("No dioceses were extracted. Printing the structure of the page:")
    print(soup.prettify())

In [None]:
# Cell 6: Load the extracted records into a DataFrame and preview the first rows

dioceses_df = pd.DataFrame(data=dioceses)
print(dioceses_df.head(n=5))

In [None]:
# Cell 8: Open the SQLite database and make sure the Dioceses table exists

conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# The connection's context manager commits once the statement has succeeded;
# the connection itself stays open for the following cell.
with conn:
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS Dioceses (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        Name TEXT,
        Address TEXT,
        Website TEXT
    )
''')


In [None]:
# Cell 9: Populate Dioceses table

# (Name, Address, Website) tuples in DataFrame order
records = [
    (row['Name'], row['Address'], row['Website'])
    for _, row in dioceses_df.iterrows()
]

try:
    # data.db is kept in the repository, so earlier runs may already have
    # stored these rows. A record is inserted only when an identical
    # (Name, Address, Website) row is not present yet, so re-running the
    # notebook does not pile up duplicates. Each record is bound twice: once
    # for the inserted values and once for the existence check.
    cursor.executemany('''
        INSERT INTO Dioceses (Name, Address, Website)
        SELECT ?, ?, ?
        WHERE NOT EXISTS (
            SELECT 1 FROM Dioceses
            WHERE Name IS ? AND Address IS ? AND Website IS ?
        )
    ''', [record + record for record in records])

    conn.commit()
    print("Data has been inserted into the SQLite database.")

    # Query and display a sample of the data from the database
    cursor.execute('SELECT * FROM Dioceses LIMIT 5')
    results = cursor.fetchall()

    print("First 5 entries from the database:")
    for row in results:
        print(row)
finally:
    # Close the database connection even when an insert or query failed
    conn.close()
    print("Database connection closed.")


In [None]:
# Cell 10: Commit changes and push to GitHub
# These shell commands run in the current working directory, which Cell 2
# changed to the repository checkout.
# Stage the updated database file
!git add data.db

# Record the change, using the Git identity configured in Cell 2
!git commit -m "Added records in data.db using Build Dioceses Database.ipynb"

# Push the commit to the main branch of the origin remote
!git push origin main