<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build%20Dioceses%20Database.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install supabase

In [None]:
# Cell for Supabase client initialization
from supabase import create_client, Client
import os

# IMPORTANT: Supply your actual Supabase URL and key.
# Prefer environment variables (SUPABASE_URL / SUPABASE_KEY, e.g. set through
# os.environ in Colab) over editing the placeholders below, so credentials are
# not saved inside the notebook. Only replace the strings directly when the
# notebook lives in a secure, private environment.

SUPABASE_URL = "YOUR_SUPABASE_URL"  # Replace with your Supabase project URL
SUPABASE_KEY = "YOUR_SUPABASE_ANON_KEY" # Replace with your Supabase anon key

# Environment variables win; the constants above are only the fallback.
supabase_url = os.environ.get("SUPABASE_URL", SUPABASE_URL)
supabase_key = os.environ.get("SUPABASE_KEY", SUPABASE_KEY)

if supabase_url == "YOUR_SUPABASE_URL" or supabase_key == "YOUR_SUPABASE_ANON_KEY":
    print("WARNING: Supabase URL or Key is using placeholder values.")
    print("Please replace 'YOUR_SUPABASE_URL' and 'YOUR_SUPABASE_ANON_KEY' in this cell with your actual Supabase credentials.")
    print("Alternatively, set them as environment variables SUPABASE_URL and SUPABASE_KEY.")
    # Deliberately do NOT call create_client() here: the client validates the
    # URL/key format and raises on placeholder strings, which would abort this
    # cell right after the warning above. Later cells skip their Supabase steps
    # while placeholder credentials are in use.
    print("Supabase client NOT initialized because placeholder credentials are in use.")
else:
    try:
        supabase: Client = create_client(supabase_url, supabase_key)
        print("Successfully initialized Supabase client.")
        # Optional: a small smoke test (e.g. a cheap select) could be added
        # here if the key's permissions allow it.
    except Exception as e:
        # Broad catch on purpose: this is the notebook's top-level boundary and
        # any failure here should be reported rather than shown as a traceback.
        print(f"Error initializing Supabase client: {e}")
        print("Please ensure your SUPABASE_URL and SUPABASE_KEY are correct.")

In [None]:
# Cell 1: Import necessary libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
from urllib.parse import urljoin, urlparse
import time
import logging

# Configure logging: every record goes to both scraping.log and the cell output.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() silently does nothing when handlers exist (as is common in
# notebook environments, and on every re-run of this cell).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit encoding so non-ASCII names are written the same on any platform.
        logging.FileHandler("scraping.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
    force=True,
)

In [None]:
# Cell 3: Define helper functions

def get_soup(url, retries=3, backoff_factor=1.0):
    """
    Download ``url`` and return its body parsed as a BeautifulSoup document.

    Up to ``retries`` GET attempts are made. After a failed attempt (any
    ``requests.RequestException``, which includes HTTP error statuses raised
    by ``raise_for_status``) the function sleeps for
    ``backoff_factor * 2 ** (attempt - 1)`` seconds before the next try.

    Returns the parsed document on the first successful attempt, or None when
    every attempt failed (or when ``retries`` is less than 1, in which case no
    request is made at all).
    """
    browser_headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/58.0.3029.110 Safari/537.3'),
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }

    for attempt in range(1, retries + 1):
        logging.info(f"Attempt {attempt}: Fetching URL: {url}")
        try:
            response = requests.get(url, headers=browser_headers, timeout=20)
            logging.info(f"Received status code: {response.status_code}")
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as err:
            logging.warning(f"Attempt {attempt} failed with error: {err}")

        # Only reached when the attempt above failed.
        if attempt < retries:
            # Exponential backoff: the wait doubles after every failure.
            sleep_time = backoff_factor * (2 ** (attempt - 1))
            logging.info(f"Retrying in {sleep_time} seconds...")
            time.sleep(sleep_time)
        else:
            logging.error(f"All {retries} attempts failed for URL: {url}")

    return None

def extract_dioceses(soup):
    """
    Extracts dioceses information from the parsed HTML.

    Every ``div.views-row`` in ``soup`` is treated as a candidate entry; rows
    without a ``div.da-wrap`` child are skipped with a warning. For each
    remaining row the function reads:

    * ``div.da-title``   -> ``'Name'`` ("N/A" when the element is missing)
    * ``div.da-address`` -> ``'Address'``: the non-empty texts of its direct
      child ``div`` elements joined with ", " (empty string when missing)
    * ``div.site``       -> ``'Website'``: the ``href`` of its first ``<a>``
      ("N/A" when the div, the anchor, or a non-empty ``href`` is missing)

    Returns a list of dictionaries with the keys 'Name', 'Address', 'Website',
    in document order.
    """
    dioceses = []
    diocese_containers = soup.find_all('div', class_='views-row')

    logging.info("Found %d potential diocese containers", len(diocese_containers))

    for position, container in enumerate(diocese_containers, start=1):
        logging.info("Processing container %d", position)

        da_wrap = container.find('div', class_='da-wrap')
        if da_wrap is None:
            logging.warning("No da-wrap found in container %d", position)
            continue

        name_div = da_wrap.find('div', class_='da-title')
        diocese_name = name_div.get_text(strip=True) if name_div is not None else "N/A"
        logging.info("Diocese name: %s", diocese_name)

        address_parts = []
        address_div = da_wrap.find('div', class_='da-address')
        if address_div is not None:
            # Only direct children: each one holds a single address line.
            line_texts = (
                line.get_text(strip=True)
                for line in address_div.find_all('div', recursive=False)
            )
            address_parts = [text for text in line_texts if text]

        address = ", ".join(address_parts)
        logging.info("Address: %s", address)

        website_div = da_wrap.find('div', class_='site')
        anchor = website_div.find('a') if website_div is not None else None
        # .get() rather than ['href']: an <a> without an href attribute must
        # not abort the whole extraction with a KeyError.
        website_url = (anchor.get('href') or "N/A") if anchor is not None else "N/A"
        logging.info("Website: %s", website_url)

        dioceses.append({
            'Name': diocese_name,
            'Address': address,
            'Website': website_url
        })

    return dioceses

In [None]:
# Cell 4: Fetch and parse the HTML content from URL

url = "https://www.usccb.org/about/bishops-and-dioceses/all-dioceses"
soup = get_soup(url)  # None when every download attempt failed

if soup is None:
    print("Failed to fetch the dioceses page. Please check your connection or the URL.")
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down (losing all state), and it is a `site` helper that plain
    # scripts cannot rely on. SystemExit stops "Run all" at this cell and, when
    # run as a script, ends the process with a non-zero status.
    raise SystemExit(1)

print("Successfully fetched and parsed the dioceses page.")
# Print the first 1000 characters of the HTML to check its structure
print("First 1000 characters of the HTML:")
print(soup.prettify()[:1000])

In [None]:
# Cell 5: Run the extraction on the fetched page and report how many entries it found

dioceses = extract_dioceses(soup)
print(f"Extracted information for {len(dioceses)} dioceses.")

if not dioceses:
    # Nothing matched the expected markup: dump the whole page so the
    # selectors used by extract_dioceses can be compared with the real HTML.
    print("No dioceses were extracted. Printing the structure of the page:")
    print(soup.prettify())

In [None]:
# Cell 6: Create a DataFrame and display results

# Name the columns explicitly so the frame keeps the Name/Address/Website
# schema even when the scrape returned no rows (an empty list would otherwise
# produce a frame with no columns at all).
dioceses_df = pd.DataFrame(dioceses, columns=['Name', 'Address', 'Website'])
print(dioceses_df.head())

In [None]:
# Upload the scraped dioceses to the Supabase table 'Dioceses', then read a few
# rows back as a sanity check.
#
# Relies on names created by earlier cells: `dioceses_df`, `supabase`,
# `supabase_url` and `supabase_key`. They are looked up with globals().get()
# so that a skipped or failed earlier cell produces a clear message here
# instead of a NameError.
#
# NOTE(review): re-running this cell inserts the same rows again unless the
# table has a uniqueness constraint — confirm against the table definition.

print("Data extraction from website complete and DataFrame is ready.")

supabase_client = globals().get("supabase")
frame_to_upload = globals().get("dioceses_df")
credentials_configured = (
    globals().get("supabase_url", "YOUR_SUPABASE_URL") != "YOUR_SUPABASE_URL"
    and globals().get("supabase_key", "YOUR_SUPABASE_ANON_KEY") != "YOUR_SUPABASE_ANON_KEY"
)

if supabase_client is None or not credentials_configured:
    print("Supabase client not properly initialized or using placeholder credentials. Skipping Supabase operations.")
    print("Please ensure Supabase is correctly configured with actual credentials in the 'supabase-init-cell'.")
elif frame_to_upload is None:
    print("DataFrame 'dioceses_df' is not defined. Run the DataFrame cell before this one.")
else:
    print("Attempting to insert data into Supabase table 'Dioceses'...")
    try:
        # One batch insert instead of one HTTP round trip per row.
        # execute() returns a single response object and reports API failures
        # by raising, so it must not be unpacked as a (data, error) pair:
        # doing so makes "error" truthy for every row, even on success.
        records = frame_to_upload.to_dict(orient="records")
        if records:
            supabase_client.table('Dioceses').insert(records).execute()
            print(f"Data insertion into Supabase 'Dioceses' table complete ({len(records)} rows).")
        else:
            print("No rows to insert; the DataFrame is empty.")

        # Query and display data from the Supabase database
        print("Fetching first 5 entries from the Supabase 'Dioceses' table:")
        response = supabase_client.table('Dioceses').select('*').limit(5).execute()

        rows = getattr(response, 'data', None) or []
        # Older client versions may expose an `error` attribute on the
        # response instead of raising; honour it when present.
        query_error = getattr(response, 'error', None)

        if rows:
            for row_data in rows:
                print(row_data)
        elif query_error:
            print(f"Error querying data: {getattr(query_error, 'message', query_error)}")
        else:
            print("No data returned from the 'Dioceses' table.")

    except Exception as e:
        # Top-level notebook boundary: report the failure instead of a traceback.
        print(f"An error occurred during Supabase operations: {e}")
        print("Please ensure the 'Dioceses' table exists in Supabase with columns: Name, Address, Website.")
