<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_From_Table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1.1: Import display_database_status from db_utils.py
import sys
import os

# db_utils.py is expected at the root of the repo, so make sure the current
# directory is searched first when resolving imports.
current_dir_entry = '.'
if current_dir_entry not in sys.path:
    sys.path.insert(0, current_dir_entry)

try:
    from db_utils import display_database_status
except ImportError as e:
    # Report the failure without raising; display_database_status stays
    # undefined in that case.
    print(f"Error importing display_database_status: {e}")
    print("Make sure db_utils.py is in the same directory or sys.path is configured correctly.")
else:
    print("Successfully imported display_database_status from db_utils.py")

In [None]:
# Cell 1: Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import os
from google.colab import userdata

In [None]:
# Cell 2: Clone GitHub repository and configure Git


# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

In [None]:
# Cell 2.1: Display initial database status
# Placed after the repo clone/update cell and before the cells that read from
# or write to data.db.
initial_status_banner = "--- Displaying Initial Database Status (Build_Parishes_Database_From_Table.ipynb) ---"
print(initial_status_banner)
display_database_status('data.db')

In [None]:
# Cell 3: Retrieve URLs from the database
# Selects every non-null parish directory URL from DiocesesParishDirectory.
# The LIMIT 3 is a temporary cap for testing.
PARISH_DIRECTORY_URL_QUERY = "SELECT parish_directory_url FROM DiocesesParishDirectory WHERE parish_directory_url IS NOT NULL LIMIT 3"

conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# fetchall() returns a list of 1-tuples, one per URL
urls = cursor.execute(PARISH_DIRECTORY_URL_QUERY).fetchall()


In [None]:
# Cell 4.1: Display final database status with details
# NOTE(review): this cell sits before "Cell 4: Process each URL" in the
# notebook, so a top-to-bottom run reports the status before the parish tables
# are written -- confirm the intended cell order.
final_status_banner = "--- Displaying Final Database Status (Build_Parishes_Database_From_Table.ipynb) ---"
print(final_status_banner)
display_database_status('data.db', show_details=True)

In [None]:
# Cell 4: Process each URL
# For every parish directory URL in `urls`, download the page, read the HTML
# table with id "table_1", and store its rows in data.db (through `conn`) as a
# table named after the URL. URLs that cannot be fetched or that have no such
# table are reported and skipped so the remaining URLs are still processed.

# Seconds to wait on a site before giving up on it
REQUEST_TIMEOUT = 30

# Column names assigned to every scraped table; the last one holds the link
columns = ['Name', 'Status', 'Deanery', 'EST', 'Street Address', 'City', 'State', 'Zipcode', 'Phone Number', 'Web']

for url in urls:
    url = url[0]  # Extract URL from tuple
    print(f"Processing URL: {url}")

    # The timeout keeps one unresponsive site from hanging the whole run, and
    # raise_for_status() keeps HTTP error pages from being parsed as data.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {url} ({e})")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table (adjust the selector if needed)
    table = soup.find('table', {'id': 'table_1'})
    if not table:
        print(f"No table found for URL: {url}")
        continue

    rows = table.find_all('tr')

    data = []
    for row in rows[1:]:  # Skip the header row
        cols = row.find_all('td')
        if not cols:
            # Rows without <td> cells (e.g. header/footer rows built from <th>)
            # hold no parish data, and cols[-1] would raise IndexError.
            continue

        # Text of every column except the last one
        row_data = [col.text.strip() for col in cols[:-1]]

        # The last column contributes its hyperlink target rather than its text
        link = cols[-1].find('a')
        row_data.append(link.get('href') if link else '')

        data.append(row_data)

    # Create a DataFrame
    df = pd.DataFrame(data, columns=columns)

    # Store the data in the database; the table name is built from the
    # second-to-last '/'-separated piece of the URL, and an existing table of
    # the same name is replaced.
    table_name = f"parishes_{url.split('/')[-2]}"
    df.to_sql(table_name, conn, if_exists='replace', index=False)
    print(f"Data stored in table: {table_name}")

In [None]:
# Cell 6: Commit changes and push to GitHub
# Each step is an IPython shell escape (`!`) that runs git in the notebook's
# current working directory.
# Stage the SQLite database file
!git add data.db

# Commit the staged database with a fixed message
!git commit -m "Added data to  data.db using Build_Parishes_Database_From_Table.ipynb"

# Push the commit to the main branch of the origin remote
!git push origin main