<a href="https://colab.research.google.com/github/tomknightatl/USCCB/blob/main/Build_Parishes_Database_Using_AgenticAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1.1: Import display_database_status from db_utils.py
import sys
import os
if '.' not in sys.path:
    sys.path.insert(0, '.')
try:
    from db_utils import display_database_status
    print("Successfully imported display_database_status from db_utils.py")
except ImportError as e:
    print(f"Error importing display_database_status: {e}")
    print("Make sure db_utils.py is in the same directory or sys.path is configured correctly.")

In [6]:
# Cell 1: Import required libraries
!pip install openai

import requests
from bs4 import BeautifulSoup
import pandas as pd
import sqlite3
import os
from google.colab import userdata
from openai import OpenAI
from urllib.parse import urlparse
import json



In [7]:
# Cell 2: Clone GitHub repository and configure Git


# GitHub credentials
GITHUB_REPO = 'USCCB'
GITHUB_USERNAME = userdata.get('GitHubUserforUSCCB')
GITHUB_PAT = userdata.get('GitHubPATforUSCCB')

# GitHub repository URL
REPO_URL = f"https://{GITHUB_USERNAME}:{GITHUB_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Check if the repository directory already exists
if not os.path.exists(GITHUB_REPO):
    # Clone the repository
    !git clone {REPO_URL}
    os.chdir(GITHUB_REPO)
else:
    print(f"Repository {GITHUB_REPO} already exists. Updating...")
    os.chdir(GITHUB_REPO)
    !git pull origin main

# Configure Git
!git config --global user.email "tomk@github.leemail.me"
!git config --global user.name "tomknightatl"

Cloning into 'USCCB'...
remote: Enumerating objects: 189, done.[K
remote: Counting objects:   1% (1/53)[Kremote: Counting objects:   3% (2/53)[Kremote: Counting objects:   5% (3/53)[Kremote: Counting objects:   7% (4/53)[Kremote: Counting objects:   9% (5/53)[Kremote: Counting objects:  11% (6/53)[Kremote: Counting objects:  13% (7/53)[Kremote: Counting objects:  15% (8/53)[Kremote: Counting objects:  16% (9/53)[Kremote: Counting objects:  18% (10/53)[Kremote: Counting objects:  20% (11/53)[Kremote: Counting objects:  22% (12/53)[Kremote: Counting objects:  24% (13/53)[Kremote: Counting objects:  26% (14/53)[Kremote: Counting objects:  28% (15/53)[Kremote: Counting objects:  30% (16/53)[Kremote: Counting objects:  32% (17/53)[Kremote: Counting objects:  33% (18/53)[Kremote: Counting objects:  35% (19/53)[Kremote: Counting objects:  37% (20/53)[Kremote: Counting objects:  39% (21/53)[Kremote: Counting objects:  41% (22/53)[Kremote: Counting o

In [None]:
# Cell 2.1: Display initial database status
# This is after cloning the repo and before database operations.
print("--- Displaying Initial Database Status (Build_Parishes_Database_Using_AgenticAI.ipynb) ---")
display_database_status('data.db')

In [8]:
# Cell 3: Retrieve URLs from the database
conn = sqlite3.connect('data.db')
cursor = conn.cursor()

# Fetch non-null parish directory URLs.  Note this is temporarily limited to 3 records, for testing.
cursor.execute("SELECT parish_directory_url FROM DiocesesParishDirectory WHERE parish_directory_url IS NOT NULL LIMIT 3")
urls = cursor.fetchall()


In [12]:
# Cell 4: Process each URL using OpenAI's API

# Set up OpenAI API key
api_key = userdata.get('OpenAIAPIKeyforUSCCBKey')
client = OpenAI(api_key=api_key)

def extract_domain(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

def process_url_with_openai(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract visible text from the webpage
    visible_text = ' '.join([s for s in soup.stripped_strings])

    # Prepare the prompt for OpenAI
    prompt = f"""
    Extract parish information from the following webpage content.
    The information should include: Name, Status, Deanery, EST (Established Date),
    Street Address, City, State, Zipcode, Phone Number, and Website.
    If any information is missing, use null.
    Format the output as a valid JSON object with these exact keys:
    {{"Name": null, "Status": null, "Deanery": null, "EST": null, "Street Address": null, "City": null, "State": null, "Zipcode": null, "Phone Number": null, "Website": null}}

    Webpage content:
    {visible_text[:40000]}  # Limit to 4000 characters to fit within API limits
    """

    # Call OpenAI API
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts structured data from unstructured text. Always return a valid JSON object."},
                {"role": "user", "content": prompt}
            ]
        )

        # Attempt to parse the JSON response
        content = response.choices[0].message.content.strip()
        print(f"API Response: {content}")  # Log the raw API response
        extracted_data = json.loads(content)
        return extracted_data
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {str(e)}")
        print(f"Raw API Response: {content}")
        return None
    except Exception as e:
        print(f"Error calling OpenAI API: {str(e)}")
        return None

# Process each URL
for url in urls:
    url = url[0]  # Extract URL from tuple
    print(f"Processing URL: {url}")

    try:
        parish_data = process_url_with_openai(url)

        if parish_data is None:
            print(f"Skipping URL due to processing error: {url}")
            continue

        # Add the source URL and domain to the data
        parish_data['source_url'] = url
        parish_data['domain'] = extract_domain(url)

        # Insert data into the Parishes table
        cursor.execute('''
            INSERT INTO Parishes (
                Name, Status, Deanery, EST, StreetAddress, City, State, Zipcode,
                PhoneNumber, Website, source_url, domain
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            parish_data.get('Name'),
            parish_data.get('Status'),
            parish_data.get('Deanery'),
            parish_data.get('EST'),
            parish_data.get('Street Address'),
            parish_data.get('City'),
            parish_data.get('State'),
            parish_data.get('Zipcode'),
            parish_data.get('Phone Number'),
            parish_data.get('Website'),
            parish_data['source_url'],
            parish_data['domain']
        ))

        conn.commit()
        print(f"Data inserted for: {parish_data.get('Name', 'Unknown Parish')}")
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

print("All URLs processed.")

Processing URL: http://www.eparchyofphoenix.org/directory-of-parishes
API Response: ```json
{
  "Name": null,
  "Status": null,
  "Deanery": null,
  "EST": null,
  "Street Address": null,
  "City": null,
  "State": null,
  "Zipcode": null,
  "Phone Number": null,
  "Website": null
}
```
JSON Decode Error: Expecting value: line 1 column 1 (char 0)
Raw API Response: ```json
{
  "Name": null,
  "Status": null,
  "Deanery": null,
  "EST": null,
  "Street Address": null,
  "City": null,
  "State": null,
  "Zipcode": null,
  "Phone Number": null,
  "Website": null
}
```
Skipping URL due to processing error: http://www.eparchyofphoenix.org/directory-of-parishes
Processing URL: http://www.dolr.org/parishes
API Response: {
    "Name": null,
    "Status": null,
    "Deanery": null,
    "EST": null,
    "Street Address": null,
    "City": null,
    "State": null,
    "Zipcode": null,
    "Phone Number": null,
    "Website": null
}
Error processing http://www.dolr.org/parishes: table Parishes has 

In [None]:
# Cell 4.1: Display final database status with details for Parishes table
print("--- Displaying Final Database Status (Build_Parishes_Database_Using_AgenticAI.ipynb) ---")
display_database_status('data.db', show_details=True, tables_to_show=['Parishes'])

In [10]:
# Cell 6: Commit changes and push to GitHub
# Add changes to git
!git add data.db

# Commit changes
!git commit -m "Added data to  data.db using Build_Parishes_Database_From_Table.ipynb"

# Push changes to GitHub
!git push origin main

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
