# Build Dioceses Database

This notebook scrapes the USCCB website to build the initial dioceses database.

**Prerequisites**: Run `00_Colab_Setup.ipynb` first.

In [None]:
# Cell 1: Imports and Setup
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime

from config.settings import get_config
from src.utils.webdriver import setup_driver, load_page, clean_text

# Load the project configuration. A RuntimeError from get_config() is reported
# with a hint to run the setup notebook, then re-raised so the cell still fails.
try:
    config = get_config()
except RuntimeError:
    print("‚ùå Please run 00_Colab_Setup.ipynb first")
    raise
else:
    print("‚úÖ Configuration loaded")

In [None]:
# Cell 2: Scrape USCCB Dioceses Page
def scrape_dioceses_from_usccb(
    url="https://www.usccb.org/about/bishops-and-dioceses/all-dioceses",
):
    """Scrape diocese records from the USCCB dioceses listing page.

    Args:
        url: Address of the listing page to load. Defaults to the USCCB
            "all dioceses" page, so existing zero-argument calls are unchanged.

    Returns:
        list[dict]: One record per ``div.views-row`` container for which
        ``extract_diocese_info`` returned a truthy value, in page order.
        Empty when no container yielded a record.

    Note:
        The driver is quit in a ``finally`` clause, so it is released even if
        loading or parsing the page raises.
    """
    print(f"Scraping dioceses from: {url}")

    driver = setup_driver()
    try:
        soup = load_page(driver, url)
        print("‚úÖ Page loaded successfully")

        # Find diocese containers
        diocese_containers = soup.find_all('div', class_='views-row')
        print(f"Found {len(diocese_containers)} potential diocese containers")

        dioceses = []
        for container in diocese_containers:
            diocese_data = extract_diocese_info(container)
            if not diocese_data:
                # Container did not hold a usable diocese entry; skip it.
                continue

            dioceses.append(diocese_data)
            # Progress line every 10 successfully extracted records.
            if len(dioceses) % 10 == 0:
                print(f"Processed {len(dioceses)} dioceses...")

        print(f"‚úÖ Successfully extracted {len(dioceses)} dioceses")
        return dioceses

    finally:
        driver.quit()

def extract_diocese_info(container):
    """Build one diocese record from a listing container element.

    Reads the container's ``da-wrap`` div and, inside it, the ``da-title``
    (name), ``da-address`` (address lines) and ``site`` (website link) divs.

    Returns:
        dict with keys 'Name', 'Address', 'Website' and 'extracted_at'
        (ISO-format timestamp taken at extraction time). 'Address' and
        'Website' are None when not found. Returns None when the wrapper or
        title div is missing, when the cleaned name is empty or shorter than
        three characters, or when any exception occurs (the exception is
        printed, not re-raised).
    """
    try:
        wrapper = container.find('div', class_='da-wrap')
        if not wrapper:
            return None

        # Name
        title_node = wrapper.find('div', class_='da-title')
        if not title_node:
            return None
        name = clean_text(title_node.get_text())

        # Address: cleaned text of each direct child div, empty results dropped
        address = None
        address_node = wrapper.find('div', class_='da-address')
        if address_node:
            cleaned_lines = (
                clean_text(line.get_text())
                for line in address_node.find_all('div', recursive=False)
            )
            kept_lines = [line for line in cleaned_lines if line]
            if kept_lines:
                address = ", ".join(kept_lines)

        # Website: href of the first anchor inside the 'site' div, if present
        site_node = wrapper.find('div', class_='site')
        anchor = site_node.find('a') if site_node else None
        website = anchor.get('href') if anchor else None

        # Reject entries whose name is missing or too short to be meaningful
        if not name or len(name) <= 2:
            return None

        return {
            'Name': name,
            'Address': address,
            'Website': website,
            'extracted_at': datetime.now().isoformat()
        }

    except Exception as e:
        print(f"Error extracting diocese info: {e}")
        return None

# Run the scraping. `dioceses_data` receives the list returned by
# scrape_dioceses_from_usccb().
dioceses_data = scrape_dioceses_from_usccb()

In [None]:
# Cell 3: Create DataFrame and Display Results
if not dioceses_data:
    # Nothing was scraped: still define `df` (empty) so the `df.empty` checks
    # in the following cells work.
    print("‚ùå No dioceses data extracted")
    df = pd.DataFrame()

else:
    df = pd.DataFrame(dioceses_data)
    diocese_count = len(df)

    print(f"üìä Created DataFrame with {diocese_count} dioceses")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nFirst 5 dioceses:")
    display(df.head())

    # Count rows whose 'Website' value is missing
    missing_websites = df['Website'].isna().sum()
    with_websites = diocese_count - missing_websites

    print(f"\nüìä Statistics:")
    print(f"Total dioceses: {diocese_count}")
    print(f"With websites: {with_websites}")
    print(f"Missing websites: {missing_websites}")

In [None]:
# Cell 4: Save to Database
# Inserts the scraped rows into the 'Dioceses' table in batches. A batch that
# fails is reported and skipped; the remaining batches are still attempted.
# NOTE(review): this uses insert(), so re-running the cell will try to insert
# the same rows again - confirm whether the table enforces uniqueness.
if not df.empty and config.supabase:
    print("üíæ Saving dioceses to database...")

    # Convert DataFrame to list of dictionaries
    records = df.to_dict('records')

    try:
        # Insert data in batches to avoid timeouts
        batch_size = 20
        total_inserted = 0

        for i in range(0, len(records), batch_size):
            batch = records[i:i + batch_size]
            batch_number = i // batch_size + 1  # 1-based, for messages only

            try:
                response = config.supabase.table('Dioceses').insert(batch).execute()

                # The response may or may not expose an `error` attribute,
                # so read it defensively.
                if getattr(response, 'error', None):
                    print(f"‚ùå Database error for batch {batch_number}: {response.error}")
                else:
                    total_inserted += len(batch)
                    print(f"‚úÖ Inserted batch {batch_number}: {len(batch)} dioceses")

            except Exception as e:
                print(f"‚ùå Error inserting batch {batch_number}: {e}")

            # Small delay between batches; skipped after the final batch
            # because nothing follows it.
            if i + batch_size < len(records):
                time.sleep(0.5)

        print(f"\nüìä Final Results:")
        print(f"Total dioceses extracted: {len(df)}")
        print(f"Successfully saved to database: {total_inserted}")

        # Only claim success when every record was saved; a partial or empty
        # result is reported as such instead of being presented as a success.
        if total_inserted == len(records):
            print(f"‚úÖ Dioceses database built successfully!")
        elif total_inserted > 0:
            print(f"‚ö†Ô∏è Only {total_inserted} of {len(records)} dioceses were saved - see the batch errors above")
        else:
            print("‚ùå No dioceses were saved to the database")

    except Exception as e:
        print(f"‚ùå Database operation failed: {e}")

elif df.empty:
    print("‚ùå No data to save")

else:
    print("‚ö†Ô∏è Database not configured - data not saved")
    print("You can still use the extracted data from the DataFrame 'df'")

In [None]:
# Cell 5: Export to CSV (Optional)
if df.empty:
    print("‚ùå No data to export")
else:
    # The file name carries the current date and time (to the second).
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'dioceses_extracted_{timestamp}.csv'

    try:
        df.to_csv(filename, index=False)
        print(f"üìÅ Data exported to: {filename}")

        # Offer a browser download when google.colab is importable; an
        # ImportError means the file is simply left on local disk.
        try:
            from google.colab import files
            files.download(filename)
            print(f"‚¨áÔ∏è File downloaded")
        except ImportError:
            print(f"üìÅ File saved locally: {filename}")

    except Exception as e:
        # Any other failure (writing the CSV or triggering the download)
        # is reported without stopping the notebook.
        print(f"‚ùå Export failed: {e}")