In [1]:
### Packages ###
import os
import re
import requests
from pathlib import Path
from bs4 import BeautifulSoup


In [2]:
def GetHomily_Years(PAPAL_Name):
    """Extracts the year's URLs from a Pope's Vatican page."""

    ### INDEX_URL ###
    BASE_URL = "https://www.vatican.va"
    INDEX_URL = f"{BASE_URL}/content/{PAPAL_Name}/en/homilies.index.html"

    ### Fetch URL ###
    response = requests.get(INDEX_URL)
    if response.status_code != 200:
        print("Failed to fetch the homily index page")
        return []

    ### Parse HTML Content ###
    soup = BeautifulSoup(response.text, "html.parser")

    ### Find all year links inside the main homily section ###
    homily_links_years = {}
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if f"/content/{PAPAL_Name}/en/homilies/" in href and href.endswith(".html"):
            full_url = BASE_URL + href if href.startswith("/") else href
            YearNumber = full_url[-15:-11]
            homily_links_years[YearNumber] = full_url

    return homily_links_years


In [3]:
def GetHomily_Links(homily_links_years, Language = "English"):
    """Extracts all English homily URLs from a given Vatican index page."""
    
    ### Set Up ###
    page_number = 1
    homily_links = []

    while True:

        ### URL Page Link ###
        page_url = f"{homily_links_years}" if page_number == 1 else f"{homily_links_years.replace('.html', f'.{page_number}.html')}"
        response = requests.get(page_url)

        ### Exit if no more pages ###
        if response.status_code != 200:
            break

        ### Parse HTML ###
        soup = BeautifulSoup(response.text, 'html.parser')

        ### Determine if there are any links on this page ###
        found_links = 0
        for li in soup.find_all("li"):
            a_tag = li.find("a", string=Language)
            if a_tag and a_tag.get("href"):
                homily_link = "https://www.vatican.va" + a_tag.get("href")
                homily_links.append(homily_link)
                found_links += 1

        ### Exit if there are noy more links ###
        if found_links == 0:
            break
        
        ### Reassign page number ###
        page_number += 1

    return homily_links

In [4]:
import requests
from bs4 import BeautifulSoup
import re

def ExtractText(homily_link):
    """
    Fetches the homily text from a Vatican URL, extracting the title, date, and text.
    
    Args:
        homily_link (str): The URL of the homily page.
    Returns:
        dict: A dictionary containing the 'Title', 'Date', and 'Text' of the homily.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    response = requests.get(homily_link, headers=headers)
    
    if response.status_code != 200:
        return {"Error": f"Failed to retrieve the webpage. Status code: {response.status_code}"}
    
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Extract Title
    title = ""
    title_element = (
        soup.find("h1") or 
        soup.find("h2") or 
        soup.find("div", class_="titolo")
    )
    if title_element:
        title = title_element.get_text(strip=True)
    
    # Extract Date
    date = ""
    date_element = soup.find("p", class_="docdate")
    if date_element:
        date = date_element.get_text(strip=True)

    # Initialize text content
    homily_text = ""

    # Define possible content containers
    content_divs = [
        soup.find("div", class_="testo"),
        soup.find("div", class_="text parbase vaticancontent"),
        soup.find("div", class_="abstract text parbase vaticancontent")
    ]

    for content_div in content_divs:
        if content_div:
            paragraphs = []
            # Get paragraphs and texts
            for p in content_div.find_all(['p', 'div']):
                # Skip irrelevant classes
                if p.get('class') and any(cls in ['clearfix', 'title'] for cls in p.get('class')):
                    continue

                text = p.get_text(strip=True)
                # Filter out boilerplate text or short fragments
                if text and len(text.split()) > 3:
                    if not any(marker in text for marker in [
                        "L'Osservatore Romano",
                        "Weekly Edition in English",
                        "Copyright ©",
                        '[1]', '[2]', '[3]',
                        '[Multimedia]',
                        '_' * 3,  # separator lines
                    ]):
                        paragraphs.append(text)

            # Join paragraphs and break if found content
            if paragraphs:
                homily_text = "\n\n".join(paragraphs)
                break

    # If no content found, handle alternative structures
    if not homily_text:
        # Look for various separators
        separator = soup.find('hr', attrs={'color': '#C0C0C0'})
        
        if separator:
            # Get all content after the separator
            main_content = []
            for sibling in separator.find_all_next():
                if sibling.name == 'p' or sibling.name == 'div':
                    text = sibling.get_text(strip=True)
                    # Check for validity of the text
                    if text and len(text.split()) > 3:
                        main_content.append(text)
                elif sibling.name is None:  # If text node
                    text = sibling.strip()
                    if text and len(text.split()) > 3:
                        main_content.append(text)

            homily_text = "\n\n".join(main_content)

    # Final cleanup
    if homily_text:
        homily_text = re.sub(r'\s+', ' ', homily_text).strip()  # Normalize whitespace
        homily_text = re.sub(r'\[.*?\]', '', homily_text)  # Remove bracketed content
        homily_text = homily_text.strip()
    
    return {
        "Title": title,
        "Date": date,
        "Text": homily_text
    }

In [5]:
def SaveHomilies(PopeName="francesco", base_dir="/Users/simondn/Documents/CSSS594/Project/Data"):
    """
    Downloads and saves homilies for a given pope, organizing them by year.

    Parameters:
    - HomilyYearLinks: dict (keys: years, values: URLs for each year)
    - PopeName: str (name of the pope, used as directory name)
    - base_dir: str (base directory where homilies will be saved)
    """

    ### Directory ###
    PopeDirectory = os.path.join(base_dir, PopeName)
    Path(PopeDirectory).mkdir(parents=True, exist_ok=True)

    ### Extract HomilyYearLinks ###
    HomilyYearLinks = GetHomily_Years(PAPAL_Name = PopeName)

    for Year, URL_Year in HomilyYearLinks.items():
        print(f"YEAR: {Year}")

        ### Create a subdirectory for the year ###
        YearDirectory = os.path.join(PopeDirectory, str(Year))
        Path(YearDirectory).mkdir(parents=True, exist_ok=True)

        ### Get Homily Links for the Year ###
        HomilyLink_YEAR = GetHomily_Links(URL_Year, Language="English")

        ### For each homily in the Year ###
        for URL_Homily in HomilyLink_YEAR:
            # print(f"Processing: {URL_Homily}")

            ## Extract text from homily ##
            homily_data = ExtractText(URL_Homily)
            homily_text = homily_data.get("Text", "").strip()

            if homily_text:  # Only save if text is found
                # Define filename: Extract last part of URL as filename
                homily_filename = os.path.basename(URL_Homily) + ".txt"
                homily_path = os.path.join(YearDirectory, homily_filename)

                # Save the homily text
                with open(homily_path, "w", encoding="utf-8") as file:
                    file.write(homily_text)
                    
                print(f"Saved: {URL_Homily}")
            else:
                print(f"Skipping (no text found): {URL_Homily}")

    print("All homilies saved successfully!")

In [6]:
# Test function
def test_extractor(url):
    """Helper function to test the extractor with detailed output"""
    result = ExtractText(url)
    print("URL:", url)
    print("\nTitle:", result.get("Title"))
    print("Date:", result.get("Date"))
    print("\nText length:", len(result.get("Text", "")))
    print("\nFirst 300 characters of text:")
    print(result.get("Text", "")[:300])
    return result

---

In [7]:
HomilyYearLinks = GetHomily_Years(PAPAL_Name = "francesco")                               # Extract yearly links
HomilyLinks = GetHomily_Links(HomilyYearLinks["2023"])

In [8]:
### Test Case 1 ###
url = "https://www.vatican.va/content/francesco/en/homilies/2023/documents/20231102_omelia-fedelidefunti.html"
homily_data = ExtractText(url)
print(homily_data["Text"])

HOLY MASS IN COMMEMORATION OF THE FALLEN HOMILY OF HIS HOLINESS POPE FRANCIS Rome War Cemetery, RomeThursday, 2 November 2023 The celebration of a day such as today leads us to two thoughts:remembranceandhope. Remembranceof those who preceded us, who led their life, who concluded this life; remembrance of the many people who were good to us: in the family, among friends… And also remembrance of those who did not manage to do so much good, but who have been received in God’s memory, in God’s mercy. It is the mystery of the Lord’s great mercy. And thenhope. Today is a day of remembrance in order to look forward, to look at our journey, our path. We walk towards an encounter, with the Lord and with everyone. And we must ask the Lord for this grace of hope: the hope that never lets us down; the hope that is the everyday virtue that carries us forward, that helps us to solve problems and to look for ways out. But always forward, forward. That fruitful hope, that every-day theological virtue

In [9]:
### Test Case 2 ###
url = "https://www.vatican.va/content/benedict-xv/it/homilies/documents/hf_ben-xv_hom_19160730_bambini-roma.html"
homily_data = ExtractText(url)
print(homily_data["Text"])

INCONTRO CON I BAMBINI DI ROMAOMELIA DEL SANTO PADRE BENEDETTO XV30 luglio 1916Era ben giusto e naturale che all’invito da Noi rivolto a tutti i bambini di Europa affinché in questo giorno, anniversario di luttuoso avvenimento, si accostassero numerosi e ferventi alla sacra Mensa Eucaristica, corrispondessero per primi i Fanciulli della Nostra Roma. Più vicino al cuore del Vicario di Cristo, essi ne veggono più dappresso i bisogni, ne conoscono meglio le aspirazioni, i dolori; nati cittadini di Roma, essi sentono, pur nella loro tenera età, le pulsazioni di quel cuore del mondo, che è la Sede del Papa; discendenti dai fortunati progenitori di nostra fede, portano essi nelle loro vene il sangue di Tarcisio, che spinge il loro cuore verso il Sacramento dell’altare, nel quale vive ogni ragione della loro fede e della loro romanità.Epperò siamo grati a Lei, signor Cardinale, di averci oggi procurata la soddisfazione di poter contemplare questa eletta e numerosa schiera di fanciulli, che da

In [10]:
### Test Case 3 ###
url = "https://www.vatican.va/content/francesco/en/homilies/2023/documents/20230408-omelia-veglia-pasquale.html"
homily_data = ExtractText(url)
print(homily_data["Text"])

EASTER VIGIL IN THE HOLY NIGHT OF EASTER HOMILY OF HIS HOLINESS POPE FRANCIS St Peter's BasilicaHoly Saturday, 8 April 2023 The night is drawing to a close and the first light of dawn is appearing upon the horizon as the women set out toward Jesus’ tomb. They make their way forward, bewildered and dismayed, their hearts overwhelmed with grief at the death that took away their Beloved. Yet upon arriving and seeing the empty tomb, they turn around and retrace their steps. They leave the tomb behind and run to the disciples to proclaim a change of course: Jesus is risen andawaits them in Galilee. In their lives, those women experienced Easter as a Pasch, apassage. They pass from walking sorrowfully towards the tomb to running back with joy to the disciples to tell them not only that the Lord is risen, but also that they are to set out immediately to reach a destination, Galilee. There they will meet the Risen Lord. The rebirth of the disciples, the resurrection of their hearts, passes thr

---

In [11]:
SaveHomilies(PopeName="francesco", base_dir="/Users/simondn/Documents/CSSS594/Project/Data")

YEAR: 2025
Saved: https://www.vatican.va/content/francesco/en/homilies/2025/documents/20250201-omelia-presentazione-del-signore.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2025/documents/20250126-domenica-parola-didio.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2025/documents/20250125-vespri-unita-cristiani.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2025/documents/20250106-omelia-epifania.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2025/documents/20250101-omelia-madredidio-pace.html
YEAR: 2024
Saved: https://www.vatican.va/content/francesco/en/homilies/2024/documents/20241224-omelia-natale.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2024/documents/20241215-ajaccio-messa.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2024/documents/20241212-omelia-guadalupe.html
Saved: https://www.vatican.va/content/francesco/en/homilies/2024/documents/20241208-omelia-nuovi-cardina