In [16]:
import os
from collections import deque
from urllib.parse import urljoin, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup


def is_allowed_domain(link, main_domain, additional_domains):
    """
    Check whether a link's host is one of the allowed domains.

    A host is allowed when it equals the main domain or any additional
    domain, or when it is a subdomain of one of them. Host names are
    compared case-insensitively, because ``urlparse`` preserves the case of
    ``netloc`` as written in the link.

    Args:
        link: URL string to test.
        main_domain: Network location of the site being crawled.
        additional_domains: Iterable of extra allowed domains; ``None`` is
            treated as empty.

    Returns:
        True if the link's host matches an allowed domain, otherwise False.
        Links without a network location (``mailto:``, ``tel:``, relative
        paths, ...) are never allowed.
    """
    domain = urlparse(link).netloc.lower()
    if not domain:
        # No host at all: nothing to match against.
        return False

    candidates = [main_domain, *(additional_domains or ())]
    for candidate in candidates:
        if not candidate:
            # Skip empty entries so they cannot match every dotted host.
            continue
        allowed = candidate.lower()
        if domain == allowed or domain.endswith(f".{allowed}"):
            return True
    return False


def is_pdf_file(link):
    """
    Check if the link points to a PDF file.

    Only the path component of the URL is inspected, so a query string or
    fragment after the file name does not hide the ``.pdf`` extension, and a
    ``.pdf`` that appears only inside the query string does not count.

    Args:
        link: URL (or bare path) string.

    Returns:
        True if the URL path ends with ``.pdf`` (case-insensitive).
    """
    try:
        path = urlparse(link).path
    except ValueError:
        # urlparse rejects some malformed URLs; fall back to the raw string.
        path = link
    return path.lower().endswith(".pdf")


def download_pdf(url, save_folder, timeout=30):
    """
    Download a PDF file from the given URL into ``save_folder``.

    The file is named after the last segment of the URL path. Failures are
    reported with a printed message and are not raised to the caller.

    Args:
        url: Address of the PDF to fetch.
        save_folder: Directory to write the file into; created if missing.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the call indefinitely.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # A URL whose path ends in "/" has an empty basename; fall back to a
        # fixed name so we never try to open the folder itself as a file.
        basename = os.path.basename(urlparse(url).path) or "download.pdf"
        os.makedirs(save_folder, exist_ok=True)
        filename = os.path.join(save_folder, basename)

        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"Downloaded PDF: {filename}")
    except Exception as e:
        # Best effort: one failed download must not stop the caller.
        print(f"Failed to download PDF {url}: {e}")


def get_page_title(soup):
    """
    Return the page title text.

    Looks up the first ``title`` element in the parsed document and returns
    its whitespace-stripped text, or the placeholder "No Title" when no such
    element is found.
    """
    title_node = soup.find("title")
    if not title_node:
        return "No Title"
    return title_node.get_text(strip=True)


def url_to_filename(url):
    """
    Convert a URL to a filename-friendly string based on its host.

    Only a leading ``www.`` label is dropped; every character that is not
    alphanumeric, ``-`` or ``_`` (dots, the ``:`` before a port, ...) is
    replaced with ``_``.

    Examples:
        'https://example.com'        -> 'example_com'
        'https://www.example.com/a'  -> 'example_com'
        'http://localhost:8000/'     -> 'localhost_8000'

    Args:
        url: URL string; only its network location is used.

    Returns:
        The sanitized host string (empty if the URL has no network location).
    """
    host = urlparse(url).netloc  # Extract the domain
    if host.lower().startswith("www."):
        # Strip the prefix only at the start, never inside another label.
        host = host[len("www."):]
    return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in host)


def crawl_website(base_url, additional_domains, max_links=None, timeout=30):
    """
    Crawl a website breadth-first and collect the raw text of every page.

    Starting from ``base_url``, pages are fetched in the order they are
    discovered. PDF links are downloaded into ``pdfs/<site name>`` instead
    of being parsed. Each fetched page contributes a
    ``LINK / TITLE / CONTENT`` record to the returned text.

    NOTE: later cells in this notebook define ``crawl_website`` again; when
    the cells run in order, the last definition is the one in effect.

    Args:
        base_url: URL where the crawl starts; its network location is the
            main allowed domain.
        additional_domains: Extra domains whose links may be followed.
        max_links: Maximum number of URLs to process (PDFs included), or
            ``None`` for no limit.
        timeout: Seconds to wait for each page request before giving up.

    Returns:
        A single string with one record per successfully fetched page.
    """
    # Parse the main domain from the base URL
    main_domain = urlparse(base_url).netloc

    visited = set()
    queued = {base_url}  # every URL ever enqueued, so none is queued twice
    to_visit = deque([base_url])
    pages = []  # per-page records, joined once at the end

    # Create folder for PDFs
    pdf_folder = f"pdfs/{url_to_filename(base_url)}"
    os.makedirs(pdf_folder, exist_ok=True)

    crawled_count = 0  # Counter for the number of links processed

    while to_visit:
        if max_links is not None and crawled_count >= max_links:
            print(f"Reached the test limit of {max_links} links.")
            break

        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        print(f"Visiting: {current_url}")
        visited.add(current_url)
        crawled_count += 1

        if is_pdf_file(current_url):
            download_pdf(current_url, pdf_folder)
            continue

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title and content; the separator keeps text from
            # adjacent elements from being glued into one word.
            title = get_page_title(soup)
            content = soup.get_text(separator=" ", strip=True)

            # Append to output in the specified format
            pages.append(
                f"LINK: {current_url}\nTITLE: {title}\nCONTENT: {content}\n\n"
            )

            # Resolve links against the URL actually served (after redirects)
            current_base = response.url
            for link in soup.find_all("a", href=True):
                try:
                    parsed_url = urlparse(urljoin(current_base, link["href"]))
                except ValueError:
                    # One malformed href must not drop the rest of the page.
                    continue

                # mailto:, javascript:, ftp:, ... cannot be fetched here.
                if parsed_url.scheme not in ("http", "https"):
                    continue

                # Remove fragment identifiers from the URL
                full_url = urlunparse(parsed_url._replace(fragment=""))
                if full_url in queued:
                    continue
                if is_allowed_domain(full_url, main_domain, additional_domains):
                    queued.add(full_url)
                    to_visit.append(full_url)
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Failed to fetch {current_url}: {e}")

    return "".join(pages)

In [17]:
# # Example usage
# if __name__ == "__main__":
#     # Set `max_links` for testing (e.g., 20). Set to `None` for production.
#     max_test_links = 1000

#     website_url = "https://www.uiw.edu/hebsba/faculty-and-staff/index.html"
#     filename = url_to_filename(website_url)
#     additional_domains = {"relateddomain.com"}
#     result = crawl_website(website_url, additional_domains, max_links=max_test_links)

#     # Save the output to a file
#     with open(f"crawled_content_{filename}.txt", "w", encoding="utf-8") as file:
#         file.write(result)
#     print(
#         "Crawling complete. Content saved to 'crawled_content.txt'. PDFs are saved in 'pdfs/' folder."
#     )

In [18]:
def format_content(soup):
    """
    Render the headings, paragraphs and list items of a page as plain text.

    Headings are surrounded by blank lines, list items are prefixed with
    "- " (ordered and unordered lists alike), and paragraphs are emitted
    unchanged. ``label`` elements are selected by the query but no branch
    emits them, so they do not appear in the result.

    NOTE: a later cell in this notebook defines ``format_content`` again;
    when the cells run in order, the last definition is the one in effect.

    Returns:
        The collected pieces joined with newlines.
    """
    wanted_tags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "label"]
    pieces = []

    for node in soup.find_all(wanted_tags):
        tag = node.name

        # Headings get a blank line before and after
        if tag.startswith("h"):
            pieces.append(f"\n{node.get_text(strip=True)}\n")
            continue

        # List items become bullet points
        if tag == "li":
            pieces.append(f"- {node.get_text(strip=True)}")
            continue

        # Paragraphs are passed through as-is
        if tag == "p":
            pieces.append(node.get_text(strip=True))

    return "\n".join(pieces)


def crawl_website(base_url, additional_domains, max_links=None, timeout=30):
    """
    Crawl a website breadth-first and collect formatted text of every page.

    Starting from ``base_url``, pages are fetched in the order they are
    discovered. PDF links are downloaded into ``pdfs/<site name>`` instead
    of being parsed. Page text is produced by ``format_content`` and added
    to the result as a ``LINK / TITLE / CONTENT`` record.

    NOTE: ``crawl_website`` is also defined in an earlier and a later cell
    of this notebook; when the cells run in order, the last definition is
    the one in effect.

    Args:
        base_url: URL where the crawl starts; its network location is the
            main allowed domain.
        additional_domains: Extra domains whose links may be followed.
        max_links: Maximum number of URLs to process (PDFs included), or
            ``None`` for no limit.
        timeout: Seconds to wait for each page request before giving up.

    Returns:
        A single string with one record per successfully fetched page.
    """
    # Parse the main domain from the base URL
    main_domain = urlparse(base_url).netloc

    visited = set()
    queued = {base_url}  # every URL ever enqueued, so none is queued twice
    to_visit = deque([base_url])
    pages = []  # per-page records, joined once at the end

    # Create folder for PDFs
    pdf_folder = f"pdfs/{url_to_filename(base_url)}"
    os.makedirs(pdf_folder, exist_ok=True)

    crawled_count = 0  # Counter for the number of links processed

    while to_visit:
        if max_links is not None and crawled_count >= max_links:
            print(f"Reached the test limit of {max_links} links.")
            break

        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        print(f"Visiting: {current_url}")
        visited.add(current_url)
        crawled_count += 1

        if is_pdf_file(current_url):
            download_pdf(current_url, pdf_folder)
            continue

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title and formatted content
            title = get_page_title(soup)
            content = format_content(soup)

            # Append to output in the specified format
            pages.append(
                f"LINK: {current_url}\nTITLE: {title}\nCONTENT:\n{content}\n\n"
            )

            # Resolve links against the URL actually served (after redirects)
            current_base = response.url
            for link in soup.find_all("a", href=True):
                try:
                    parsed_url = urlparse(urljoin(current_base, link["href"]))
                except ValueError:
                    # One malformed href must not drop the rest of the page.
                    continue

                # mailto:, javascript:, ftp:, ... cannot be fetched here.
                if parsed_url.scheme not in ("http", "https"):
                    continue

                # Remove fragment identifiers from the URL
                full_url = urlunparse(parsed_url._replace(fragment=""))
                if full_url in queued:
                    continue
                if is_allowed_domain(full_url, main_domain, additional_domains):
                    queued.add(full_url)
                    to_visit.append(full_url)
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Failed to fetch {current_url}: {e}")

    return "".join(pages)


# # Example usage
# if __name__ == "__main__":
#     # Set `max_links` for testing (e.g., 20). Set to `None` for production.
#     max_test_links = 100

#     website_url = "https://www.uiw.edu/hebsba/faculty-and-staff/index.html"
#     filename = url_to_filename(website_url)
#     additional_domains = {"relateddomain.com"}
#     result = crawl_website(website_url, additional_domains, max_links=max_test_links)

#     # Save the output to a file
#     with open(f"crawled_content_{filename}_4.txt", "w", encoding="utf-8") as file:
#         file.write(result)
#     print(
#         "Crawling complete. Content saved to 'crawled_content.txt'. PDFs are saved in 'pdfs/' folder."
#     )

In [19]:
def remove_common_sections(soup):
    """
    Strip page chrome (headers, footers, menus, nav bars) from parsed HTML.

    Every element matching one of the CSS selectors below is removed from
    the document. The document is modified in place and the same object is
    returned for convenience.
    """
    # CSS selectors for the sections to drop, processed in this order
    boilerplate_selectors = ("header", "footer", ".menu", ".navbar", ".footer")

    for css in boilerplate_selectors:
        matches = soup.select(css)
        for node in matches:
            # Detach the node and its children from the tree
            node.decompose()

    return soup


def format_content(soup):
    """
    Render the text-bearing elements of a page as plain text.

    Headings, ``label`` and ``span`` elements are surrounded by blank
    lines, list items are prefixed with "- " (ordered and unordered lists
    alike), and paragraphs are emitted unchanged. Elements whose stripped
    text is empty are skipped so they do not add blank lines or bare
    bullets to the result.

    NOTE: an earlier cell in this notebook also defines ``format_content``;
    when the cells run in order, this definition replaces it.

    NOTE(review): nested matches (e.g. a ``span`` inside a ``p``) are
    presumably emitted once per matching tag, which would repeat their
    text; confirm whether de-duplication is wanted.

    Returns:
        The collected pieces joined with newlines.
    """
    # Non-heading tags that are rendered like headings
    standalone_tags = ("label", "span")

    formatted_content = []
    # TODO: the div should be based on the class and get only content if they have relevant information.
    for element in soup.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "label", "span"]  # , "div"
    ):
        text = element.get_text(strip=True)
        if not text:
            # Empty elements (icons, spacers, ...) carry no content.
            continue

        # Add headers (and label/span) with line breaks
        if element.name.startswith("h") or element.name in standalone_tags:
            formatted_content.append(f"\n{text}\n")
        # Add bullet points for list items
        elif element.name == "li":
            formatted_content.append(f"- {text}")
        # Add paragraphs as-is
        else:
            formatted_content.append(text)

    # Join all elements with newlines
    return "\n".join(formatted_content)


def crawl_website(base_url, additional_domains, max_links=None, timeout=30):
    """
    Crawl a website breadth-first and collect formatted text of every page.

    Starting from ``base_url``, pages are fetched in the order they are
    discovered. PDF links are downloaded into ``pdfs/<site name>`` instead
    of being parsed. Each page has its common sections removed by
    ``remove_common_sections`` and its text produced by ``format_content``,
    then is added to the result as a ``LINK / TITLE / CONTENT`` record.

    NOTE: earlier cells in this notebook also define ``crawl_website``;
    when the cells run in order, this definition replaces them.

    Args:
        base_url: URL where the crawl starts; its network location is the
            main allowed domain.
        additional_domains: Extra domains whose links may be followed.
        max_links: Maximum number of URLs to process (PDFs included), or
            ``None`` for no limit.
        timeout: Seconds to wait for each page request before giving up.

    Returns:
        A single string with one record per successfully fetched page.
    """
    # Parse the main domain from the base URL
    main_domain = urlparse(base_url).netloc

    visited = set()
    queued = {base_url}  # every URL ever enqueued, so none is queued twice
    to_visit = deque([base_url])
    pages = []  # per-page records, joined once at the end

    # Create folder for PDFs
    pdf_folder = f"pdfs/{url_to_filename(base_url)}"
    os.makedirs(pdf_folder, exist_ok=True)

    crawled_count = 0  # Counter for the number of links processed

    while to_visit:
        if max_links is not None and crawled_count >= max_links:
            print(f"Reached the test limit of {max_links} links.")
            break

        current_url = to_visit.popleft()
        if current_url in visited:
            continue

        print(f"Visiting: {current_url}")
        visited.add(current_url)
        crawled_count += 1

        if is_pdf_file(current_url):
            download_pdf(current_url, pdf_folder)
            continue

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # Remove common sections
            # NOTE(review): links are collected after this step, so anchors
            # inside the removed sections are not followed; confirm that
            # this is intended.
            soup = remove_common_sections(soup)

            # Extract title and formatted content
            title = get_page_title(soup)
            content = format_content(soup)

            # Append to output in the specified format
            pages.append(
                f"LINK: {current_url}\nTITLE: {title}\nCONTENT:\n{content}\n\n"
            )

            # Resolve links against the URL actually served (after redirects)
            current_base = response.url
            for link in soup.find_all("a", href=True):
                try:
                    parsed_url = urlparse(urljoin(current_base, link["href"]))
                except ValueError:
                    # One malformed href must not drop the rest of the page.
                    continue

                # mailto:, javascript:, ftp:, ... cannot be fetched here.
                if parsed_url.scheme not in ("http", "https"):
                    continue

                # Remove fragment identifiers from the URL
                full_url = urlunparse(parsed_url._replace(fragment=""))
                if full_url in queued:
                    continue
                if is_allowed_domain(full_url, main_domain, additional_domains):
                    queued.add(full_url)
                    to_visit.append(full_url)
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Failed to fetch {current_url}: {e}")

    return "".join(pages)


# # Example usage
# if __name__ == "__main__":
#     # Set `max_links` for testing (e.g., 20). Set to `None` for production.
#     max_test_links = 100

#     website_url = "https://www.uiw.edu/hebsba/faculty-and-staff/index.html"
#     filename = url_to_filename(website_url)
#     additional_domains = {"relateddomain.com"}
#     result = crawl_website(website_url, additional_domains, max_links=max_test_links)

#     # Save the output to a file
#     with open(f"crawled_content_{filename}_6.txt", "w", encoding="utf-8") as file:
#         file.write(result)
#     print(
#         "Crawling complete. Content saved to 'crawled_content.txt'. PDFs are saved in 'pdfs/' folder."
#     )

In [21]:
# # Example usage
# if __name__ == "__main__":
#     # Set `max_links` for testing (e.g., 20). Set to `None` for production.
#     max_test_links = 50

#     website_url = "https://www.stmary.edu/faculty/index"
#     filename = url_to_filename(website_url)
#     additional_domains = {"relateddomain.com"}
#     result = crawl_website(website_url, additional_domains, max_links=max_test_links)

#     # Save the output to a file
#     with open(f"crawled_content_{filename}_6.txt", "w", encoding="utf-8") as file:
#         file.write(result)
#     print(
#         "Crawling complete. Content saved to 'crawled_content.txt'. PDFs are saved in 'pdfs/' folder."
#     )

In [23]:
# Example usage: crawl one site and write the collected text to a file.
if __name__ == "__main__":
    # Set `max_links` for testing (e.g., 20). Set to `None` for production.
    max_test_links = 40

    website_url = "https://klesse.utsa.edu/mechanical/faculty/"
    filename = url_to_filename(website_url)
    additional_domains = {"relateddomain.com"}
    result = crawl_website(website_url, additional_domains, max_links=max_test_links)

    # Save the output to a file
    output_path = f"crawled_content_{filename}_1.txt"
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(result)

    # Report the paths that were actually used, not a fixed placeholder name.
    print(
        f"Crawling complete. Content saved to '{output_path}'. "
        f"PDFs are saved in 'pdfs/{filename}/' folder."
    )

Visiting: https://klesse.utsa.edu/mechanical/faculty/
Visiting: https://klesse.utsa.edu/index.html
Visiting: https://klesse.utsa.edu/mechanical/index.html
Visiting: https://klesse.utsa.edu/mechanical/faculty/index.html
Visiting: https://klesse.utsa.edu/mechanical/programs.html
Visiting: https://klesse.utsa.edu/mechanical/information.html
Visiting: https://klesse.utsa.edu/mechanical/faculty/advisory.html
Visiting: https://klesse.utsa.edu/mechanical/faculty/resources.html
Visiting: https://klesse.utsa.edu/mechanical/faculty/openings.html
Visiting: https://klesse.utsa.edu/mechanical/research.html
Visiting: https://klesse.utsa.edu/mechanical/students.html
Visiting: https://klesse.utsa.edu/mechanical/contact.html
Visiting: https://klesse.utsa.edu/requestinfo.html
Visiting: https://klesse.utsa.edu/faculty/index.html
Visiting: https://klesse.utsa.edu/faculty/profiles/abbas-omar.html
Visiting: https://klesse.utsa.edu/faculty/profiles/araya-guillermo.html
Visiting: https://klesse.utsa.edu/facul