In [4]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse
import re


import tldextract


def get_main_domain(url):
    """
    Extract the main domain name from a URL, removing subdomains and 'www'.
    Example:
        - 'https://www.tamu.edu/' -> 'tamu.edu'
        - 'https://grad.tamu.edu/' -> 'tamu.edu'
    """
    # Use tldextract to parse the URL
    extracted = tldextract.extract(url)
    # Combine domain and suffix (e.g., 'tamu' + 'edu')
    return f"{extracted.domain}.{extracted.suffix}"


def is_allowed_domain(link, main_domain, additional_domains):
    """
    Check if the link belongs to the main domain or its subdomains,
    or any of the additional allowed domains.
    """
    parsed_link = urlparse(link)
    domain = parsed_link.netloc

    # Allow the main domain and all its subdomains
    if domain == main_domain or domain.endswith(f".{main_domain}"):
        return True

    # Allow explicitly allowed additional domains
    return any(
        domain == add_domain or domain.endswith(f".{add_domain}")
        for add_domain in additional_domains
    )


def is_pdf_file(link):
    """Check if the link points to a PDF file."""
    return link.lower().endswith(".pdf")


def download_pdf(url, save_folder):
    """Download a PDF file from the given URL."""
    try:
        response = requests.get(url)
        response.raise_for_status()
        filename = os.path.join(save_folder, os.path.basename(urlparse(url).path))
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"Downloaded PDF: {filename}")
    except Exception as e:
        print(f"Failed to download PDF {url}: {e}")


def get_page_title(soup):
    """Extract the title of the webpage."""
    title_tag = soup.find("title")
    return title_tag.get_text(strip=True) if title_tag else "No Title"


def url_to_filename(url):
    """
    Convert a URL to a filename-friendly string.
    Example: 'https://example.com' -> 'example_com'
    """
    parsed_url = urlparse(url)
    domain = parsed_url.netloc  # Extract the domain
    return domain.replace(".", "_").replace("www_", "")

In [5]:
def remove_common_sections(soup):
    """
    Remove common menu, header, and footer sections from the parsed HTML.
    """
    # Define common section selectors to remove
    common_selectors = ["header", "footer", ".menu", ".navbar", ".footer"]  #

    for selector in common_selectors:
        for element in soup.select(selector):
            element.decompose()  # Remove the element from the DOM

    return soup


def is_relevant_content(text):
    """
    Check if the text is relevant and not boilerplate or empty.
    """
    irrelevant_phrases = [
        "top of the page",
        "skip to content",
        "loading...",
        "menu",
        "navigation",
    ]
    # Check if the text is non-empty and not in the irrelevant phrases
    return (
        text
        and text.strip()
        and not any(phrase in text.lower() for phrase in irrelevant_phrases)
    )


def format_content(soup):
    """
    Format the content of a webpage by preserving headers, bullet points, and numbered lists,
    while avoiding duplicate and irrelevant content, and normalizing spaces.
    """
    formatted_content = []
    seen_content = set()  # Track already added content to avoid duplicates

    for element in soup.find_all(
        ["h1", "h2", "h3", "h4", "h5", "h6", "p", "li", "label", "span", "div"]
    ):
        # Skip tags that have child elements already processed
        if element.findChildren():
            continue

        # Extract and normalize text
        text = " ".join(element.stripped_strings)  # Combine child strings with spaces
        text = re.sub(
            r"\s+", " ", text
        )  # Replace multiple spaces/newlines with a single space
        text = text.strip()

        # Skip if the text is irrelevant or already seen
        if is_relevant_content(text) and text not in seen_content:
            # Add headers with line breaks
            if element.name.startswith("h"):
                formatted_content.append(f"\n{text}\n")
            # Add bullet points for list items
            elif element.name == "li":
                formatted_content.append(f"- {text}")
            # Add paragraphs or other content as-is
            else:
                formatted_content.append(text)
            seen_content.add(text)  # Mark content as added

    # Join all elements with newlines
    return "\n".join(formatted_content)


def should_ignore_link(url):
    """
    Check if the URL should be ignored based on its extension or specific keywords in the content.
    """
    # File extensions to ignore
    ignore_extensions = {
        ".jpg",
        ".jpeg",
        ".png",
        ".gif",
        ".bmp",
        ".tiff",
        ".svg",
        ".webp",
        ".mp4",
        ".avi",
        ".mov",
        ".mkv",
        ".flv",
        ".wmv",
        ".webm",
        ".3gp",
        ".mp3",
        ".wav",
        ".ogg",
        ".aac",
        ".flac",
        ".m4a",
        ".pdf",
        ".doc",
        ".docx",
        ".ppt",
        ".pptx",
        ".xls",
        ".xlsx",
        ".rtf",
        ".odt",
        ".ods",
        ".zip",
        ".rar",
        ".7z",
        ".tar",
        ".gz",
        ".bz2",
        ".exe",
        ".bat",
        ".sh",
        ".msi",
        ".bin",
        ".js",
        ".css",
        ".json",
        ".xml",
        ".yaml",
        ".yml",
        ".ico",
        ".eot",
        ".ttf",
        ".woff",
        ".woff2",
        ".swf",
        ".apk",
        ".aspx",
    }

    # Keywords to ignore
    ignore_keywords = [
        r"\bnews\b",
        r"\bevents\b",
        r"\bcalendar\b",
        r"\binternship\b",
        r"\bdashboard\b",
        r"\bgiving\b",
        r"\bsheet\b",
        r"/search",
        r"/tag",
        r"\bcollection\b",
        "wiki",
        "login",
        r"\bdownload\b",
        r"\bpresident\b",
        r"\bbluebook\b",
        r"\blib\.\b",
        r"/lib",
        r"/documents",
        r"\bpublicsafty\b",
        r"\bhistory\b",
        r"\bemployment\b",
        r"\bchapter\b",
        r"\bfaq\b",
        r"\bmilitary\b",
        r"\bfacilities\b",
        r"\bguides\b",
        r"\bcareer\b",
        r"\bbudget\b",
        r"\bmytraining\b",
        r"\btour\b",
        r"(^|[^a-zA-Z0-9])undergraduate([^a-zA-Z0-9]|$)",
        r"\btoday\b",
        r"\bindex\.php\b",
        r"\bstudyabroad\b",
    ]

    # Exception keywords to allow
    allow_keywords = [r"\bprofile\b", r"\bfaculty\b", r"\bgraduate\b"]

    # Check for ignored extensions
    if any(url.lower().endswith(ext) for ext in ignore_extensions):
        return True  # Ignore if the URL has a disallowed extension

    # Check if the URL contains any of the allow keywords
    if any(re.search(allow, url.lower()) for allow in allow_keywords):
        return False  # Do not ignore if an allow keyword is found

    # Check if the URL contains any of the ignore keywords
    if any(re.search(ignore, url.lower()) for ignore in ignore_keywords):
        return True  # Ignore if an ignore keyword is found

    # Check for YEAR pattern (e.g., 20{00-30})
    if re.search(r"\b20(0[0-9]|1[0-9]|2[0-9]|30)\b", url):
        return True  # Ignore if a YEAR pattern is found

    return False  # Do not ignore otherwise

In [6]:
def crawl_website(base_url, additional_domains, max_links=None, save_interval=1000):
    """
    Crawl a website and extract all text content, saving the output every `save_interval` pages.
    """
    # Parse the main domain from the base URL
    # parsed_base = urlparse(base_url)
    # main_domain = parsed_base.netloc
    main_domain = get_main_domain(base_url)

    visited = set()
    to_visit = [base_url]
    output_text = ""
    saved_files_count = 0  # Counter for saved files

    # Create folder for PDFs
    pdf_folder = f"pdfs/{url_to_filename(base_url)}"
    os.makedirs(pdf_folder, exist_ok=True)

    crawled_count = 0  # Counter for the number of links processed

    while to_visit:
        if max_links is not None and crawled_count >= max_links:
            print(f"Reached the test limit of {max_links} links.")
            break

        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        if should_ignore_link(current_url):
            print(f"Link contains ignore list keywords: {current_url}")
            continue

        print(f"{crawled_count}: Visiting: {current_url}")
        visited.add(current_url)
        crawled_count += 1

        if is_pdf_file(current_url):
            print("PDF found but not downloaded.")
            continue

        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
            soup_base = BeautifulSoup(response.text, "html.parser")

            # Remove common sections
            soup = remove_common_sections(soup_base)

            # Extract title and formatted content
            title = get_page_title(soup)
            content = format_content(soup)

            # Append to output in the specified format
            output_text += (
                f"LINK: {current_url}\nTITLE: {title}\nCONTENT:\n{content}\n\n"
            )

            # Save the content every `save_interval` pages
            if crawled_count % save_interval == 0:
                save_file(output_text, base_url, crawled_count)
                saved_files_count += 1
                output_text = ""  # Reset the output content after saving

            # Find all links on the page
            for link in soup_base.find_all("a", href=True):
                current_base = response.url  # Use the actual URL of the current page
                full_url = urljoin(current_base, link["href"])

                # Remove fragment identifiers from the URL
                parsed_url = urlparse(full_url)
                full_url = urlunparse(parsed_url._replace(fragment=""))
                if (
                    is_allowed_domain(full_url, main_domain, additional_domains)
                    and full_url not in visited
                ):
                    to_visit.append(full_url)
        except requests.exceptions.Timeout:
            print(f"Request timed out for URL: {current_url}")
            continue
        except requests.exceptions.RequestException as e:
            print(f"Request failed for URL: {current_url} - {e}")
            continue

    # Save any remaining content if the crawl ends
    if output_text.strip():
        print("5 step")
        save_file(output_text, base_url, crawled_count)

    print(f"Total saved files: {saved_files_count}")
    return output_text


def save_file(content, base_url, page_count):
    """
    Save the parsed content to a file with the page count in the filename.
    """
    filename = url_to_filename(base_url)
    os.makedirs("../data/crawled_content", exist_ok=True)  # Ensure the directory exists
    file_path = f"../data/crawled_content/{filename}_{page_count}.txt"
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(content)
    print(f"Saved content to: {file_path}")

In [7]:
if __name__ == "__main__":
    # Set `max_links` for testing or `None` for production.
    max_test_links = 5

    website_url_ls = [
        "https://utexas.edu/",
    ]  # "https://gradschool.utexas.edu/degrees-programs"

    for website_url in website_url_ls:
        filename = url_to_filename(website_url)
        additional_domains = {"relateddomain.com"}

        # Crawl with a save interval of 1000 pages
        result = crawl_website(
            website_url,
            additional_domains,
            max_links=max_test_links,
            save_interval=2000,
        )
        print("Crawling complete.")

0: Visiting: https://utexas.edu/


  if element.findChildren():


1: Visiting: https://www.utexas.edu/
Link contains ignore list keywords: https://giving.utexas.edu/what-starts-here?utm_source=main&amp;utm_medium=test&amp;utm_campaign=launch
Link contains ignore list keywords: https://news.utexas.edu/2025/05/12/celebrating-the-class-of-2025/
Link contains ignore list keywords: https://news.utexas.edu/2025/05/09/making-ut-shine-for-commencement/
Link contains ignore list keywords: https://news.utexas.edu/2025/05/05/ready-to-change-the-world-dat-duong/
Link contains ignore list keywords: https://www.news.utexas.edu/
Link contains ignore list keywords: https://calendar.utexas.edu/
2: Visiting: https://admissions.utexas.edu/
3: Visiting: https://faculty.utexas.edu/career
Link contains ignore list keywords: https://president.utexas.edu/innovation-board
Link contains ignore list keywords: https://giving.utexas.edu/foundation-relations/
Link contains ignore list keywords: https://www.utexas.edu/military
4: Visiting: https://discoveries.utexas.edu/about/
Rea

In [None]:
if __name__ == "__main__":
    # Set `max_links` for testing or `None` for production.
    max_test_links = 10000

    website_url_ls = [
        "https://www.tamu.edu/"
    ]  # "https://grad.tamu.edu/academics/program-directory", "https://www.tamu.edu/academics/colleges-schools/index.html" ,

    for website_url in website_url_ls:
        filename = url_to_filename(website_url)
        additional_domains = {"relateddomain.com"}

        # Crawl with a save interval of 1000 pages
        result = crawl_website(
            website_url,
            additional_domains,
            max_links=max_test_links,
            save_interval=2000,
        )
        print("Crawling complete.")

In [None]:
if __name__ == "__main__":
    # Set `max_links` for testing or `None` for production.
    max_test_links = 10000

    website_url = "https://www.utdallas.edu/"
    filename = url_to_filename(website_url)
    additional_domains = {"relateddomain.com"}

    # Crawl with a save interval of 1000 pages
    result = crawl_website(
        website_url, additional_domains, max_links=max_test_links, save_interval=2000
    )
    print("Crawling complete.")