In [37]:
import os
import re
from collections import deque
from pathlib import Path
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from dotenv import load_dotenv

In [38]:
def find_project_root(current_directory, marker):
    """Return the nearest directory, at or above ``current_directory``, that contains ``marker``.

    Args:
        current_directory: Path (str or Path) where the upward search starts.
        marker: Name of a file or directory whose presence identifies the
            project root (for example ``'.git'``).

    Returns:
        pathlib.Path: The first directory on the way up that contains ``marker``.

    Raises:
        FileNotFoundError: If neither the starting directory nor any of its
            ancestors contains ``marker``.
    """
    start = Path(current_directory).absolute()
    # Path.parents does not include the path itself, so the starting directory
    # is checked explicitly; otherwise running from the root itself would fail.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Project root with {marker} not found")

# Locate the repository root, searching upward from the working directory.
current_directory = Path.cwd()
project_root = find_project_root(current_directory, '.git')

# Load the environment variables from the .env file
env_path = project_root / '.env'
load_dotenv(dotenv_path=env_path)

# Fail early with an actionable message: joining a Path with None would
# otherwise raise an opaque TypeError when the variable is missing.
urls_file_setting = os.getenv("PATH_TO_URLS")
if not urls_file_setting:
    raise RuntimeError(
        f"PATH_TO_URLS is not set; define it in {env_path} or in the environment"
    )
# The configured value is joined onto the working directory, not the project root.
PATH_TO_URLS = current_directory / urls_file_setting

base_url = "https://www.uscis.gov"   # only URLs starting with this prefix are crawled
output_dir = "new_uscis_data"        # directory under which extracted text files are written
visited_urls = set()                 # URLs already handed to scrape_page during this run

In [39]:
# Function to read URLs from the file
def read_urls_from_file():
    """Load the initial crawl queue from the file at ``PATH_TO_URLS``.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped so that no empty URL ends up in the queue.

    Returns:
        collections.deque: The URLs in the order they appear in the file.
    """
    # The with-block guarantees the handle is closed once the deque is built.
    with open(PATH_TO_URLS, 'r') as url_file:
        stripped_lines = (line.strip() for line in url_file)
        return deque(url for url in stripped_lines if url)


url_queue = read_urls_from_file()

In [40]:
# Function to save content to a file
def save_content(content, path):
    """Write ``content`` to ``path`` as UTF-8 text, creating parent directories as needed.

    Args:
        content: Text to write.
        path: Destination file path. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has no directory component, and os.makedirs('') raises,
    # so directories are only created when there is one to create.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        file.write(content)

In [41]:
# Function to append new URLs to the file
def append_new_url(url):
    """Append ``url`` followed by a newline to the file at ``PATH_TO_URLS``.

    Args:
        url: The URL string to record.
    """
    with open(PATH_TO_URLS, mode='a') as url_log:
        url_log.write(url + '\n')

In [42]:
def sanitize_filename(part):
    """Turn one path component into a string that is safe to use as a file name.

    Each of the characters ``< > : " / \\ | ? *`` becomes an underscore, and
    then every run of whitespace is collapsed into a single underscore.

    Args:
        part: The raw path component.

    Returns:
        str: The sanitized component.
    """
    # First pass: characters that are not allowed in file names.
    without_reserved = re.sub(r'[<>:"/\\|?*]', '_', part)
    # Second pass: whitespace runs (spaces, tabs, newlines).
    return re.sub(r'\s+', '_', without_reserved)

In [43]:
# Function to recursively extract text from elements as they appear on the web page
def extract_text(element, buffer, depth=0):
    """Walk a parsed HTML node and append its text, in document order, to ``buffer``.

    Text nodes are stripped and indented by ``depth``. Headers and list items
    start on a new line, tables are flattened to one ``a | b |`` line per row,
    and ``<a href>`` tags are rendered once as ``text [URL: href]``. Links that
    resolve to a non-PDF URL under ``base_url`` and are neither visited nor
    queued are added to ``url_queue`` and persisted with ``append_new_url``.

    Args:
        element: A BeautifulSoup ``NavigableString`` or ``Tag``; anything else
            is ignored.
        buffer: List that receives the text fragments (mutated in place; the
            caller joins them).
        depth: Current list-nesting level; it grows by one for the children of
            ``ul``/``ol`` and controls indentation.
    """
    if isinstance(element, NavigableString):
        stripped_text = str(element).strip()
        if stripped_text:
            buffer.append(" " * (depth * 2) + stripped_text)
    elif isinstance(element, Tag):
        # start headers on a new line (the newline goes before the header text)
        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
            buffer.append('\n')

        # format list items nicely
        if element.name == 'li':
            buffer.append("\n" + " " * (depth * 2) + "- ")

        # get links from anchor tags and add them to url_queue and new urls found
        is_link = element.name == 'a' and 'href' in element.attrs
        if is_link:
            link_text = element.get_text(strip=True)
            href = element['href']
            buffer.append(link_text + f" [URL: {href}]")

            # Empty and fragment-only hrefs point back into the page being
            # parsed, so there is nothing new to queue for them.
            target = href.strip()
            if target and not target.startswith('#'):
                # urljoin handles root-relative, slash-less and protocol-relative
                # hrefs and leaves other schemes (mailto:, tel:, ...) untouched so
                # the prefix check below rejects them. Dropping the #fragment
                # keeps one page from being queued under several URLs.
                full_url = urldefrag(urljoin(base_url, target)).url

                if full_url.startswith(base_url) and not full_url.endswith('.pdf'):
                    if full_url not in visited_urls and full_url not in url_queue:
                        url_queue.append(full_url)
                        append_new_url(full_url)
                        print(f"New URL found {full_url}")

        # format tables and their entries nicely
        if element.name == 'table':
            buffer.append('\n' + " " * (depth * 2) + "Table Start:")
            for row in element.find_all('tr'):
                row_buffer = []
                for cell in row.find_all(['td', 'th']):
                    cell_text = cell.get_text(strip=True, separator=' ').replace('\n', ' ')
                    row_buffer.append(cell_text)
                buffer.append("\n" + " " * (depth * 4) + ' | '.join(row_buffer) + ' |')
            buffer.append("\n" + " " * (depth * 2) + "Table End.\n")

        # extract rest of html elements; a link was already rendered in full
        # above, so recursing into it would emit its text a second time
        elif not is_link:
            for child in element.children:
                extract_text(child, buffer, depth + (1 if element.name in ['ul', 'ol'] else 0))

        # block-level containers end with a line break
        if element.name in ['p', 'div', 'article', 'section']:
            buffer.append('\n')

In [44]:
# Function to scrape a page
def scrape_page(url, timeout=30):
    """Fetch one page, save its main content as text, and queue links found in its sidebar.

    The URL is marked as visited first. If the page has a
    ``div.container.container--main`` element, its text is extracted with
    ``extract_text`` and written below ``output_dir`` at a path derived from
    the URL. Otherwise, if a left-nav menu is present, its links are queued.
    Failures are reported with ``print`` and never raised.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up on the request.
    """
    if url in visited_urls:
        print(f"Already visited {url}")
        return

    visited_urls.add(url)

    # PDFs are never parsed, so bail out before spending a request on the download.
    if url.endswith('.pdf'):
        print(f"Skipping PDF {url}")
        return

    print(f"Scraping {url}")

    try:
        # Without a timeout a single unresponsive server would stall the whole crawl.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url} with status code {response.status_code}")
            return
    except requests.exceptions.RequestException:  # This catches all requests-based exceptions
        print(f"Failed to connect to {url}.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    main_container = soup.find('div', class_='container container--main')
    sidebar = soup.find('ul', class_="menu--main leftnav-menu")

    if main_container:
        text_segments = []
        extract_text(main_container, text_segments)
        text_content = ''.join(text_segments)

        # Adjust URL path for saving
        path_segments = url.split('/')
        if len(path_segments) > 3:
            # Skip the scheme and the empty piece after "//". Empty pieces left by
            # a trailing or doubled slash are dropped, otherwise ".../forms/" would
            # be saved as a hidden file named ".txt" inside a "forms" directory.
            sanitized_path_segments = [
                sanitize_filename(part) for part in path_segments[2:] if part
            ]
        else:
            sanitized_path_segments = [sanitize_filename(path_segments[-1])]
        text_path = os.path.join(output_dir, *sanitized_path_segments) + ".txt"
        save_content(text_content, text_path)

    elif sidebar:
        for link in sidebar.find_all('a', href=True):
            # Resolve against the current page and drop any #fragment; other
            # schemes (mailto:, tel:, ...) pass through unchanged and are
            # rejected by the prefix check below.
            full_url = urldefrag(urljoin(url, link['href'].strip())).url

            if full_url.startswith(base_url) and not full_url.endswith('.pdf'):
                if full_url not in visited_urls and full_url not in url_queue:
                    url_queue.append(full_url)
                    print(f"New URL found {full_url}")
                    # Persist the discovery so a later run starts with it queued.
                    append_new_url(full_url)
    else:
        print(f"No main or sidebar container found in {url}")

In [None]:
# Drain the queue front to back; scraping a page may append newly discovered
# URLs to url_queue, so the loop runs until nothing is left to visit.
while len(url_queue) > 0:
    next_url = url_queue.popleft()
    scrape_page(next_url)