In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time

# WebDriver Setup
driver = webdriver.Chrome()
# NOTE(review): this implicit wait is combined with explicit WebDriverWait
# calls later in the cell. Selenium's documentation advises against mixing the
# two because the effective timeouts become unpredictable -- confirm it is
# really wanted before relying on the timings.
driver.implicitly_wait(10)

url = "https://codes.iccsafe.org/content/WAFC2021P1"
driver.get(url)

# Accept Cookies (best effort: scraping continues whether or not this works)
try:
    accept_cookies_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//button[text()="Accept"]'))
    )
    accept_cookies_button.click()
except Exception as exc:
    # Keep going, but report what happened instead of hiding the failure.
    print(f"Cookie banner not dismissed ({type(exc).__name__}); continuing.")

# Define Keywords and Regex Pattern for Filtering
# NOTE(review): `keywords` is not referenced by the rest of this cell and does
# not match the alternation in `pattern` ("Photovoltaic array" vs
# "Photovoltaic"; "smoke ventilation" appears only in the regex) -- confirm
# which list is authoritative.
keywords = ["Driveway", "Obstructions", "Photovoltaic array", 
            "Ridge", "Hip", "Valley", "Rakes", "Eaves", "Pathway", "Setback"]
# A keyword, then (lazily) any characters, then a number followed by a length
# unit. The single capture group is the measurement, e.g. "36 inches".
pattern = (
    r"\b(?:Driveway|Obstructions|Photovoltaic|Ridge|Hip|Valley|Rakes|Eaves|Pathway|Setback|smoke ventilation)\b"
    r".*?\b(\d+\s*(?:foot|feet|inch|inches|meter|meters|mm|cm))\b"
)

# Open file to save output (written by extract_and_save_content, closed at the
# end of this cell)
output_file = open("extracted_content.txt", "w", encoding="utf-8")

# Helper that locates the chapter tab links on the page currently loaded
def find_chapter_tabs():
    """Return the anchor elements found beneath the "chapter-wrapper" element.

    The lookup is performed against the module-level ``driver`` each time the
    function is called, so callers get fresh element references.
    """
    chapter_link_xpath = '//*[@id="chapter-wrapper"]/div/div//a'
    chapter_links = driver.find_elements(By.XPATH, chapter_link_xpath)
    return chapter_links

# Scan the sections of the loaded page and write matching subsections to disk
def extract_and_save_content():
    """Write every matching subsection of the current page to ``output_file``.

    Only sections whose heading contains "photovoltaic" or "solar"
    (case-insensitive) are inspected. A subsection is written when its text
    matches the module-level ``pattern``; the record consists of the section
    heading, the subsection heading and the subsection text. Any exception
    raised while handling a section causes that section to be skipped.
    """
    sections_xpath = '/html/body/div[2]/div/div[1]/main/div/div/div[2]/div/div[2]/div[3]/div/div/section/section'
    heading_xpath = './div/h1'
    search_flags = re.IGNORECASE | re.DOTALL

    for chapter_section in driver.find_elements(By.XPATH, sections_xpath):
        try:
            heading = chapter_section.find_element(By.XPATH, heading_xpath).text
            heading_lower = heading.lower()

            # Guard clause: ignore sections that are not about PV / solar.
            if not ("photovoltaic" in heading_lower or "solar" in heading_lower):
                continue

            for child_section in chapter_section.find_elements(By.XPATH, './section'):
                try:
                    child_heading = child_section.find_element(By.XPATH, heading_xpath).text
                except Exception:
                    child_heading = "No title found"

                body_text = child_section.text

                # Only the existence of a match matters, not the captured values.
                if re.search(pattern, body_text, search_flags) is None:
                    continue

                output_file.write(f"Section: {heading}\n")
                output_file.write(f"Subsection: {child_heading}\n")
                output_file.write(body_text.strip() + "\n\n")
        except Exception:
            # A required element is missing (or went stale): move on.
            continue

# Step 6: Iterate through each chapter, extract content, and find matches
max_attempts = 3  # tries per chapter before the chapter is skipped

try:
    chapter_tabs = find_chapter_tabs()
    if not chapter_tabs:
        print("No chapter tabs were found; nothing to extract.")

    for index in range(len(chapter_tabs)):
        # Retry mechanism to refresh elements to avoid stale element exceptions
        for attempt in range(1, max_attempts + 1):
            try:
                chapter_tabs = find_chapter_tabs()  # Refresh tabs to avoid stale references
                # NOTE(review): this class name looks auto-generated and may
                # change between site releases -- confirm it is stable.
                WebDriverWait(driver, 10).until(
                    EC.invisibility_of_element_located((By.CLASS_NAME, 'termly-styles-message-e9e76f'))
                )
                chapter_tabs[index].click()  # Click the chapter tab

                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/div/div/main/div/div/div[2]/div/div[2]/div[3]/div/div/section'))
                )

                extract_and_save_content()  # Extract and save relevant content
            except Exception as e:
                print(f"Retrying due to error: {e}")
                time.sleep(2)  # Wait a bit before retrying
                # The failed attempt may have left the browser on a chapter
                # page, where indexing into the tab list would hit the wrong
                # links; reload the chapter list before the next attempt.
                driver.get(url)
            else:
                break  # Exit retry loop if successful
        else:
            # Every attempt failed: report it instead of silently moving on.
            print(f"Skipping chapter tab {index + 1}: failed after {max_attempts} attempts")
            continue

        # Go back to the main page for the next chapter. This is kept outside
        # the retry loop so a navigation problem cannot re-run the extraction
        # and write the same records twice.
        try:
            driver.back()
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="chapter-wrapper"]/div/div//a'))
            )
        except Exception as e:
            print(f"Could not navigate back ({e}); reloading the chapter list")
            driver.get(url)

except Exception as e:
    print(f"An error occurred while extracting content: {e}")
finally:
    # Close the driver and file; the file is closed even if quitting fails.
    try:
        driver.quit()
    finally:
        output_file.close()

print("Content has been extracted and saved to 'extracted_content.txt'")


Content has been extracted and saved to 'extracted_content.txt'


In [8]:
# Read the scraper output here so this cell does not depend on a variable that
# is only assigned by a cell further down the notebook.
with open("extracted_content.txt", "r", encoding="utf-8") as extracted_file:
    extracted_content = extracted_file.read()

print(extracted_content)

Section: SECTION 1205
SOLAR PHOTOVOLTAIC POWER SYSTEMS
Subsection: 1205.2 Access and pathways.
1205.2 Access and pathways.
Roof access, pathways and spacing requirements shall be provided in accordance with Sections 1205.2.1 through 1205.3.3. Pathways shall be over areas capable of supporting fire fighters accessing the roof. Pathways shall be located in areas with minimal obstructions, such as vent pipes, conduit or mechanical equipment.
Exceptions:
1. Detached, nonhabitable Group U structures including, but not limited to, detached garages serving Group R-3 buildings, parking shade structures, carports, solar trellises and similar structures.
2. Roof access, pathways and spacing requirements need not be provided where the fire code official has determined that rooftop operations will not be employed.
3. Building-integrated photovoltaic (BIPV) systems where the BIPV systems are approved, integrated into the finished roof surface and are listed in accordance with a national test standa

In [2]:
import re

# Function to add spacing for readability
def format_text_for_readability(text, section_number_pattern=r"1205\.\d+"):
    """Insert blank lines into extracted code text and drop label lines.

    Args:
        text: The raw extracted text.
        section_number_pattern: Regular expression matching the section
            number that opens a numbered heading line. The default matches
            section 1205 headings only (for example "1205.2"); pass something
            like r"\\d{3,4}\\.\\d+" to space out headings of other sections.

    Returns:
        The text with all lines starting with "Subsection:" or "Figure"
        (any case) emptied, an extra newline inserted before numbered
        headings and before list items such as "1. " or "a. ", and leading
        and trailing whitespace stripped.
    """
    # Empty the "Subsection: ..." label lines; the heading text is repeated on
    # the following line of the extracted content.
    text = re.sub(r"(?i)^Subsection:.*$", "", text, flags=re.MULTILINE)

    # Put an extra newline in front of every line that starts with a section
    # number. A lookahead is used so the caller's pattern may contain groups.
    text = re.sub(r"\n(?=(?:" + section_number_pattern + r"))", "\n\n", text)

    # Add spacing for bulleted or listed items
    text = re.sub(r"(\n)([A-Za-z]\. |\d+\. )", r"\1\n\2", text)

    # Remove lines that start with "FIGURE" or "Figure"
    text = re.sub(r"(?i)^figure.*$", "", text, flags=re.MULTILINE)

    # Strip leading/trailing whitespace
    return text.strip()

# Step 1: Load the raw scraper output from disk
with open("extracted_content.txt", "r", encoding="utf-8") as raw_file:
    extracted_content = raw_file.read()

# Step 2: Run the text through the readability formatter
formatted_content = format_text_for_readability(extracted_content)

# Step 3: Persist the formatted text next to the raw file
with open("formatted_content.txt", "w", encoding="utf-8") as formatted_file:
    formatted_file.write(formatted_content)

# Optional: show the formatted text below the cell
print("Formatted Content Preview:\n")
print(formatted_content)


Formatted Content Preview:

Section: SECTION 1205
SOLAR PHOTOVOLTAIC POWER SYSTEMS


1205.2 Access and pathways.
Roof access, pathways and spacing requirements shall be provided in accordance with Sections 1205.2.1 through 1205.3.3. Pathways shall be over areas capable of supporting fire fighters accessing the roof. Pathways shall be located in areas with minimal obstructions, such as vent pipes, conduit or mechanical equipment.
Exceptions:

1. Detached, nonhabitable Group U structures including, but not limited to, detached garages serving Group R-3 buildings, parking shade structures, carports, solar trellises and similar structures.

2. Roof access, pathways and spacing requirements need not be provided where the fire code official has determined that rooftop operations will not be employed.

3. Building-integrated photovoltaic (BIPV) systems where the BIPV systems are approved, integrated into the finished roof surface and are listed in accordance with a national test standard deve

In [3]:
# Saving the formatted section to a text file
# NOTE(review): absolute, user-specific Windows path -- the cell fails on any
# machine where this folder does not exist; consider a configurable location.
file_path = "C:\\Users\\user\\Documents\\Aether\\PV_Systems_Design.txt"
# Write as UTF-8, like the other files in this notebook, so scraped characters
# outside the platform's default code page cannot raise UnicodeEncodeError.
with open(file_path, "w", encoding="utf-8") as file:
    file.write("Extracted Section:\n\n")
    file.write(formatted_content)

print(f"\nFormatted and extracted section saved to {file_path}")


Formatted and extracted section saved to C:\Users\user\Documents\Aether\PV_Systems_Design.txt
