In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
from collections import OrderedDict

# Configure Chrome to run without a visible window.
# The `options.headless = True` convenience attribute was deprecated and then
# removed from Selenium 4, so on current releases that assignment no longer
# makes the browser headless. Passing the command-line switch directly works
# on old and new Selenium versions alike.
options = Options()
options.add_argument("--headless")

url = "https://codes.iccsafe.org/content/WAFC2021P1/chapter-12-energy-systems#WAFC2021P1_Pt03_Ch12_Sec1205"

# Start the browser. Everything that uses it runs inside try/finally so the
# Chrome process is always shut down, even when navigation or scraping raises.
driver = webdriver.Chrome(options=options)

try:
    driver.get(url)

    # Fixed pause to give the page time to finish rendering before it is queried.
    time.sleep(3)

    # Locate the main section container
    section_content = driver.find_element(By.ID, "WAFC2021P1_Pt03_Ch12_Sec1205")

    # 1. Extract titles from the <h1> tags inside the section
    titles = [h1.text.strip() for h1 in section_content.find_elements(By.TAG_NAME, "h1")]
    print("\nTitles:\n", "\n".join(titles))

    # 2. Extract the full text from <h1>, <p>, and <li> tags.
    # An ordered mapping is used as an ordered set: keys keep first-seen order
    # and repeated text is stored only once.
    unique_texts = OrderedDict()

    def add_unique_text(text):
        """Record `text` in `unique_texts` unless it is empty or already present."""
        if text and text not in unique_texts:
            unique_texts[text] = None  # value is unused; only the keys matter

    # Visit <h1>, <p>, and <li> elements in the order the XPath query returns them
    for element in section_content.find_elements(By.XPATH, ".//h1 | .//p | .//li"):
        add_unique_text(element.text.strip())

    # Concatenate all unique lines into a single newline-separated string
    full_text = "\n".join(unique_texts.keys())
    print("\nFull Text:\n", full_text)

finally:
    # Close the browser and end the WebDriver session
    driver.quit()



Titles:
 SECTION 1205
SOLAR PHOTOVOLTAIC POWER SYSTEMS
1205.1 General.
1205.2 Access and pathways.
1205.2.1 Solar photovoltaic systems for Group R-3 residential and buildings built under the International Residential Code.
1205.2.1.1 Pathways to ridge.
1205.2.1.2 Setbacks at ridge.
1205.2.1.3 Alternative setbacks at ridge.
1205.2.2 Emergency escape and rescue openings.
1205.2.3 Building-integrated photovoltaic (BIPV) systems.
1205.3 Other than Group R-3 buildings.
1205.3.1 Perimeter pathways.
1205.3.2 Interior pathways.
1205.3.3 Smoke ventilation.
1205.4 Buildings with rapid shutdown.
1205.4.1 Rapid shutdown type.
1205.4.1.1 Diagram.
1205.4.1.2 Location.
1205.4.2 Buildings with more than one rapid shutdown type.
1205.4.3 Rapid shutdown switch.
1205.5 Ground-mounted photovoltaic panel systems.
1205.5.1 Vegetation control.
1205.6 Size of solar photovoltaic array.

Full Text:
 SECTION 1205
SOLAR PHOTOVOLTAIC POWER SYSTEMS
1205.1 General.
Installation, modification, or alteration of solar

In [2]:
import re

def extract_sections(full_text):
    """Pull two excerpts out of `full_text` and return them joined together.

    The first excerpt runs from the first occurrence of the heading
    "SOLAR PHOTOVOLTAIC POWER SYSTEMS" up to, but not including, the next
    occurrence of "1205.4 Buildings with rapid shutdown.". Any whitespace
    that sits directly before that cut-off heading is kept.

    The second excerpt runs from the first occurrence of
    "1205.6 Size of solar photovoltaic array." to the end of the text, with
    surrounding whitespace stripped.

    Args:
        full_text: The text to search (a string).

    Returns:
        The two excerpts separated by a blank line. When an excerpt cannot be
        located, a "... section not found." placeholder sentence is used in
        its place instead of raising.
    """
    opening_heading = "SOLAR PHOTOVOLTAIC POWER SYSTEMS"
    cutoff_heading = "1205.4 Buildings with rapid shutdown."
    closing_heading = "1205.6 Size of solar photovoltaic array."

    # First excerpt: needs the opening heading AND a cut-off heading after it.
    first_part = "First specified section not found."
    opening_at = full_text.find(opening_heading)
    if opening_at >= 0:
        cutoff_at = full_text.find(cutoff_heading, opening_at + len(opening_heading))
        if cutoff_at >= 0:
            # Slice stops right where the cut-off heading begins.
            first_part = full_text[opening_at:cutoff_at]

    # Second excerpt: everything from the closing heading to the end.
    closing_at = full_text.find(closing_heading)
    if closing_at >= 0:
        second_part = full_text[closing_at:].strip()
    else:
        second_part = "Second specified section not found."

    return "\n\n".join((first_part, second_part))

# Run the extraction on the scraped text and show the result under a heading.
extracted_sections = extract_sections(full_text)
print("Extracted Sections:\n", extracted_sections, sep="\n")


Extracted Sections:

SOLAR PHOTOVOLTAIC POWER SYSTEMS
1205.1 General.
Installation, modification, or alteration of solar photovoltaic power systems shall comply with this section. Due to the emerging technologies in the solar photovoltaic industry, it is understood fire code officials may need to amend prescriptive requirements of this section to meet the requirements for fire fighter access and product installations. Section 104.10, Alternative materials and methods, of this code shall be considered when approving the installation of solar photovoltaic power systems. Solar photovoltaic power systems shall be installed in accordance with Sections 1205.2.1 through 1205.6, the International Building Code, and chapter 19.28 RCW.
1205.2 Access and pathways.
Roof access, pathways and spacing requirements shall be provided in accordance with Sections 1205.2.1 through 1205.3.3. Pathways shall be over areas capable of supporting fire fighters accessing the roof. Pathways shall be located in ar

In [4]:
import re

def format_text_for_readability(text):
    """Insert blank lines into `text` so headings and list items stand apart.

    Two passes are applied, in this order:

    1. A line that begins with a ``1205.<digits>`` section number gets one
       extra newline inserted in front of it.
    2. A line that begins with a list marker (a single letter or a run of
       digits, followed by a period and a space, e.g. ``a. `` or ``12. ``)
       gets one extra newline inserted in front of it.

    Only lines preceded by a newline are affected, so the very first line of
    `text` is never changed. The function is not idempotent: every call adds
    another newline in front of each matching line.

    Args:
        text: The text to reformat (a string).

    Returns:
        The reformatted text with leading and trailing whitespace stripped.
    """
    # Blank line BEFORE each numbered section heading. The lookahead leaves the
    # heading itself untouched; only the newline in front of it is doubled.
    text = re.sub(r"\n(?=1205\.\d+)", "\n\n", text)

    # Blank line before each lettered or numbered list item.
    text = re.sub(r"\n(?=[A-Za-z]\. |\d+\. )", "\n\n", text)

    # A former substitution that replaced a newline with a newline was removed
    # here: it never changed the text.
    return text.strip()

# Format the extracted text and preview it under a heading.
formatted_section = format_text_for_readability(extracted_sections)
print("Formatted Section Preview:\n", formatted_section, sep="\n")


Formatted Section Preview:

SOLAR PHOTOVOLTAIC POWER SYSTEMS

1205.1 General.
Installation, modification, or alteration of solar photovoltaic power systems shall comply with this section. Due to the emerging technologies in the solar photovoltaic industry, it is understood fire code officials may need to amend prescriptive requirements of this section to meet the requirements for fire fighter access and product installations. Section 104.10, Alternative materials and methods, of this code shall be considered when approving the installation of solar photovoltaic power systems. Solar photovoltaic power systems shall be installed in accordance with Sections 1205.2.1 through 1205.6, the International Building Code, and chapter 19.28 RCW.

1205.2 Access and pathways.
Roof access, pathways and spacing requirements shall be provided in accordance with Sections 1205.2.1 through 1205.3.3. Pathways shall be over areas capable of supporting fire fighters accessing the roof. Pathways shall be loca

In [5]:
# Save the formatted section to a text file, preceded by a short heading line.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable location so the notebook runs on other machines.
file_path = "C:\\Users\\user\\Documents\\Aether\\PV_Systems_Design.txt"

# Write as UTF-8 explicitly. Without an encoding argument, open() uses the
# platform default (a legacy code page on Windows), which can raise
# UnicodeEncodeError when the text contains characters outside that code page.
with open(file_path, "w", encoding="utf-8") as file:
    file.write("Extracted Section:\n\n")
    file.write(formatted_section)

print(f"\nFormatted and extracted section saved to {file_path}")


Formatted and extracted section saved to C:\Users\user\Documents\Aether\PV_Systems_Design.txt
