In [2]:
# Install the packages (only once)
!pip install selenium
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.1


In [4]:
# load libraries
import os
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By # Read specific information on web
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Look up the current working directory of this kernel and display it
# as the cell's output.
import os

cwd = os.getcwd()
cwd

# The code below is based on the following YouTube tutorial:
## https://www.youtube.com/watch?v=Qdya-7XElmc&list=PLX2zkZ3VCcLwlVlgxFLtLLIP-s0VJd_IQ&index=3

### Define the saving path

In [10]:
# Folder that will receive the downloaded PDFs. It is read by the Chrome
# options cell ("download.default_directory"), so it must be defined before
# that cell runs.
# The default is a "downloaded_pdfs" folder inside the current working
# directory; replace the expression with your own absolute path if you want
# the files somewhere else.
save_pdf_path = os.path.join(os.getcwd(), "downloaded_pdfs")

# Create the folder up front (no error if it already exists).
os.makedirs(save_pdf_path, exist_ok=True)

### Set options 

In [12]:
# ChromeOptions is the object through which we configure how the
# Selenium-controlled Chrome behaves on the target website.
options = webdriver.ChromeOptions()

# Chrome preferences that control downloading:
#   "download.default_directory"         -> the folder where PDFs will be saved.
#   "download.prompt_for_download"       -> False, so Chrome does not ask before each download.
#   "plugins.always_open_pdf_externally" -> True, so a PDF is downloaded directly
#                                           instead of being opened in the PDF viewer.
download_prefs = {
    "download.default_directory": save_pdf_path,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True,
}
options.add_experimental_option('prefs', download_prefs)

### Create the driver to interact with the website

In [14]:
# Build the Chrome driver that automates the interaction with the target website.
# ChromeDriverManager().install() returns the chromedriver location, which is
# wrapped in a Service and handed to Chrome together with the options set above.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

### Set the URL of the target website and link the driver

In [16]:
# Address of the target website; change it here to scrape a different case.
URL = "https://documents.dps.ny.gov/public/MatterManagement/CaseMaster.aspx?MatterSeq=63186&MNO=20-E-0380"

# Timeout (in seconds) handed to driver.implicitly_wait() below.
# IMPORTANT: set this number properly for your connection.
IMPLICIT_WAIT_SECONDS = 60

# Point the browser controlled by the driver at the target website.
driver.get(URL)

# Maximize the window so that nothing on the page is omitted.
driver.maximize_window()

# NOTE: implicitly_wait() does not pause the notebook. It sets how long
# Selenium keeps retrying an element lookup before giving up.
# You should therefore still make sure the website has loaded completely
# before running the next cell.
driver.implicitly_wait(IMPLICIT_WAIT_SECONDS)

### Main code to download the PDFs automatically

In [18]:
# Trial run: download only the first MAX_FILES documents listed on the page.
#
# Key observations (visible with the browser's "inspect" function):
#   1. Every PDF is exposed through an <a></a> element, so all <a> elements
#      are extracted first.
#   2. A document link has an "href" containing both "ViewDoc" and "DocRefId".
#   3. The text after the last "=" in the "href" is the document id. When it
#      is "", the link leads to another webpage rather than a PDF, so that
#      file is recorded as skipped.

# Stop after this many document links have been processed.
# Change the number for debugging, or set it to None to process ALL PDFs.
MAX_FILES = 20
# Seconds to pause after each click to give the download time to complete.
# time.sleep() is used because driver.implicitly_wait() does not pause; it
# only changes the element-lookup timeout.
DOWNLOAD_WAIT_SECONDS = 5

all_a_tags = driver.find_elements(By.XPATH, "//a")
# File counter
file_count = 0
# A set of skipped file indices
skipped_file_list = set()

for a in all_a_tags:
    a_attrs = a.get_attribute('href')

    # ignore anchors that have no "href" or are not document links
    if not (a_attrs and "ViewDoc" in a_attrs and "DocRefId" in a_attrs):
        continue

    # found a file
    file_count += 1
    file_id = a_attrs.split("=")[-1]
    if file_id == "":
        # empty id: record the skipped number and move on
        print(f"Skipped file number = {file_count}")
        skipped_file_list.add(file_count)
    else:
        # read the link text before clicking, then click to download the PDF
        file_name = a.get_attribute('text')
        a.click()
        time.sleep(DOWNLOAD_WAIT_SECONDS)
        print(f"Downloaded file numer = {file_count} | name = {file_name}")  # comment this line when the number of PDFs is large.

    # The limit is checked after every processed link (downloaded or skipped),
    # so a skipped link at the limit cannot let the loop overshoot it.
    if MAX_FILES is not None and file_count >= MAX_FILES:
        break

# Print the total number of the PDFs
print(f"Total File Processed Number = {file_count}")
# Print the PDFs that are skipped.
print(f"Skipped file numbers = {skipped_file_list}")
print()

Downloaded file numer = 1 | name = Training Certification Report
Skipped file number = 2
Skipped file number = 3
Downloaded file numer = 4 | name = Request for Exception from Disclosure
Downloaded file numer = 5 | name = Cover Letter
Downloaded file numer = 6 | name = AMI NY Semi-Annual Report (May 2024)
Downloaded file numer = 7 | name = Request for Exception from Disclosure 
Downloaded file numer = 8 | name = GBE Q4 FY24 Quarterly Report
Downloaded file numer = 9 | name = Cover Letter
Downloaded file numer = 10 | name = National Grid - Q4 FY24 IT&D Report
Downloaded file numer = 11 | name = Cover Letter
Downloaded file numer = 12 | name = Earnings Adjustment Mechanisms First Quarter 2024 Report
Downloaded file numer = 13 | name = Cover Letter
Downloaded file numer = 14 | name = FY24 Q4 CIP Quarterly Report
Downloaded file numer = 15 | name = Cover Letter
Downloaded file numer = 16 | name = NMPC 2024 Q1 Electric Reliability Report
Downloaded file numer = 17 | name = Cover Letter
Downl

In [20]:
# Full run: download every document listed on the page.
#
# Key observations (visible with the browser's "inspect" function):
#   1. Every PDF is exposed through an <a></a> element, so all <a> elements
#      are extracted first.
#   2. A document link has an "href" containing both "ViewDoc" and "DocRefId".
#   3. The text after the last "=" in the "href" is the document id. When it
#      is "", the link leads to another webpage rather than a PDF, so that
#      file is recorded as skipped.

# Seconds to pause after each click to give the download time to complete.
# time.sleep() is used because driver.implicitly_wait() does not pause; it
# only changes the element-lookup timeout.
DOWNLOAD_WAIT_SECONDS = 5

all_a_tags = driver.find_elements(By.XPATH, "//a")
# File counter
file_count = 0
# A set of skipped file indices
skipped_file_list = set()

for a in all_a_tags:
    a_attrs = a.get_attribute('href')

    # ignore anchors that have no "href" or are not document links
    if not (a_attrs and "ViewDoc" in a_attrs and "DocRefId" in a_attrs):
        continue

    # found a file
    file_count += 1
    file_id = a_attrs.split("=")[-1]
    if file_id == "":
        # empty id: record the skipped number and move on
        print(f"Skipped file number = {file_count}")
        skipped_file_list.add(file_count)
    else:
        # read the link text before clicking, then click to download the PDF
        file_name = a.get_attribute('text')
        a.click()
        time.sleep(DOWNLOAD_WAIT_SECONDS)
        print(f"Downloaded file numer = {file_count} | name = {file_name}")  # comment this line when PDF number is large.

# Print the total number of the PDFs
print(f"Total File Processed Number = {file_count}")
# Print the PDFs that are skipped.
print(f"Skipped file numbers = {skipped_file_list}")
print()

Downloaded file numer = 1 | name = Training Certification Report
Skipped file number = 2
Skipped file number = 3
Downloaded file numer = 4 | name = Request for Exception from Disclosure
Downloaded file numer = 5 | name = Cover Letter
Downloaded file numer = 6 | name = AMI NY Semi-Annual Report (May 2024)
Downloaded file numer = 7 | name = Request for Exception from Disclosure 
Downloaded file numer = 8 | name = GBE Q4 FY24 Quarterly Report
Downloaded file numer = 9 | name = Cover Letter
Downloaded file numer = 10 | name = National Grid - Q4 FY24 IT&D Report
Downloaded file numer = 11 | name = Cover Letter
Downloaded file numer = 12 | name = Earnings Adjustment Mechanisms First Quarter 2024 Report
Downloaded file numer = 13 | name = Cover Letter
Downloaded file numer = 14 | name = FY24 Q4 CIP Quarterly Report
Downloaded file numer = 15 | name = Cover Letter
Downloaded file numer = 16 | name = NMPC 2024 Q1 Electric Reliability Report
Downloaded file numer = 17 | name = Cover Letter
Downl

In [22]:
# For some reason, the website we are examining sometimes shows the error message "Either the document does not exists or some problem occured due to unexpected reason." When that happens, the scraping code leaves those tabs open after downloading the other files.
# For this case there are six such documents:
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={A01E428A-0000-C73E-9FD4-83865E771CDC}
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={50F6418A-0000-C673-80F8-FB2C4BD16B1D}
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={396F7402-55A0-4D11-BE5C-A84246A47452}
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={18A8166E-3EBC-41ED-A7AC-1C30F10B9EF8}
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={2693ABE8-1CEA-4AAF-97D5-28E5730F33C7}
# https://documents.dps.ny.gov/public/Common/ViewDoc.aspx?DocRefId={FB22FAE0-F39A-4486-B4DF-8C4FA0E2A77A}
# We just need to open these links ourselves and download the documents manually.

In [24]:
# End the browser session. quit() closes every window/tab opened by the driver
# (including tabs left open by failed document links) and shuts the driver
# down, whereas close() would only close the currently focused window.
driver.quit()