In [18]:
import os
import time

from selenium import webdriver
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [19]:
# Download destinations: the seed paper goes into `p1_folder`, the PDFs of the
# papers listed in its citations widget go into the nested `similar_folder`.
p1_folder = r'D:\JEL Codes\P19'
similar_folder = r'D:\JEL Codes\P19\Similar'

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
os.makedirs(p1_folder, exist_ok=True)
os.makedirs(similar_folder, exist_ok=True)

In [20]:
def initiate_browser_driver():
    """Start a headless Chrome session and open the SSRN paper page.

    Chrome's default download directory is set to the module-level
    ``p1_folder``. After the page is loaded, the cookie-consent banner is
    dismissed on a best-effort basis (its absence is not an error).

    Returns:
        The running ``webdriver.Chrome`` instance. It is returned open; the
        caller must eventually call ``quit()`` on it.

    Raises:
        Whatever Selenium raises while starting Chrome or loading the page.
        If the failure happens after Chrome has started, the session is shut
        down before the exception propagates so no browser process is leaked.
    """
    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--headless')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.9999.999 Safari/537.36')
    # Drop the "enable-automation" switch and the automation extension.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # Files downloaded by this session land in p1_folder by default.
    options.add_experimental_option('prefs', {'download.default_directory': p1_folder})

    driver = webdriver.Chrome(options=options)
    try:
        driver.maximize_window()
        driver.get('https://papers.ssrn.com/sol3/papers.cfm?abstract_id=334304')

        # Accept the cookies popup if it is there. Only Selenium errors are
        # ignored, so Ctrl-C and programming errors are no longer swallowed.
        try:
            driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
        except WebDriverException:
            pass
    except BaseException:
        # Do not leave an orphaned Chrome/chromedriver process behind.
        driver.quit()
        raise

    print("Chromium Driver is Activated!")
    return driver


In [21]:
def _snapshot_files(folder):
    """Return ``{file name: modification time}`` for every file in ``folder``."""
    snapshot = {}
    for entry in os.scandir(folder):
        if entry.is_file():
            snapshot[entry.name] = entry.stat().st_mtime
    return snapshot


def _wait_for_new_pdf(folder, before, timeout=60, poll_interval=1):
    """Wait until a PDF that is new (or rewritten) relative to ``before`` exists.

    Args:
        folder: Directory the browser downloads into.
        before: Result of ``_snapshot_files(folder)`` taken just before the
            download was triggered.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory checks.

    Returns:
        True when a finished PDF showed up, False on timeout (after printing
        "Download timed out").
    """
    deadline = time.monotonic() + timeout
    while True:
        current = _snapshot_files(folder)
        # Chrome keeps partial downloads in *.crdownload files; ignore ones
        # that were already lying around before this download started.
        still_writing = any(
            name.endswith(".crdownload") and name not in before
            for name in current
        )
        has_new_pdf = any(
            name.lower().endswith(".pdf") and before.get(name) != mtime
            for name, mtime in current.items()
        )
        if has_new_pdf and not still_writing:
            return True
        if time.monotonic() >= deadline:
            print("Download timed out")
            return False
        time.sleep(poll_interval)


def download_pdfs(driver, p1_folder, similar_folder,
                  paper_url="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=334304",
                  max_pdfs=40):
    """Download a paper's PDF and the PDFs listed in its citations widget.

    The paper at ``paper_url`` is downloaded into ``p1_folder``. The page is
    then reopened at its ``#paper-citations-widget`` anchor and every entry
    whose link text mentions "download"/"pdf" and whose download counter is
    not "0" is downloaded into ``similar_folder``. "Load More" is clicked
    repeatedly until it stops appearing.

    Args:
        driver: Selenium Chrome driver (``execute_cdp_cmd`` is required).
        p1_folder: Destination directory for the first PDF.
        similar_folder: Destination directory for the widget PDFs.
        paper_url: Abstract page to start from.
        max_pdfs: Stop after this many widget PDFs were downloaded successfully.

    Returns:
        None.

    Raises:
        TimeoutException: if the first download button does not become
            clickable within 20 seconds.

    Note:
        The driver is quit only when the citation list is exhausted. When the
        ``max_pdfs`` limit is reached the function returns with the driver
        still open, exactly as before.
    """
    print("Downloading PDFs...")

    citation_links_xpath = '//*[@id="citations-widget"]/ol/li/div[4]/a'

    def scroll_to_element(element):
        driver.execute_script("arguments[0].scrollIntoView();", element)

    def route_downloads_to(folder):
        driver.execute_cdp_cmd("Page.setDownloadBehavior", {"behavior": "allow", "downloadPath": folder})

    # Step 1: Open the initial paper URL and click its download button
    driver.get(paper_url)
    first_download_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="maincontent"]/div/div[1]/div/div[1]/a'))
    )
    scroll_to_element(first_download_button)

    # Make the p1_folder argument authoritative instead of relying on the
    # download directory the driver happened to be created with.
    route_downloads_to(p1_folder)
    files_before = _snapshot_files(p1_folder)
    first_download_button.click()

    # Success is only reported when a new PDF really arrived.
    if _wait_for_new_pdf(p1_folder, files_before):
        print("First PDF downloaded successfully.")

    # Step 2: Add "#paper-citations-widget" to the URL
    driver.get(driver.current_url + "#paper-citations-widget")

    pdf_counter = 0  # Number of widget PDFs downloaded successfully
    processed_links = set()  # hrefs that were already handled (downloaded or skipped)

    while True:
        # Handle the links that are currently on the page BEFORE asking for
        # more, so a widget without any "Load More" button is still processed.
        try:
            download_buttons = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.XPATH, citation_links_xpath))
            )
        except TimeoutException:
            download_buttons = []

        for button in download_buttons:
            try:
                link = button.get_attribute("href")

                # Skip already processed links
                if link in processed_links:
                    continue

                button_text = button.text.lower()

                # Only links whose text indicates a PDF download are of interest
                if "download" not in button_text and "pdf" not in button_text:
                    processed_links.add(link)
                    continue

                # Skip entries whose associated downloads counter is "0"
                downloads_span = button.find_element(By.XPATH, '../../div/span')
                if downloads_span.text.strip() == "0":
                    processed_links.add(link)
                    continue

                scroll_to_element(button)

                # Send this download to similar_folder
                route_downloads_to(similar_folder)
                files_before = _snapshot_files(similar_folder)
                button.click()
                downloaded = _wait_for_new_pdf(similar_folder, files_before)

                # Mark as handled even after a timeout so one broken link
                # cannot stall every later pass for another full timeout.
                processed_links.add(link)

                if downloaded:
                    pdf_counter += 1
                    if pdf_counter >= max_pdfs:
                        print("Downloaded PDFs on this page.")
                        return  # Stop once the limit is reached

            except StaleElementReferenceException:
                # The DOM changed underneath us; re-collect the links on the next pass.
                print("Stale element encountered, going back to the previous page...")
                break

        try:
            # Scroll down to ensure the "Load More" button is in view
            driver.execute_script("window.scrollBy(0, 100);")

            load_more_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//*[@id="citations-widget"]/button'))
            )
            scroll_to_element(load_more_button)
            load_more_button.click()

            # Wait for PDF download links to be present again
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, citation_links_xpath))
            )
        except TimeoutException:
            print("No more 'Load More' button found or download complete.")
            break

    driver.quit()  # Close the browser when done

In [22]:
if __name__ == "__main__":
    # Entry point: start the browser, run the download workflow, and always
    # shut the browser down, even when the workflow raises.
    driver = initiate_browser_driver()
    try:
        download_pdfs(driver, p1_folder, similar_folder)
    finally:
        driver.quit()

Chromium Driver is Activated!
Downloading PDFs...
First PDF downloaded successfully.
Stale element encountered, going back to the previous page...
Downloaded PDFs on this page.
