In [2]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

# Download targets used by this cell: ``p1_folder`` is Chrome's initial
# download directory, ``similar_folder`` receives the later downloads.
p1_folder = r'D:\JEL Codes\P16'
similar_folder = r'D:\JEL Codes\P16\Similar'

# Make sure both directories exist before the browser is pointed at them.
for _target_dir in (p1_folder, similar_folder):
    os.makedirs(_target_dir, exist_ok=True)

def initiate_browser_driver():
    """Start headless Chrome, open the seed SSRN paper and dismiss the cookie banner.

    Chrome's ``download.default_directory`` preference is set to
    ``p1_folder``.

    Returns:
        The ``webdriver.Chrome`` instance, positioned on the seed paper's
        abstract page.  The caller is responsible for calling ``quit()``.

    Raises:
        Exception: anything raised while maximizing the window or loading the
            page is re-raised after the browser has been shut down, so a
            failed start does not leave a Chrome process behind.
    """
    # Imported locally: this cell's import block only brings in
    # StaleElementReferenceException.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--headless')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.9999.999 Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('prefs', {'download.default_directory': p1_folder})

    driver = webdriver.Chrome(options=options)
    try:
        driver.maximize_window()
        driver.get('https://papers.ssrn.com/sol3/papers.cfm?abstract_id=334304')
    except Exception:
        # Do not leak the browser process when start-up fails half-way.
        driver.quit()
        raise

    # Accepting the cookies popup is best effort: the banner may be absent or
    # not clickable.  Only Selenium errors are ignored, so Ctrl-C still works.
    try:
        driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
    except WebDriverException:
        pass
    print("Chromium Driver is Activated!")
    return driver


def _wait_for_new_pdf(folder, files_before, timeout=60):
    """Wait until *folder* holds a finished PDF that is not in *files_before*.

    Returns True once a new ``.pdf`` exists and no ``.crdownload`` file
    remains, False if that has not happened within *timeout* seconds.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        names = os.listdir(folder)
        new_pdf = any(
            name.endswith(".pdf") and name not in files_before for name in names
        )
        in_progress = any(name.endswith(".crdownload") for name in names)
        if new_pdf and not in_progress:
            return True
        time.sleep(1)
    return False


def _click_load_more(driver, timeout=20):
    """Click the "Load More" button; return False when that is not possible."""
    # Imported locally: this cell's import block only brings in
    # StaleElementReferenceException.
    from selenium.common.exceptions import WebDriverException

    try:
        # Nudge the page down before looking for the button.
        driver.execute_script("window.scrollBy(0, 100);")
        load_more_button = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.button-load-more'))
        )
        driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
        load_more_button.click()
        # The old button going stale is taken as the sign that the click worked.
        WebDriverWait(driver, timeout).until(EC.staleness_of(load_more_button))
    except WebDriverException:
        return False
    time.sleep(2)
    return True


def download_pdfs(driver, max_similar_pdfs=40):
    """Download the seed paper's PDF, then the PDFs from its citations widget.

    The first PDF is saved to ``p1_folder``; every later one is saved to
    ``similar_folder``.  The citation entries already on the page are
    processed first, and "Load More" is clicked only afterwards, so a list
    that fits on a single page is no longer skipped.

    Args:
        driver: Chrome WebDriver positioned on the seed paper's abstract page.
        max_similar_pdfs: stop after this many PDFs have arrived in
            ``similar_folder`` (default 40, the previously hard-coded limit).

    Returns:
        None.
    """
    print("Downloading PDFs...")
    first_download_button = driver.find_element(By.XPATH, '//*[@id="maincontent"]/div/div[1]/div/div[1]/a')
    files_before = set(os.listdir(p1_folder))
    driver.execute_script("arguments[0].scrollIntoView();", first_download_button)
    first_download_button.click()

    # Report success only when a new file really showed up; PDFs left over
    # from an earlier run do not count.
    if _wait_for_new_pdf(p1_folder, files_before):
        print("First PDF downloaded successfully.")
    else:
        print("Download timed out")

    if max_similar_pdfs <= 0:
        return

    # Set download location to the similar folder for all subsequent PDFs.
    driver.execute_cdp_cmd("Page.setDownloadBehavior", {"behavior": "allow", "downloadPath": similar_folder})

    pdf_counter = 0  # Number of PDFs that arrived in similar_folder
    processed_links = set()  # hrefs that have already been clicked

    while True:
        # Find the "Download PDF" buttons currently present in the widget
        download_buttons = driver.find_elements(By.XPATH, '//*[@id="citations-widget"]/ol/li/div[4]/a')

        for button in download_buttons:
            try:
                link = button.get_attribute("href")

                # Skip already processed links
                if link in processed_links:
                    continue

                # Only act on anchors whose text marks them as a PDF download
                button_text = button.text.lower()
                if "download" not in button_text and "pdf" not in button_text:
                    continue

                files_before = set(os.listdir(similar_folder))
                driver.execute_script("arguments[0].scrollIntoView();", button)
                button.click()
            except StaleElementReferenceException:
                print("Stale element encountered, going back to previous page...")
                break  # The element list is outdated; re-query after loading more

            processed_links.add(link)

            # A new .pdf with no .crdownload left is treated as a finished
            # download, so no separate size-stability polling is done.
            if _wait_for_new_pdf(similar_folder, files_before):
                pdf_counter += 1
            else:
                print("Download timed out")

            if pdf_counter >= max_similar_pdfs:
                print("Downloaded PDFs on this page.")
                return

        if not _click_load_more(driver):
            print("No more 'Load More' button found or download complete.")
            break

if __name__ == "__main__":
    driver = initiate_browser_driver()
    try:
        download_pdfs(driver)
    finally:
        # Always shut Chrome down, even when the download run raises.
        driver.quit()


Chromium Driver is Activated!
Downloading PDFs...
First PDF downloaded successfully.
No more 'Load More' button found or download complete.


In [10]:
# Correct code for the dissimilar files: downloads papers listed under the seed paper's JEL codes.
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Root directory for this cell's downloads; created up front if missing.
download_folder = r'D:\JEL Codes\P\dissimilar'
os.makedirs(name=download_folder, exist_ok=True)

def initiate_browser_driver():
    """Start headless Chrome, open the SSRN home page and dismiss the cookie banner.

    Chrome's ``download.default_directory`` preference is set to
    ``download_folder``.

    Returns:
        The ``webdriver.Chrome`` instance, positioned on the SSRN home page.
        The caller is responsible for calling ``quit()``.

    Raises:
        Exception: anything raised while maximizing the window or loading the
            page is re-raised after the browser has been shut down, so a
            failed start does not leave a Chrome process behind.
    """
    # Imported locally: the file-level imports do not include the Selenium
    # base exception.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument('--headless')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.9999.999 Safari/537.36')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_experimental_option('prefs', {'download.default_directory': download_folder})

    driver = webdriver.Chrome(options=options)
    try:
        driver.maximize_window()
        driver.get('https://www.ssrn.com/index.cfm/en/')
    except Exception:
        # Do not leak the browser process when start-up fails half-way.
        driver.quit()
        raise

    # Accepting the cookies popup is best effort: the banner may be absent or
    # not clickable.  Only Selenium errors are ignored, so Ctrl-C still works.
    try:
        driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
    except WebDriverException:
        pass

    print("Chromium Driver is Activated!")
    return driver

def extract_jel_codes(driver):
    """Return the JEL codes shown on the page currently loaded in *driver*.

    Three XPaths are tried in order and the first one that matches an element
    is used.  The ``"JEL Classification:"`` label is removed from that
    element's text and the remainder is split on commas.

    Args:
        driver: Selenium WebDriver with the paper's page already loaded.

    Returns:
        list[str]: the whitespace-stripped codes in page order.  Empty when
        none of the XPaths match or the matched text contains no codes
        (blank entries are dropped so callers never receive ``''``).
    """
    candidate_xpaths = (
        '//*[@id="maincontent"]/div/div[3]/div[1]/div[1]/p[4]',
        '//*[@id="maincontent"]/div[2]/div[1]/div[1]/p[4]',
        '//*[@id="maincontent"]/div/div[3]/div[1]/div[1]/p[3]',
    )

    for xpath in candidate_xpaths:
        try:
            jel_code_element = driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            continue
        break
    else:
        # None of the known locations exists on this page.
        return []

    jel_codes_text = jel_code_element.text.replace("JEL Classification:", "")
    stripped_codes = (code.strip() for code in jel_codes_text.split(','))
    return [code for code in stripped_codes if code]

def generate_links_for_jel_code(jel_code, driver, wait_seconds=1):
    """Collect paper links from the first SSRN results page of a JEL code.

    Args:
        jel_code: JEL code to search for; it is percent-encoded before being
            placed in the query string.
        driver: Selenium WebDriver used to load the results page.
        wait_seconds: how long to wait for the first ``div.description`` to
            appear (default 1, the previously hard-coded value).

    Returns:
        set[str]: the ``href`` of the title link in every result entry.
        Empty when no entry shows up within *wait_seconds*.  Being a set, the
        result does not preserve the page's ordering.
    """
    from urllib.parse import quote

    links = set()
    url = (
        'https://papers.ssrn.com/sol3/jweljour_results.cfm?npage=1&form_name=Jel'
        f'&code={quote(str(jel_code), safe="")}'
        '&lim=false&orderBy=ab_approval_date&orderDir=desc&strSelectedOption=6'
    )
    driver.get(url)

    try:
        WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.description'))
        )
    except TimeoutException:
        return links

    for element in driver.find_elements(By.CSS_SELECTOR, 'div.description'):
        try:
            link_element = element.find_element(By.CSS_SELECTOR, 'a[class="title optClickTitle"]')
        except NoSuchElementException:
            continue
        link = link_element.get_attribute('href')
        if link:  # an anchor without an href would otherwise add None
            links.add(link)

    return links


def _pdf_files_under(root):
    """Return the paths of all ``.pdf`` files found anywhere below *root*."""
    return {
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(root)
        for name in filenames
        if name.lower().endswith(".pdf")
    }


def _download_signal(driver, handles_before, pdfs_before):
    """Return "window" or "file" once the click had a visible effect, else None."""
    if any(handle not in handles_before for handle in driver.window_handles):
        return "window"
    if _pdf_files_under(download_folder) - pdfs_before:
        return "file"
    return None


def _close_extra_windows(driver, keep_handle):
    """Close every window except *keep_handle* and make it the active one."""
    try:
        for handle in driver.window_handles:
            if handle != keep_handle:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(keep_handle)
    except Exception as e:
        # Cleanup must not abort the remaining downloads.
        print(f"Error details: {str(e)}")


def download_pdfs(driver, links, num_pdfs_to_download):
    """Visit each paper link and trigger its PDF download.

    A link counts as downloaded when, within 10 seconds of clicking the PDF
    button, either a new browser window opens (it is then awaited until its
    document is fully loaded) or a new ``.pdf`` file appears below
    ``download_folder``.

    NOTE(review): the captured run shows only "download window" timeouts;
    presumably headless Chrome saves the file without opening a window, which
    is why the file check was added - confirm against a real run.

    After every link all extra windows are closed and the driver is switched
    back to the original window, so the next link starts from a known state.

    Args:
        driver: Selenium WebDriver to drive.
        links: iterable of paper URLs.
        num_pdfs_to_download: stop after this many successful downloads.

    Returns:
        None.  Progress and errors are printed; errors do not propagate.
    """
    downloaded_total = 0
    main_window = driver.current_window_handle

    for link in links:
        if downloaded_total >= num_pdfs_to_download:
            break

        print(f"Downloading PDF from link: {link}")

        try:
            driver.get(link)

            # Wait for the PDF button to be clickable
            try:
                pdf_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//*[@id="maincontent"]/div[1]/div[1]/div/div[1]/a/span/img'))
                )
            except TimeoutException:
                print(f"Timeout Exception while waiting for PDF button on: {link}")
                continue  # Skip this link and continue with the next one

            # Snapshot the state so the effect of the click can be detected
            # relative to it rather than against a fixed window count.
            handles_before = set(driver.window_handles)
            pdfs_before = _pdf_files_under(download_folder)

            pdf_button.click()

            # Wait for the download to start
            try:
                signal = WebDriverWait(driver, 10).until(
                    lambda drv: _download_signal(drv, handles_before, pdfs_before)
                )
            except TimeoutException:
                print(f"Timeout Exception while waiting for download window on: {link}")
                continue  # Skip this link and continue with the next one

            if signal == "window":
                # Switch to the download window and wait for it to finish loading
                new_handles = [h for h in driver.window_handles if h not in handles_before]
                driver.switch_to.window(new_handles[-1])
                WebDriverWait(driver, 300).until(
                    lambda drv: drv.execute_script("return document.readyState") == "complete"
                )

            print(f"Downloaded PDF from: {link}")
            downloaded_total += 1

        except NoSuchElementException:
            print(f"Element Not Found while downloading PDF from: {link}")
        except Exception as e:
            print(f"Error during download from: {link}")
            print(f"Error details: {str(e)}")
        finally:
            # Runs on success, on `continue` and on errors alike.
            _close_extra_windows(driver, main_window)


if __name__ == "__main__":
    # Imported here: the file-level imports do not include the Selenium base
    # exception used for the best-effort cookie dismissal below.
    from selenium.common.exceptions import WebDriverException

    driver = initiate_browser_driver()
    try:
        url_to_extract_jel_codes = 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2003531'
        driver.get(url_to_extract_jel_codes)

        # Dismiss the cookie banner on this page too, if it is shown.
        try:
            driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
        except WebDriverException:
            pass

        jel_codes = extract_jel_codes(driver)

        num_jel_codes_to_process = min(5, len(jel_codes))
        num_pdfs_to_download = 40  # Total number of PDFs to download

        # Number of PDFs to download per JEL code.  Guarded so that a page
        # without JEL codes no longer ends in a ZeroDivisionError.
        if num_jel_codes_to_process:
            num_pdfs_per_jel_formula = num_pdfs_to_download // num_jel_codes_to_process
        else:
            num_pdfs_per_jel_formula = 0

        print("Number of JEL Codes to Process:", num_jel_codes_to_process)
        print("Total Number of PDFs to Download:", num_pdfs_to_download)
        print("Number of PDFs to Download per JEL Code (Formula):", num_pdfs_per_jel_formula)

        if not num_jel_codes_to_process:
            print("No JEL codes found; nothing to download.")

        selected_jel_codes = jel_codes[:num_jel_codes_to_process]

        # Collect the candidate links for every JEL code first
        jel_code_links = {}
        for jel_code in selected_jel_codes:
            print("Processing JEL Code:", jel_code)
            jel_code_links[jel_code] = list(generate_links_for_jel_code(jel_code, driver))

        # Download each JEL code's share into its own sub-folder
        for jel_code in selected_jel_codes:
            current_download_folder = os.path.join(download_folder, jel_code)
            os.makedirs(current_download_folder, exist_ok=True)

            links_to_download = jel_code_links[jel_code][:num_pdfs_per_jel_formula]

            # Set the download folder for the current batch
            driver.execute_cdp_cmd('Page.setDownloadBehavior', {'behavior': 'allow', 'downloadPath': current_download_folder})

            download_pdfs(driver, links_to_download, len(links_to_download))
    finally:
        # Close the Selenium WebDriver even when something above raises
        driver.quit()

Chromium Driver is Activated!
Number of JEL Codes to Process: 5
Total Number of PDFs to Download: 40
Number of PDFs to Download per JEL Code (Formula): 8
Processing JEL Code: D47
Processing JEL Code: D61
Processing JEL Code: D71
Processing JEL Code: C72
Processing JEL Code: D82
Downloading PDF from link: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4466506
Timeout Exception while waiting for download window on: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4466506
Downloading PDF from link: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4529978
Timeout Exception while waiting for download window on: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4529978
Downloading PDF from link: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4532911
Timeout Exception while waiting for download window on: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4532911
Downloading PDF from link: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4377561
Timeout Exception while w

Timeout Exception while waiting for download window on: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4497780
