In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import concurrent.futures
from pprint import pprint
from datetime import datetime
from selenium.common.exceptions import NoSuchElementException
from datetime import datetime, timedelta

def initiate_browser_driver(item):
    r"""Start a headless Chrome session whose downloads go to a per-item folder.

    The download directory is ``D:\JEL Codes\<code>_<start>_<end>``, built from
    the first entry of ``item['jelcodes']`` and the item's ``'start_date'`` and
    ``'end_date'`` strings.

    After the browser is created the SSRN landing page is opened and the
    cookie-consent button is clicked if it can be found.

    Args:
        item: dict with the keys ``'jelcodes'`` (non-empty sequence),
            ``'start_date'`` and ``'end_date'``.

    Returns:
        The ready-to-use ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    # The backslashes are Windows path separators, so they are written as
    # explicit escapes instead of relying on invalid escape sequences
    # ("\J", "\{") being passed through unchanged.
    run_name = f'{item["jelcodes"][0]}_{item["start_date"]}_{item["end_date"]}'
    folder_name = 'D:\\JEL Codes\\' + run_name
    prefs = {'download.default_directory': folder_name}
    options.add_experimental_option('prefs', prefs)
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    try:
        driver.maximize_window()
        driver.get('https://www.ssrn.com/index.cfm/en/')
    except BaseException:
        # Do not leave an orphaned browser process behind if start-up fails.
        driver.quit()
        raise

    # Accepting cookies popup. This is best effort: the popup may be absent
    # or not clickable, and neither case should stop the run.
    try:
        driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
    except Exception:
        pass
    pprint("Chromium Driver is Activated!")
    return driver


def generate_links_for_each_input(inputs, driver):
    """Collect paper links per input item, filtered by posted date.

    For every item whose ``'jelcodes'`` is not None, each JEL code's result
    listing is walked page by page (``npage=1, 2, ...``) until a page contains
    no ``div.description`` entries. A paper's link is kept when its posted
    date lies between the item's ``'start_date'`` and ``'end_date'``, both
    ends inclusive.

    Args:
        inputs: iterable of dicts with the keys ``'jelcodes'``,
            ``'start_date'`` and ``'end_date'``; dates use the
            ``'%d %b %Y'`` format (e.g. ``'1 Jun 2022'``).
        driver: Selenium WebDriver used to load the result pages.

    Returns:
        A list with one set of link URLs per processed item. Items whose
        ``'jelcodes'`` is None are skipped and contribute no entry.

    Raises:
        ValueError: if an item's ``'start_date'`` or ``'end_date'`` does not
            match the ``'%d %b %Y'`` format.
    """
    pprint("Generating Links for each Input")
    date_pattern = '%d %b %Y'
    output = []
    for item in inputs:
        if item['jelcodes'] is None:
            continue

        # Parse the range once per item. Parsed dates are at midnight, so a
        # plain `start <= posted <= end` comparison already includes the end
        # day; adding a day to the end would also admit the following day.
        start_date = datetime.strptime(item['start_date'], date_pattern)
        end_date = datetime.strptime(item['end_date'], date_pattern)

        links = set()
        for jelcode in item['jelcodes']:
            pprint("Searching for {} jelcode".format(jelcode))
            page = 1
            while True:
                url = 'https://papers.ssrn.com/sol3/jweljour_results.cfm?npage={}&form_name=Jel&code={}&lim=false&orderBy=ab_approval_date&orderDir=desc&strSelectedOption=6'.format(page, jelcode)
                driver.get(url)
                linksEle = driver.find_elements(By.CSS_SELECTOR, 'div.description')
                if not linksEle:
                    # An empty page marks the end of the listing.
                    break

                for ele in linksEle:
                    try:
                        date_element = ele.find_element(By.CSS_SELECTOR, 'div.note.note-list>span:nth-child(2)')
                        date_text = date_element.get_attribute('innerHTML').replace("Posted:", "").strip()
                        if "Last Revised" in date_text:
                            # The second span holds the revision note, so the
                            # posted date is read from the first span instead.
                            date_element = ele.find_element(By.CSS_SELECTOR, 'div.note.note-list>span:nth-child(1)')
                            date_text = date_element.get_attribute('innerHTML').replace("Posted:", "").strip()

                        if not date_text:
                            continue

                        try:
                            posted_date = datetime.strptime(date_text, date_pattern)
                        except ValueError:
                            # Unrecognised date text: skip this entry rather
                            # than abort the whole crawl.
                            continue

                        if start_date <= posted_date <= end_date:
                            link_element = ele.find_element(By.CSS_SELECTOR, 'a[class="title optClickTitle"]')
                            links.add(link_element.get_attribute('href'))
                    except NoSuchElementException:
                        continue

                page += 1

        pprint("********************************")
        pprint(item)
        pprint(links)
        pprint("********************************")
        output.append(links)

    return output


# def download_free_pdf_from_links(links, driver):
#     pprint("Starting Downloading of generated Links!")
#     for link_set in links:
#         for link in link_set:
#             driver.execute_script('''window.open("{}","_blank");'''.format(link))
#             driver.switch_to.window(driver.window_handles[-1])
#             try:
#                 pdfButton = driver.find_element(By.CSS_SELECTOR, 'img[alt="PDF icon"]')
#                 pdfButton.click()
#                 # Halt the process for 5 seconds for download to begin
#                 time.sleep(5)
#                 pprint(link + " Downloaded!")
#             except:
#                 pprint(link + " Paid Paper")
#                 pass
#             driver.close()
#             driver.switch_to.window(driver.window_handles[0])

def download_pdf(link, driver):
    """Open ``link`` in a new tab and click its PDF icon to start a download.

    Prints ``"<link> Downloaded!"`` after the click and a 5 second pause, or
    ``"<link> Paid Paper"`` when the icon cannot be found or clicked. The new
    tab is always closed and focus is returned to the first window.

    Args:
        link: URL of the paper page.
        driver: Selenium WebDriver that owns the browser session.
    """
    # The URL is passed as a script argument instead of being spliced into
    # the JavaScript source, so quotes in the link cannot break the script.
    driver.execute_script('window.open(arguments[0], "_blank");', link)
    driver.switch_to.window(driver.window_handles[-1])
    try:
        pdfButton = driver.find_element(By.CSS_SELECTOR, 'img[alt="PDF icon"]')
        pdfButton.click()
        # Halt the process for 5 seconds for download to begin
        time.sleep(5)
        pprint(link + " Downloaded!")
    except Exception:
        # Still best effort, but KeyboardInterrupt/SystemExit are no longer
        # swallowed and misreported as a paid paper.
        pprint(link + " Paid Paper")
    finally:
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

def download_free_pdf_from_links(links_set, driver):
    """Download every link in ``links_set`` through the given driver.

    The links are processed one after another. A single WebDriver session has
    one focused window at a time, and ``download_pdf`` switches tabs on it, so
    running several downloads concurrently on the same driver would let them
    interfere with each other's tab switching.

    A failure on one link is reported and does not stop the remaining
    downloads.

    Args:
        links_set: iterable of paper URLs.
        driver: Selenium WebDriver shared by all downloads.
    """
    for link in links_set:
        try:
            download_pdf(link, driver)
        except Exception as error:
            pprint(f"{link} Failed: {error!r}")

# One entry per crawl: the JEL codes to search and the inclusive posted-date
# range, with dates written as '%d %b %Y'.
inputs = [
    {'jelcodes': ['G00'], 'start_date': '1 Jun 2022', 'end_date': '27 Jun 2022'}
]

for item in inputs:
    driver = initiate_browser_driver(item)
    try:
        output = generate_links_for_each_input(inputs=[item], driver=driver)
        # The result is empty when the item was skipped (no JEL codes).
        if output:
            download_free_pdf_from_links(links_set=output[0], driver=driver)
        # Give the last download a moment to start before the browser closes.
        time.sleep(5)
    finally:
        # Always release the browser, even when a step above fails.
        driver.quit()



'Chromium Driver is Activated!'
'Generating Links for each Input'
'Searching for G00 jelcode'
'********************************'
{'end_date': '27 Jun 2022', 'jelcodes': ['G00'], 'start_date': '1 Jun 2022'}
{'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4109485',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4112932',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4116467',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4124748',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4130640',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4132689',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4133389',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4134885',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4143867',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4147249'}
'********************************'
'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4143867 Downloaded!'
'https://papers.ssrn.com/sol3/papers.cfm?abst

In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
from pprint import pprint
import datetime


def initiate_browser_driver(download_dir='/Users/nancygahlot/Downloads/'):
    """Start a headless Chrome session and dismiss the SSRN cookie popup.

    Args:
        download_dir: directory Chrome is told to save downloads to. The
            default is the path this function previously had hard-coded.

    Returns:
        The ready-to-use ``webdriver.Chrome`` instance. The caller is
        responsible for calling ``quit()`` on it.
    """
    options = Options()
    prefs = {'download.default_directory': download_dir}
    options.add_experimental_option('prefs', prefs)
    # add comment to next line to see the browser working, for debugging.
    options.add_argument('--headless=new')
    driver = webdriver.Chrome(options=options)

    try:
        driver.maximize_window()
        driver.get('https://www.ssrn.com/index.cfm/en/')
    except BaseException:
        # Do not leave an orphaned browser process behind if start-up fails.
        driver.quit()
        raise

    # accepting cookies popup. This is best effort: the popup may be absent
    # or not clickable, and neither case should stop the run.
    try:
        driver.find_element(By.CSS_SELECTOR, 'button#onetrust-accept-btn-handler').click()
    except Exception:
        pass
    pprint("Chromium Driver is Activated!")
    return driver

# Reference dates: today's local date and the date 365 days before it.
current_date = datetime.datetime.now().date()
one_year_ago = current_date + datetime.timedelta(days=-365)

def generate_links_for_each_input(input, driver):
    """Collect, per input item, the links common to all of its JEL codes.

    For each JEL code of an item the code's result page is opened, every
    keyword of the item is typed into the "search within" box and submitted,
    and the links shown on the resulting page are gathered. The item's result
    is the intersection of the link sets of all its JEL codes.

    Only the first page of each search result is read.

    Args:
        input: iterable of dicts with the keys ``'jelcodes'`` and
            ``'keywords'``. Items where either value is None are skipped.
        driver: Selenium WebDriver used to run the searches.

    Returns:
        A list with one set of link URLs per processed item (empty list when
        nothing was processed).
    """
    pprint("Generating Links for each Input")
    output = []
    for item in input:
        if item['jelcodes'] is None or item['keywords'] is None:
            continue

        links = []
        for jelcode in item['jelcodes']:
            pprint("searching for {} jelcode".format(jelcode))
            driver.get('https://papers.ssrn.com/sol3/jweljour_results.cfm?code={}'.format(jelcode))
            # entering the keyword
            for keyword in item['keywords']:
                # Look the box up again each time, because submitting reloads
                # the page and invalidates the previous element.
                searchBox = driver.find_element(By.CSS_SELECTOR, 'input#txtKey_Words_within')
                # Clear any text left over from the previous keyword so the
                # terms are not concatenated. NOTE(review): assumes the site
                # may pre-fill the box after a search; confirm.
                searchBox.clear()
                searchBox.send_keys(keyword)
                searchBox.send_keys(Keys.RETURN)
            ## currently only extracting links on page 1 of the search result
            linksEle = driver.find_elements(By.CSS_SELECTOR, 'a[class="title optClickTitle"]')
            tempLinks = {ele.get_attribute('href') for ele in linksEle}
            pprint("generated {} links for {} jel code".format(len(tempLinks), jelcode))
            links.append(tempLinks)

        ## intersection of all the links. set.intersection() needs at least
        ## one set, so an item with no JEL codes yields an empty result.
        resultSet = set.intersection(*links) if links else set()
        pprint("********************************")
        pprint(item)
        pprint(resultSet)
        pprint("********************************")
        output.append(resultSet)

    # Return only after every item has been handled. Returning from inside
    # the loop would stop after the first item and return None for empty input.
    return output


def download_free_pdf_from_links(links, driver, items=None):
    """Open every link in a new tab and click its PDF icon to download it.

    Prints ``"<link> Downloaded!"`` after the click and a 5 second pause, or
    ``"<link> Paid Paper"`` when the icon cannot be found or clicked. Each tab
    is closed afterwards and focus returns to the first window.

    Args:
        links: sequence of iterables of paper URLs, one per input item.
        driver: Selenium WebDriver that owns the browser session.
        items: optional sequence of descriptions (e.g. the input dicts)
            matching ``links`` by position; when given, the matching entry is
            printed before its links are downloaded.
    """
    pprint("Starting Downloading of generated Links!")
    # Iterate over the argument itself rather than a module-level variable,
    # so the function works regardless of what the caller's globals contain.
    labels = items if items is not None else ()
    for index, link_group in enumerate(links):
        if index < len(labels):
            pprint(labels[index])
        for link in link_group:
            # The URL is passed as a script argument instead of being spliced
            # into the JavaScript source.
            driver.execute_script('window.open(arguments[0], "_blank");', link)
            driver.switch_to.window(driver.window_handles[-1])
            try:
                pdfButton = driver.find_element(By.CSS_SELECTOR, 'img[alt="PDF icon"]')
                pdfButton.click()
                # halt the process for 5 seconds for download to begin
                time.sleep(5)
                pprint(link + " Downloaded!")
            except Exception:
                # Still best effort, but KeyboardInterrupt/SystemExit are no
                # longer swallowed and misreported as a paid paper.
                pprint(link + " Paid Paper")
            finally:
                driver.close()
                driver.switch_to.window(driver.window_handles[0])


# Both 'jelcodes' and 'keywords' must be provided for an entry to be processed.
# NOTE: this name shadows the builtin input(); it is kept because the
# functions in this cell receive it through the `input=` keyword.
input = [
    # {'jelcodes':['C50','G11','C00','G00'],'keywords':['GTAA','Quantitative','Tactical Asset Allocation']},
    # {'jelcodes':['G00','G11','G21','G33'],'keywords':['risk management','Quantitative','convertible bond']}
    {'jelcodes':['G33'],'keywords':['risk management','Quantitative','convertible bond']}
]

driver = initiate_browser_driver()
try:
    output = generate_links_for_each_input(input=input, driver=driver)
    download_free_pdf_from_links(links=output, driver=driver)

    # Give the last download a moment to start before the browser closes.
    time.sleep(5)
finally:
    # Always release the browser, even when a step above fails.
    driver.quit()

'Chromium Driver is Activated!'
'Generating Links for each Input'
'searching for G33 jelcode'
'generated 56 links for G33 jel code'
'********************************'
{'jelcodes': ['G33'],
 'keywords': ['risk management', 'Quantitative', 'convertible bond']}
{'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1344745',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1502467',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1616477',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1895984',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2023998',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2054816',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2263871',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2400101',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2419016',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2668653',
 'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2827808',
 'https://papers.ssrn.com/sol3/papers

'https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4397933 Paid Paper, sad!'
