In [4]:
!pip install requests beautifulsoup4



In [5]:
YEARS = [str(year) for year in range(2020, 2024)]  # The years for which we must download the PDFs
PDF_DEST_ROOT_PATH = '/home/bharathsk/acads/fall_2023/scientific_entity_recognition/data/pdfs'  # Where the PDFs must be stored; format -> PDF_DEST_ROOT_PATH/<event_name>/<pdf_name>.pdf
BASE_URL = "https://aclanthology.org/"

In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [7]:
response = requests.get(BASE_URL)
soup = BeautifulSoup(response.content, 'html.parser')
hrefs = soup.find_all('a', href=True)
events = [a['href'] for a in hrefs if '/events/' in a['href'] and any(year in a['href'] for year in YEARS)]

In [8]:
def download_pdf(pdf_url, folder_name):
    response = requests.get(pdf_url)
    pdf_name = os.path.join(folder_name, pdf_url.split("/")[-1])
    with open(pdf_name, 'wb') as pdf_file:
        pdf_file.write(response.content)


def get_pdf_links(year_url):
    response = requests.get(year_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    pdf_links = [urljoin(BASE_URL, link['href']) for link in soup.select("a[href$='.pdf']")]
    return pdf_links


def scrape_pdfs_for_years():
    for event in events:
        print(f"Downloading event = {event}")
        event_year_url = urljoin(BASE_URL, event)
        pdf_links = get_pdf_links(event_year_url)
        dest_folder_dir = os.path.join(PDF_DEST_ROOT_PATH, event.split('/')[2])
        os.makedirs(dest_folder_dir, exist_ok=True)
        for pdf_url in pdf_links:
            download_pdf(pdf_url, dest_folder_dir)
            print(f"Downloaded: {pdf_url} @ {dest_folder_dir}")

In [9]:
scrape_pdfs_for_years()

Downloading event = /events/aacl-2022/
Downloaded: https://aclanthology.org/2022.aacl-main.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.0.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.1.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.2.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.3.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.4.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.5.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.6.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.7.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.8.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.9.pdf @ ../../data/pdfs/aacl-2022
Downloaded: https

ConnectTimeout: HTTPSConnectionPool(host='aclanthology.org', port=443): Max retries exceeded with url: /2022.aacl-short.57.pdf (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7ff6d797cb80>, 'Connection to aclanthology.org timed out. (connect timeout=None)'))