In [1]:
!pip install requests beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Publication years whose PDFs are downloaded, kept as strings (2020 through 2023 inclusive).
YEARS = list(map(str, range(2020, 2024)))

# Root directory for the downloaded files.
# Layout: PDF_DEST_ROOT_PATH/<event_name>/<pdf_name>.pdf
PDF_DEST_ROOT_PATH = './pdfs'

# Home page of the site that is crawled; also used as the base for resolving hrefs.
BASE_URL = "https://aclanthology.org/"

In [3]:
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [4]:
# Fetch the home page and collect the event links for the configured years.
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(BASE_URL, timeout=30)
# Stop here on a 4xx/5xx answer instead of parsing an error page into a wrong event list.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
hrefs = soup.find_all('a', href=True)
# Keep hrefs that contain '/events/' and mention one of YEARS.
# dict.fromkeys drops repeated links while preserving page order.
events = list(dict.fromkeys(
    a['href']
    for a in hrefs
    if '/events/' in a['href'] and any(year in a['href'] for year in YEARS)
))

In [5]:
def download_pdf(pdf_url, folder_name, timeout=60):
    """Download a single PDF into ``folder_name``.

    The file is named after the last ``/``-separated segment of ``pdf_url``.
    The body is streamed into ``<name>.part`` and renamed to its final name
    only once the transfer has finished, so a failed or interrupted download
    never leaves a truncated ``.pdf`` behind.

    Args:
        pdf_url: URL of the PDF to fetch.
        folder_name: Directory to write into; it must already exist.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``); nothing is written in that case.
        requests.RequestException: On other request failures.
        OSError: If the destination cannot be written.
    """
    pdf_name = os.path.join(folder_name, pdf_url.split("/")[-1])
    partial_name = pdf_name + ".part"

    # stream=True avoids holding the whole PDF in memory; the context manager
    # releases the connection even when an error is raised.
    with requests.get(pdf_url, stream=True, timeout=timeout) as response:
        # Without this check an HTML error page would be saved as a ".pdf".
        response.raise_for_status()
        try:
            with open(partial_name, 'wb') as pdf_file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    pdf_file.write(chunk)
        except BaseException:
            # Covers KeyboardInterrupt too: discard the incomplete file, then re-raise.
            if os.path.exists(partial_name):
                os.remove(partial_name)
            raise

    # Atomic on the same filesystem; overwrites an existing file like the original 'wb' open did.
    os.replace(partial_name, pdf_name)


def get_pdf_links(year_url, timeout=30):
    """Return the absolute URLs of all PDF links found on one page.

    A link counts as a PDF when its ``href`` ends in ``.pdf``
    (selector ``a[href$='.pdf']``).

    Args:
        year_url: URL of the page to scan.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).

    Returns:
        A list of absolute URLs in page order, each listed once.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
        requests.RequestException: On other request failures.
    """
    response = requests.get(year_url, timeout=timeout)
    # An error page contains no PDF links; fail loudly rather than return [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Relative hrefs are relative to the page they appear on (after redirects),
    # not to the site root; absolute hrefs are returned unchanged by urljoin.
    page_url = response.url or year_url
    pdf_links = [urljoin(page_url, link['href']) for link in soup.select("a[href$='.pdf']")]

    # dict.fromkeys removes repeated links while keeping first-seen order.
    return list(dict.fromkeys(pdf_links))


def scrape_pdfs_for_years(event_hrefs=None):
    """Download every PDF linked from each event page.

    For each event href the page is fetched with ``get_pdf_links`` and every
    PDF is saved with ``download_pdf`` into
    ``PDF_DEST_ROOT_PATH/<event_name>/``. A request failure on one page or one
    PDF is reported and skipped so the rest of the crawl still runs.

    Args:
        event_hrefs: Iterable of event hrefs (relative to ``BASE_URL`` or
            absolute). Defaults to the module-level ``events`` list.
            Repeated hrefs are processed once.
    """
    if event_hrefs is None:
        event_hrefs = events

    for event in dict.fromkeys(event_hrefs):
        print(f"Downloading event = {event}")
        event_year_url = urljoin(BASE_URL, event)

        # The folder name is the path segment after "events". Working on the
        # URL path (not the raw href) keeps absolute hrefs from producing the
        # host name and short hrefs from raising IndexError.
        segments = [part for part in urlsplit(event_year_url).path.split('/') if part]
        if 'events' in segments[:-1]:
            event_name = segments[segments.index('events') + 1]
        elif segments:
            event_name = segments[-1]
        else:
            print(f"Skipped: cannot derive an event name from {event}")
            continue

        try:
            pdf_links = get_pdf_links(event_year_url)
        except requests.RequestException as error:
            print(f"Failed to list PDFs for {event_year_url}: {error}")
            continue

        dest_folder_dir = os.path.join(PDF_DEST_ROOT_PATH, event_name)
        os.makedirs(dest_folder_dir, exist_ok=True)
        for pdf_url in pdf_links:
            try:
                download_pdf(pdf_url, dest_folder_dir)
            except requests.RequestException as error:
                # One bad link must not abort the remaining downloads.
                print(f"Failed: {pdf_url} ({error})")
                continue
            print(f"Downloaded: {pdf_url} @ {dest_folder_dir}")

In [6]:
# Run the crawl: for every entry in `events`, fetch the event page and save each linked PDF
# under PDF_DEST_ROOT_PATH/<event_name>/. This issues one HTTP request per PDF and prints a
# line per file, so expect a long-running cell with a lot of output.
scrape_pdfs_for_years()

Downloading event = /events/aacl-2022/
Downloaded: https://aclanthology.org/2022.aacl-main.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.0.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.1.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.2.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.3.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.4.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.5.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.6.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.7.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.8.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.9.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2022.aacl-main.10.pdf @ ./pdfs/aacl-2022
Downloaded: https://aclanthology.org/2


KeyboardInterrupt

