In [1]:
import requests
from bs4 import BeautifulSoup
import time
from openpyxl import Workbook, load_workbook

# Base URL of the page to scrape
base_url = "https://dl.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&ConceptID=118290&ConceptID=118211&expand=all&startPage={}&pageSize=50"

# Excel file to store the titles, URLs, badges, and abstracts
excel_file = "titles_urls_badges_and_abstracts.xlsx"

# Clear the file at the start by creating a new workbook and writing the header
wb = Workbook()
ws = wb.active
ws.title = "Scraped Data"
ws.append(["Title", "URL", "Badges", "Abstract"])
wb.save(excel_file)


# Loop through the pages from startPage=0 to startPage=71
for page in range(72):
    print(f"Getting page {page} of 40")
    # Construct the URL for the current page
    url = base_url.format(page)

    # Send a request to the URL
    response = requests.get(url)
    response.raise_for_status()  # Ensure we got a valid response

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all the list items with class "search__item issue-item-container"
    search_items = soup.find_all("li", class_="search__item issue-item-container")

    # Iterate over each search item and extract the necessary information
    for item in search_items:
        title_tag = item.find("span", class_="hlFld-Title")
        if title_tag:
            title = title_tag.get_text(strip=True)
            link_tag = title_tag.find("a")
            href = link_tag["href"] if link_tag else None
            full_url = f"https://dl.acm.org{href}" if href else None

            # Extract the alt text of the image in the publisher badge
            badges = []
            publisher_badge_div = item.find("div", class_="publisher-badge")
            img_tag = publisher_badge_div.find("img") if publisher_badge_div else None
            if img_tag and "alt" in img_tag.attrs:
                badges.append(img_tag["alt"])

            # Extract the data titles from the 'a' tags in the img-badget divs
            img_badget_divs = item.find_all("div", class_="img-badget")
            for div in img_badget_divs:
                a_tag = div.find("a")
                if a_tag and "data-title" in a_tag.attrs:
                    badges.append(a_tag["data-title"])

            # Join all badges with a semicolon
            badges_str = "; ".join(badges)

            # Extract the abstract from the search results page
            abstract_div = item.find("div", class_="issue-item__abstract")
            abstract_p = abstract_div.find("p") if abstract_div else None
            abstract = abstract_p.get_text(strip=True) if abstract_p else None


            # Append the data to the Excel file immediately
            wb = load_workbook(excel_file)
            ws = wb.active
            ws.append([title, full_url, badges_str, abstract])
            wb.save(excel_file)

            # Sleep for 10 seconds between each search result request
            time.sleep(2)

    # Sleep for 10 seconds between each page request
    time.sleep(10)

print(f"Data saved to {excel_file}")

Getting page 0 of 40
Getting page 1 of 40
Getting page 2 of 40
Getting page 3 of 40
Getting page 4 of 40
Getting page 5 of 40
Getting page 6 of 40
Getting page 7 of 40
Getting page 8 of 40
Getting page 9 of 40
Getting page 10 of 40
Getting page 11 of 40
Getting page 12 of 40
Getting page 13 of 40
Getting page 14 of 40
Getting page 15 of 40
Getting page 16 of 40
Getting page 17 of 40
Getting page 18 of 40
Getting page 19 of 40
Getting page 20 of 40
Getting page 21 of 40
Getting page 22 of 40
Getting page 23 of 40
Getting page 24 of 40
Getting page 25 of 40
Getting page 26 of 40
Getting page 27 of 40
Getting page 28 of 40
Getting page 29 of 40
Getting page 30 of 40
Getting page 31 of 40
Getting page 32 of 40
Getting page 33 of 40
Getting page 34 of 40
Getting page 35 of 40
Getting page 36 of 40
Getting page 37 of 40
Getting page 38 of 40
Getting page 39 of 40
Getting page 40 of 40
Getting page 41 of 40
Getting page 42 of 40
Getting page 43 of 40
Getting page 44 of 40
Getting page 45 of 4