**Scraper for ACS journals**

Imports
* requests - fetching pages and images over HTTP
* BeautifulSoup - parsing HTML
* undetected_chromedriver - driving a headless Chrome browser

Set the root url

In [32]:
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path
import requests
from urllib.parse import urljoin
import undetected_chromedriver as uc

# Base url of the ACS publications site; image paths scraped from the pages
# are joined onto this (via urljoin) to build the full download url.
root = "https://pubs.acs.org/"


In [40]:
def download_image(root, stem, timeout=30):
    '''
    Download the image at the url root + stem into the local "images" folder.

    Parameters
    ----------
    root : str
        Base url that `stem` is resolved against with urljoin.
    stem : str
        Path (or url) of the image; its final path component is used as the
        name of the output file.
    timeout : float, optional
        Seconds to wait for the server before requests gives up (default 30).

    Returns
    -------
    str or None
        Path of the file that was written, or None if the server did not
        return a successful response (nothing is written in that case).
    '''

    url = urljoin(root, stem)

    # Get the image; the timeout stops one stalled request from hanging the scrape
    response = requests.get(url, timeout=timeout)

    # Don't save an error page (404, 403, ...) under an image file name
    if not response.ok:
        print(f"Skipping {url}: HTTP {response.status_code}")
        return None

    _, file_ = os.path.split(stem)

    # Make sure the 'images' folder exists (no error if it already does)
    os.makedirs("images", exist_ok=True)

    # Write image to file
    out_path = os.path.join("images", file_)
    with open(out_path, "wb") as out_file:
        out_file.write(response.content)

    return out_path

**Direct download**
* Loop through all issues of the journal
* Parse the html to find the location of the ToC image
* Download the image

In [None]:
# Set the ranges to cover the issues for the journal of interest
# (outer loop: volumes 144 -> 1, inner loop: issues 52 -> 1)
for i in range(144, 0, -1):
    for j in range(52, 0, -1):

        # Substitute <JOURNAL URL>
        URL = f"https://pubs.acs.org/toc/<JOURNAL URL>/{i}/{j}"

        page = requests.get(URL)

        # Not every volume/issue combination exists; skip error responses
        # instead of parsing the error page
        if not page.ok:
            continue

        soup = BeautifulSoup(page.content, "html.parser")

        results = soup.find_all("img", class_="lazy")

        for result in results:
            # The image location is read from the "data-src" attribute;
            # tags without one are skipped rather than raising KeyError
            image_src = result.get("data-src")
            if not image_src:
                continue

            download_image(root, image_src)
            print(image_src)

**If a headless browser is needed (e.g. for Cloudflare protection)**
* This example downloads text from specific classes in the HTML,
* e.g. paper titles and abstracts.
* **Note:** if the text contains non-standard (non-ASCII) characters, check the encoding
* Rows are appended to `<FILENAME>.csv`
* The same approach could also be used to download images

In [44]:
import csv

# newline='' is what the csv module expects for its file objects, and an
# explicit utf-8 encoding lets titles/abstracts with non-ASCII characters be
# written as text. Mode 'a' appends, so re-running adds rows to the same file.
with open('<FILENAME>.csv', 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    # Start ONE headless Chrome instance and reuse it for every page, instead
    # of launching a new browser per issue and never closing it
    options = uc.ChromeOptions()
    options.headless = True
    driver = uc.Chrome(options=options)

    try:
        for i in range(144, 0, -1):
            for j in range(52, 0, -1):
                URL = f"https://pubs.acs.org/toc/<JOURNAL URL>/{i}/{j}"
                # print(f'Volume {i} Issue {j}')

                # Access page via headless chrome driver
                driver.get(URL)
                page = driver.page_source

                # Parse the html
                soup = BeautifulSoup(page, "html.parser")
                results = soup.find_all("div", class_="issue-item clearfix")

                # Find the appropriate content within the html
                # title, image url (for matching image filename), and abstract
                for k in results:
                    # Keep values as str: csv.writer would write bytes as
                    # their literal b'...' repr
                    title = k.a.get('title') if k.a else None
                    if title is None:
                        title = 'None'

                    if k.img:
                        img = k.img.get('data-src')
                    else:
                        img = 'None'

                    if k.p:
                        abstract = k.p.getText()
                    else:
                        abstract = 'None'

                    writer.writerow([title, img, abstract])
    finally:
        # Close the headless browser, even if the scrape stops on an error
        driver.quit()