# DOI **Web Scraping**

In [112]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Proceedings landing page on dl.acm.org (proceedings DOI 10.1145/2814864);
# the article list on this page is what the cells below scrape.
url = 'https://dl.acm.org/doi/proceedings/10.1145/2814864'

In [113]:
# Fetch the proceedings page and parse its HTML.
# The timeout (seconds) keeps the cell from hanging indefinitely if the
# server never answers.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error status (4xx/5xx) instead of silently skipping
# the parse: otherwise `soup` would be undefined on a fresh kernel, or stale
# from an earlier run, when the next cell uses it.
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

In [109]:
# Build the article table: one row per 'issue-item-container' <div> found in
# the parsed page. Rows are collected in a plain list and the DataFrame is
# created once at the end; growing a DataFrame with pd.concat inside the loop
# copies the whole frame on every iteration (quadratic) and concatenating
# onto an empty frame emits a FutureWarning in recent pandas versions.
columns = ['doi', 'title', 'authors', 'doi_url', 'pages']
records = []

# Get article metadata
issue_item_container = soup.find_all('div', {'class': 'issue-item-container'})

for item in issue_item_container:
    # Get article title and DOI. The title link has the form '/doi/<DOI>'.
    h5 = item.find('h5', {'class': 'issue-item__title'})
    if h5 is None or h5.a is None or not h5.a.get('href'):
        # No linked title means there is no DOI to identify the entry by;
        # skip it rather than abort the whole scrape with an AttributeError.
        continue
    title = h5.text
    doi = h5.a['href'].replace('/doi/', '')

    # Get authors: the first link carrying both 'href' and 'title' attributes
    # inside each <li>. Links whose text starts with 'Create' are skipped,
    # as they are not author names.
    temp_authors = []
    for list_item in item.find_all('li'):
        a_tag = list_item.find('a', {'href': True, 'title': True})
        if a_tag is None:
            continue
        if a_tag.text.strip().startswith('Create'):
            continue
        temp_authors.append(a_tag.text)

    # Get pages: first <span> of the detail <div>, with the 'pp ' prefix
    # removed. Left as None when the page does not provide it.
    div_detail = item.find('div', {'class': 'issue-item__detail'})
    span = div_detail.span if div_detail is not None else None
    pages = span.text.replace('pp ', '') if span is not None else None

    # Get DOI URL
    doi_url = 'https://doi.org/' + doi

    records.append({'doi': doi,
                    'title': title,
                    'authors': ', '.join(temp_authors),
                    'doi_url': doi_url,
                    'pages': pages})

# Create DataFrame in one step; `columns` fixes the column order and keeps
# the expected columns even when no article was found.
df = pd.DataFrame(records, columns=columns)

In [115]:
# Preview the first five scraped articles.
df.head(5)

Unnamed: 0,doi,title,authors,doi_url,pages
0,10.1145/2814864.2814887,Crowdsourced semantic annotation of scientific...,"Jaana Takis, AQM Saiful Islam, Christoph Lange...",https://doi.org/10.1145/2814864.2814887,1–8
1,10.1145/2814864.2814870,Complex event extraction from real-time news s...,"Alexandra La Fleur, Kia Teymourian, Adrian Pas...",https://doi.org/10.1145/2814864.2814870,9–16
2,10.1145/2814864.2814869,An elastic and scalable spatiotemporal query p...,"Hoan Nguyen Mau Quoc, Danh Le Phuoc",https://doi.org/10.1145/2814864.2814869,17–24
3,10.1145/2814864.2814879,Toward a statistical data integration environm...,"Ba-Lam Do, Tuan-Dat Trinh, Peb Ruswono Aryan, ...",https://doi.org/10.1145/2814864.2814879,25–32
4,10.1145/2814864.2814867,The role of reasoning for RDF validation,"Thomas Bosch, Erman Acar, Andreas Nolle, Kai E...",https://doi.org/10.1145/2814864.2814867,33–40
