# USTC-AD/2024 课程作业 实验报告

| 实验 2          | DBLP 论文信息获取与整理 |
| --------------- | ----------------------- |
| 马天开          | PB21000030              |
| Due: 2024.03.28 | Submitted: 2024.03.14   |

In [1]:
import re
import requests
import json
from pprint import pprint

In [2]:
headers_raw = """
Host: dblp.uni-trier.de
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2
Accept-Encoding: gzip, deflate, br
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Cookie: dblp-search-mode=c; dblp-dismiss-new-feature-2022-01-27=1
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: cross-site
"""

headers = {}
for line in headers_raw.split("\n"):
    if line.strip() == "":
        continue
    k, v = line.split(": ")
    headers[k] = v

print(headers)

{'Host': 'dblp.uni-trier.de', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Sec-GPC': '1', 'Connection': 'keep-alive', 'Cookie': 'dblp-search-mode=c; dblp-dismiss-new-feature-2022-01-27=1', 'Upgrade-Insecure-Requests': '1', 'Sec-Fetch-Dest': 'document', 'Sec-Fetch-Mode': 'navigate', 'Sec-Fetch-Site': 'cross-site'}


In [3]:
# open https://dblp.uni-trier.de/db/conf/kdd/kdd2023.html and save to page.txt
req = requests.get("https://dblp.uni-trier.de/db/conf/kdd/kdd2023.html", headers=headers)
with open("page.txt", "w") as f:
    f.write(req.text)

In [4]:
with open("page.txt", "r") as f:
    page = f.read()

In [5]:
# this is pratically useless since builtin reflection in python is bullshit and json dumps is not working
# using simple dict + list instead
class Paper:
    authors: list[str]
    title: str
    startPage: int
    endPage: int

    def __init__(
        self,
        authors: list[str] = [],
        title: str = "",
        startPage: int = 0,
        endPage: int = 0
    ):
        self.authors = authors
        self.title = title
        self.startPage = startPage
        self.endPage = endPage


class Track:
    track: str
    papers: list[Paper]

    def __init__(
        self,
        track: str = "",
        papers: list[Paper] = []
    ):
        self.track = track
        self.papers = papers


tracks = []

In [6]:
# regex patterns:
# since no DOM parsing package is built-in, we have to use regex to parse the page

track_full_pattern = re.compile(r'<header class="h2">(.*?)<meta property="genre" content="computer science"></li></ul>')
track_name_pattern = re.compile(r'<h2 id="(.+?)">(.+?)</h2>')
paper_full_pattern = re.compile(r'<li class="entry inproceedings"(.*?)<meta property="genre" content="computer science"></li>')
author_name_pattern = re.compile(r'<span itemprop="name" title=".*?">(.*?)</span>')
title_pattern = re.compile(r'<span class="title" itemprop="name">(.+?)</span>')
pagination_pattern = re.compile(r'<span itemprop="pagination">(.+?)</span>')

In [7]:
tracks = []
for track_raw in track_full_pattern.findall(page)[:2]:
    track = {}
    track["track"] = track_name_pattern.search(track_raw).group(2)
    track["papers"] = []

    for paper_raw in paper_full_pattern.findall(track_raw):
        paper = {}
        authors = []
        for author in author_name_pattern.findall(paper_raw):
            authors.append(author)
        paper["authors"] = authors
        paper["title"] = title_pattern.search(paper_raw).group(1)

        pagination = pagination_pattern.search(paper_raw).group(1)
        if '-' in pagination:
            _pagination = pagination.split('-')
            paper["startPage"] = int(_pagination[0])
            paper["endPage"] = int(_pagination[1])
        else:
            paper["startPage"] = int(pagination)
            paper["endPage"] = int(pagination)
        track["papers"].append(paper)

        # print(paper.authors)

    print(track["track"], "\n", len(track["papers"]))

    tracks.append(track)

Research Track Full Papers 
 312
Applied Data Track Full Papers 
 182


In [8]:
# save tracks to json
with open("kdd23.json", "w") as f:
    f.write(json.dumps(tracks, indent=2))

In [9]:
url_pattern = re.compile(r'https://dblp.uni-trier.de/pid/\d+/\d+.html')

author_urls = []
for track_raw in track_full_pattern.findall(page)[:2]:
    for paper_raw in paper_full_pattern.findall(track_raw)[:10]:
        for author in author_name_pattern.findall(paper_raw):
            url = url_pattern.search(paper_raw).group(0)
            author_urls.append(url)

print(author_urls)

['https://dblp.uni-trier.de/pid/211/5760.html', 'https://dblp.uni-trier.de/pid/211/5760.html', 'https://dblp.uni-trier.de/pid/211/5760.html', 'https://dblp.uni-trier.de/pid/247/9288.html', 'https://dblp.uni-trier.de/pid/247/9288.html', 'https://dblp.uni-trier.de/pid/247/9288.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/221/2843.html', 'https://dblp.uni-trier.de/pid/201/1957.html', 'https://dblp.uni-trier.de/pid/201/1957.html', 'https://dblp.uni-trier.de/pid/201/1957.html', 'https://dblp.uni-trier.de/pid/201/1957.html', 'https://dbl

In [10]:
name_pattern = re.compile(r'<span class="name primary" itemprop="name">(.*?)</span>')
orcid_pattern = re.compile(r'<a href="https://orcid.org/([0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X])"')
paper_pattern = re.compile(r'<li class="entry inproceedings toc"(.*?)<meta property="genre" content="computer science"></li>')

publish_info_pattern_A = re.compile(r'<span itemprop="isPartOf" itemscope="" itemtype="http://schema.org/PublicationVolume"><span itemprop="volumeNumber">(.*?)</span></span></a> (<span itemprop="datePublished">(.*?)</span>)')
publish_info_pattern_B = re.compile(r'<span itemprop="name">(.*?)</span></span> <span itemprop="datePublished">(.*?)</span></a>: <span itemprop="pagination">(.*?)</span>')
publish_info_pattern_C = re.compile(r'<span itemprop="name">(.*?)</span></span> <span itemprop="datePublished">(.*?)</span></a>')

def handle_url(url: str):
    text = requests.get(url, headers=headers).text

    name = name_pattern.search(text).group(1)
    try:
        orcid = orcid_pattern.search(text).group(1)
    except:
        orcid = ""

    papers = []
    for paper_raw in paper_pattern.findall(text):
        authors = []
        authors.append(name)
        for author in author_name_pattern.findall(paper_raw):
            authors.append(author)
        title = title_pattern.search(paper_raw).group(1)

        # find publish info & year, try two patterns
        try:
            volume, year = publish_info_pattern_A.search(paper_raw).groups()
            publishInfo = f"{volume} ({year})"
        except:
            try:
                volume, year, pagination = publish_info_pattern_B.search(paper_raw).groups()
                publishInfo = f"{volume} {year}: {pagination}"
            except:
                volume, year = publish_info_pattern_C.search(paper_raw).groups()
                publishInfo = f"{volume} {year}"
        year = int(year)
        if year < 2020:
            continue

        papers.append({
            "authors": authors,
            "title": title,
            "publishInfo": publishInfo,
            "year": year,
        })

    return {
        "name": name,
        "orcid": orcid,
        "papers": papers
    }

# pprint(handle_url(author_urls[0]))

In [11]:
cleaned_author_urls = list(set(author_urls))
print(len(cleaned_author_urls))
authors = []
for url in cleaned_author_urls:
    authors.append(handle_url(url))

19


In [12]:
# save to reserachers.json
with open("researchers.json", "w") as f:
    f.write(json.dumps(authors, indent=2))