# Computer Science Papers

In [154]:
import pandas as pd

# Initialize empty list to store all publications
all_publications = []

# Note: We'll need to collect data from all the lab parsing results in subsequent cells
# This creates the structure for the dataframe
df = pd.DataFrame(columns=['authors', 'lab', 'title', 'year', 'department'])

print("DataFrame structure created with columns:", df.columns.tolist())

DataFrame structure created with columns: ['authors', 'lab', 'title', 'year', 'department']


### Notre Dame Computer Vision Laboratory

In [155]:
LABORATORY = "Notre Dame Computer Vision Laboratory"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://cvrl.nd.edu/publications/"

In [156]:
import requests
from bs4 import BeautifulSoup

response = requests.get(URL)
html = response.text  

In [157]:
from bs4 import BeautifulSoup
import re
import json

def parse_pubs(html: str):
    soup = BeautifulSoup(html, "html.parser")
    year_re = re.compile(r"pub_info_(\d{4})")
    current_year = None
    results = []

    # walk in document order
    for el in soup.descendants:
        if not getattr(el, "name", None):
            continue

        # 1) did we hit a year anchor?
        if el.name == "a" and "anchor" in el.get("class", []):
            m = year_re.match(el.get("id", ""))
            if m:
                current_year = int(m.group(1))
            continue

        # 2) did we hit a publication card?
        if el.name == "div" and "pub-card" in el.get("class", []):
            if current_year is None:
                # card is above the first year header → skip
                continue

            # title
            title_tag = el.select_one(".card-header b")
            title = title_tag.get_text(strip=True) if title_tag else ""

            # authors line
            p = el.select_one(".pre_print_item p")
            authors = ""
            if p:
                line = p.get_text(" ", strip=True)
                # cut at the section year so we drop ", ... 2002."
                authors = line.split(str(current_year))[0].rstrip(" ,.;\u00a0")

            results.append({
                "year": current_year,
                "title": title,
                "authors": authors,
            })

    return results


In [158]:
data = parse_pubs(html)

In [None]:
print(data)

In [159]:
names = [
    "Bowyer",
    "Czajka",
    "Flynn",
    "Moreira",
    "Scheirer",
]

filtered_data = [entry for entry in data if any(name in entry["authors"] for name in names)]

In [160]:
# Add filtered Notre Dame Computer Vision Laboratory data to the dataframe
for entry in filtered_data:
    new_row = {
        'authors': entry['authors'],
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(filtered_data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 373 publications from Notre Dame Computer Vision Laboratory
Total publications in dataframe: 373


In [None]:
print(filtered_data)

### Data Interference, Analytics, and Learning Laboratory

In [161]:
LABORATORY = "Data Interference, Analytics, and Learning Laboratory"
DEPARTMENT_NAME = "Notre Dame Computer Science and Engineering"
URL = "https://lucyinstitute.nd.edu/centers-and-labs/dial-publications/"

In [162]:
response = requests.get(URL)
html = response.text

In [163]:
print(html)

<!doctype html>
<html lang="en-US" prefix="og: https://ogp.me/ns#" class="no-js">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="preconnect" href="https://static.nd.edu/" crossorigin>
  <link rel="preconnect" href="https://ajax.googleapis.com/">
  <link rel="preconnect" href="https://fonts.googleapis.com/">
  <link rel="preconnect" href="https://fonts.gstatic.com/" crossorigin>
  <link rel="profile" href="https://gmpg.org/xfn/11">
  <link rel="icon" href="https://lucyinstitute.nd.edu/wp-content/themes/ndlucy/favicon.ico">
  <link rel="apple-touch-icon" sizes="76x76" href="https://static.nd.edu/images/webclips/default/webclip-76.png">
  <link rel="apple-touch-icon" sizes="114x114" href="https://static.nd.edu/images/webclips/default/webclip-114.png">
  <link rel="apple-touch-icon" sizes="120x120" href="https://static.nd.edu/images/webclips/default/webclip-120.png">
  <link rel="apple-touch-icon" sizes="144x144" href="ht

In [164]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


# curly and straight quotes that might wrap the title
QUOTE_CHARS = ['"', '“', '”', '„', '‟', '«', '»', '‟', '‹', '›']

def fetch_page(url: str) -> BeautifulSoup | None:
    resp = requests.get(url, timeout=15)
    if resp.status_code != 200:
        return None
    return BeautifulSoup(resp.text, "html.parser")

def first_between_quotes(text: str) -> str | None:
    # find first opening-like quote and the next closing-like quote
    idxs = [text.find(q) for q in QUOTE_CHARS if q in text]
    if not idxs:
        return None
    start = min(i for i in idxs if i != -1)
    # search for a closing quote after start
    for j in range(start + 1, len(text)):
        if text[j] in QUOTE_CHARS:
            return text[start+1:j].strip()
    return None

def parse_li_text(li_text: str, year: int) -> dict | None:
    """
    li_text example:
    "Anna Sokol, Elizabeth Daly, ... Nitesh V Chawla. “BenchmarkCards: ...” arXiv"
    We want authors (before first quote) and title (inside quotes).
    """
    # normalize spaces
    text = " ".join(li_text.split())
    # 1) title
    title = first_between_quotes(text)

    # 2) authors: everything before first quote (or before title) minus trailing dot
    if title:
        before_title = text.split(title, 1)[0]
        # strip leading/trailing quotes again just in case
        for q in QUOTE_CHARS:
            before_title = before_title.replace(q, " ")
        authors = before_title.strip().rstrip(".").strip()
    else:
        # fallback: assume authors end at the first period
        parts = text.split(".", 1)
        authors = parts[0].strip()
        title = parts[1].strip() if len(parts) > 1 else ""

    if not authors or not title:
        return None

    return {
        "year": year,
        "authors": authors,
        "title": title,
    }

def scrape_all_pages(base_url: str = BASE_URL, max_pages: int = 50):
    all_pubs = []

    for page in range(1, max_pages + 1):
        url = base_url if page == 1 else urljoin(base_url, f"{page}/")
        soup = fetch_page(url)
        if soup is None:
            break  # no more pages

        # find all h2s that introduce a section
        h2s = soup.select("h2.wp-block-heading")
        any_year_found = False

        for h2 in h2s:
            heading_text = h2.get_text(strip=True)
            # skip "Preprints"
            if heading_text.lower().startswith("preprint"):
                continue

            # check if this h2 looks like a year
            m = re.fullmatch(r"\d{4}", heading_text)
            if not m:
                continue

            year = int(heading_text)
            any_year_found = True

            # the list right after the h2
            ol = h2.find_next_sibling("ol")
            if not ol:
                continue

            for li in ol.find_all("li", recursive=False):
                li_text = li.get_text(" ", strip=True)
                item = parse_li_text(li_text, year)
                if item:
                    all_pubs.append(item)

        # if the page had literally no year sections, we can stop early
        if not any_year_found:
            break

    return all_pubs


In [165]:
pubs = scrape_all_pages()
# just peek at the first 10
for p in pubs[:10]:
    print(p)
print(f"total publications: {len(pubs)}")

{'year': 2025, 'authors': 'Anna Sokol, Elizabeth Daly, Michael Hind, David Piorkowski, Xiangliang Zhang, Nuno Moniz, Nitesh V Chawla', 'title': 'BenchmarkCards: Standardized Documentation for Large Language Model Benchmarks.'}
{'year': 2025, 'authors': 'Anna Sokol, Matthew L', 'title': 'Sisk, Josefina Echavarría Alvarez, and Nitesh Chawla. 2025. Ventana a la Verdad (Window to the Truth): A Chatbot Application for Navigating The Colombian Truth Commission’s Archives. In Proceedings of the Eighteenth ACM International Conference on Web Search and Data Mining (WSDM ’25) PDF'}
{'year': 2025, 'authors': 'Germino, Joe, Nuno Moniz, and Nitesh V. Chawla', 'title': 'Intersectional Divergence: Measuring Fairness in Regression.'}
{'year': 2025, 'authors': 'Grigorii Khvatskii, Yong Suk Lee, Corey Angst, Maria Gibbs, Robert Landers, Nitesh V. Chawla', 'title': 'Do Multimodal Large Language Models Understand Welding?'}
{'year': 2025, 'authors': 'Deng Pan, Nuno Moniz, Nitesh Chawla. 2025', 'title': '

In [166]:
names = [
    "Chawla",
    "Moniz",
]

filtered_pubs = [p for p in pubs if any(name in p["authors"] for name in names)]

In [167]:
# Add filtered Data Interference, Analytics, and Learning Laboratory publications to the dataframe
for entry in filtered_pubs:
    new_row = {
        'authors': entry['authors'],
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(filtered_pubs)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 941 publications from Data Interference, Analytics, and Learning Laboratory
Total publications in dataframe: 1314


In [None]:
print(filtered_pubs)

### Prof. Weninger research team

In [168]:
LABORATORY = "Prof. Weninger research team"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://timweninger.com/publications/"

In [169]:
response = requests.get(URL)
html = response.text

In [34]:
print(html)

<!DOCTYPE html>
<html lang="en-US">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <link rel="profile" href="http://gmpg.org/xfn/11">
<title>Publications &#8211; Tim Weninger, PhD</title>
<meta name='robots' content='max-image-preview:large' />

<script>
MathJax = {
  tex: {
    inlineMath: [['$','$'],['\\(','\\)']], 
    processEscapes: true
  },
  options: {
    ignoreHtmlClass: 'tex2jax_ignore|editor-rich-text'
  }
};

</script>
<link rel='dns-prefetch' href='//cdn.jsdelivr.net' />
<link rel='dns-prefetch' href='//fonts.googleapis.com' />
<link rel='preconnect' href='//www.google-analytics.com' />
<link rel="alternate" type="application/rss+xml" title="Tim Weninger, PhD &raquo; Feed" href="https://timweninger.com/feed/" />
<link rel="alternate" type="application/rss+xml" title="Tim Weninger, PhD &raquo; Comments Feed" href="https://timweninger.com/comments/feed/"

In [170]:
from bs4 import BeautifulSoup
import re

def parse_publications(html: str):
    soup = BeautifulSoup(html, "html.parser")

    # find the publications list
    ol = soup.find("ol", reversed=True)
    if not ol:
        return {"papers": []}

    papers = []
    for li in ol.find_all("li", recursive=False):
        # 1. title
        title = None
        strong = li.find("strong")
        if strong:
            a = strong.find("a")
            if a and a.get_text(strip=True):
                title = a.get_text(strip=True)
            else:
                title = strong.get_text(strip=True)

        # 2. full text (for year)
        full_text = li.get_text(" ", strip=True)
        year = None
        m = re.search(r"(\d{4})(?!.*\d{4})", full_text)
        if m:
            year = int(m.group(1))

        # 3. authors = everything before the first <strong>
        authors = []
        authors_text = ""
        # collect all nodes before <strong>
        for child in li.contents:
            if getattr(child, "name", None) == "strong":
                break
            # text node or tag before strong
            authors_text += child.get_text(" ", strip=True) if hasattr(child, "get_text") else str(child)

        # normalize authors text
        if authors_text:
            # turn " and " into ", " so splitting is consistent
            normalized = authors_text.replace(" and ", ", ")
            for part in normalized.split(","):
                name = part.strip(" .;\n\t")
                if name:
                    # optional: drop bare "et al."
                    if name.lower().startswith("et al"):
                        continue
                    authors.append(name)

        papers.append(
            {
                "year": year,
                "authors": authors,
                "title": title,
            }
        )

    return papers


In [171]:
data = parse_publications(html)

In [None]:
print(data)

In [172]:
# Add filtered Prof. Weninger research team publications to the dataframe
for entry in data:
    new_row = {
        'authors': ', '.join(entry['authors']) if isinstance(entry['authors'], list) else str(entry['authors']),
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 128 publications from Prof. Weninger research team
Total publications in dataframe: 1442


### Natural Language Processing Group

In [173]:
LABORATORY = "Natural Language Processing Group"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://nlp.nd.edu/papers.html"

In [174]:
response = requests.get(URL)
html = response.text

In [48]:
print(html)

<!DOCTYPE html>
<html lang="en" class="no-js">
<head>
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <meta name="apple-mobile-web-app-title" content="Natural Language Processing at Notre Dame">
  <meta name="application-name" content="Natural Language Processing at Notre Dame">
  <title>Natural Language Processing at Notre Dame</title>
  <link rel="icon" type="image/png" href="https://static.nd.edu/images/monogram/gold/monogram-32.png" sizes="32x32">
  <link rel="icon" type="image/png" href="https://static.nd.edu/images/monogram/gold/monogram-96.png" sizes="96x96">
  <link rel="apple-touch-icon" sizes="180x180" href="https://static.nd.edu/images/webclips/default/webclip-180.png">
  <link rel="apple-touch-icon" sizes="512x512" href="https://static.nd.edu/images/webclips/default/webclip-512.png">
  <link rel="mask-icon" href="https://static.nd.edu/images/monogram/monogram.svg" color="#002b5c">
  <link rel="stylesheet" href="css/foundation.css">

In [175]:
from bs4 import BeautifulSoup
import re

def extract_pubs_title_year_authors(html: str):
    soup = BeautifulSoup(html, "html.parser")
    out = []

    for div in soup.select("div.pub"):
        # drop the hidden BibTeX so its text doesn't pollute parsing
        bib = div.find("div", class_="bibtex")
        if bib:
            bib.decompose()

        # unprotect titles like <span class="bibtex-protected">NLP</span>
        for s in div.select("span.bibtex-protected"):
            s.unwrap()

        # collapse to one line
        text = " ".join(div.get_text(" ", strip=True).split())
        # typical shape: "Authors. Title. 2025. ..." → split on ". "
        parts = text.split(". ")

        if len(parts) < 2:
            # fallback: keep whole thing as title
            out.append({
                "title": text,
                "year": None,
                "authors": []
            })
            continue

        authors_part = parts[0].strip()
        title_part = parts[1].strip().rstrip(".")

        # find year anywhere in the remaining text
        rest = ". ".join(parts[2:]) if len(parts) > 2 else ""
        m_year = re.search(r"\b(19|20)\d{2}\b", rest)
        year = int(m_year.group(0)) if m_year else None

        # authors often look like: "A, B, C, and D"
        # make it "A, B, C, D"
        authors_norm = authors_part.replace(" and ", ", ")
        authors = [a.strip(" ,") for a in authors_norm.split(",") if a.strip()]

        out.append({
            "title": title_part,
            "year": year,
            "authors": authors
        })

    return out


In [176]:
data = extract_pubs_title_year_authors(html)

In [None]:
print(data)

In [177]:
names = ['David Chiang']

filtered_data = [entry for entry in data if any(name in entry["authors"] for name in names)]

In [None]:
print(filtered_data)

In [178]:
for entry in filtered_data:
    new_row = {
        'authors': ', '.join(entry['authors']) if isinstance(entry['authors'], list) else str(entry['authors']),
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(filtered_data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 66 publications from Natural Language Processing Group
Total publications in dataframe: 1508


### Machine Intelligence and kNowledge Engineering (MINE) lab

In [179]:
LABORATORY = "Machine Intelligence and kNowledge Engineering (MINE) lab"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://sites.nd.edu/xiangliang-zhang/publications/"

In [180]:
response = requests.get(URL)
html = response.text

In [60]:
print(html)

<!doctype html>
<html lang="en-US" class="no-js">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="preconnect" href="https://static.nd.edu/" crossorigin>
  <link rel="preconnect" href="https://emergency.nd.edu/">
  <link rel="preconnect" href="https://ajax.googleapis.com/">
  <link rel="preconnect" href="https://fonts.googleapis.com/">
  <link rel="preconnect" href="https://fonts.gstatic.com/" crossorigin>
  <link rel="profile" href="https://gmpg.org/xfn/11">
  <link rel="shortcut icon" href="https://sites.nd.edu/xiangliang-zhang/wp-content/themes/wp-ndt3/favicon.ico">
  <link rel="apple-touch-icon" sizes="76x76" href="https://static.nd.edu/images/webclips/default/webclip-76.png">
  <link rel="apple-touch-icon" sizes="114x114" href="https://static.nd.edu/images/webclips/default/webclip-114.png">
  <link rel="apple-touch-icon" sizes="120x120" href="https://static.nd.edu/images/webclips/default/webclip-120.png">
  <link r

In [181]:
from bs4 import BeautifulSoup
import re

def extract_publications(html: str):
    soup = BeautifulSoup(html, "html.parser")
    content = soup.find("div", class_="entry-content")
    if not content:
        return []

    pubs = []
    current_year = None

    bad_link_titles = {
        "bib inria pdf",
        "bibtex",
        "pdf",
        "hal",
        "inria",
        "bib",
        "paper",
        "link",
        "Yu and Xiangliang Zhang"
    }

    for el in content.find_all(["p", "h4"], recursive=True):
        text = el.get_text(" ", strip=True)

        # year line?
        if re.fullmatch(r"\d{4}", text):
            current_year = int(text)
            continue

        # numbered entry?
        if not re.match(r"^\s*\d+\.\s+", text):
            continue

        # pick a good link
        good_title = None
        for a in el.find_all("a"):
            label = a.get_text(strip=True)
            if not label:
                continue
            norm = label.lower()
            if norm in bad_link_titles:
                continue
            if len(label) < 4:
                continue
            good_title = label
            break

        # fallback: extract from text
        if not good_title:
            numbered = re.sub(r"^\s*\d+\.\s*", "", text)
            parts = [p.strip() for p in numbered.split(".") if p.strip()]
            if len(parts) >= 2:
                good_title = parts[1]
            else:
                continue

        # authors
        m = re.match(r"^\s*\d+\.\s*(.+?)\.\s+", text)
        authors = m.group(1).strip() if m else ""

        # NEW RULE: skip old stuff
        if current_year is not None and current_year < 2012:
            continue

        pubs.append({
            "title": good_title,
            "authors": authors,
            "year": current_year,
        })

    return pubs


In [182]:
data = extract_publications(html)

In [183]:
# Add Machine Intelligence and kNowledge Engineering (MINE) lab publications to the dataframe
for entry in data:
    new_row = {
        'authors': entry['authors'],
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 257 publications from Machine Intelligence and kNowledge Engineering (MINE) lab
Total publications in dataframe: 1765


In [None]:
print(data)

### The Cooperative Computing Lab

In [184]:
LABORATORY = "The Cooperative Computing Lab"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://ccl.cse.nd.edu/research/papers/"

In [185]:
response = requests.get(URL)
html = response.text

In [79]:
print(html)


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "https://www.w3.org/TR/html4/strict.dtd">

<html>

<head>
<title>CCL Research Publications - Cooperative Computing Lab</title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="shortcut icon" href="https://ccl.cse.nd.edu/images/ndtiny.ico">
<link rel="icon" href="https://ccl.cse.nd.edu/images/ndtiny.ico">
<link rel="stylesheet" type="text/css" href="https://ccl.cse.nd.edu/ccl.css">
<link rel="alternate" type="application/rss+xml" title="RSS" href="https://cclnd.blogspot.com/feeds/posts/default?alt=rss">
</head>

<body>

<table class=subpage>
<tr>
<td colspan=2>
<div class=smallmenu>
<a href="/">CCL</a>&nbsp;|&nbsp;<a href="/software">Software</a>&nbsp;|&nbsp;<a href="/software/downloads">Install</a>&nbsp;|&nbsp;<a href="/software/manuals">Manuals</a>&nbsp;|&nbsp;<a href="/community/forum">Forum</a>&nbsp;|&nbsp;<a href="/research/papers">Pap

In [186]:
from bs4 import BeautifulSoup

def parse_ccl_papers(html: str):
    """
    Parse the CCL 'Research Publications' HTML page and return a list of
    papers, each as {'year': int, 'authors': str, 'title': str}.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    # Each year is an <h3>YYYY</h3>
    for h3 in soup.find_all("h3"):
        # Try to read the year
        year_text = h3.get_text(strip=True)
        if not year_text.isdigit():
            continue
        year = int(year_text)

        # The papers for that year are in the next sibling <ul> (in this page’s HTML)
        ul = h3.find_next_sibling("ul")
        if not ul:
            continue

        # Inside the <ul> they put one <table> per paper
        tables = ul.find_all("table")
        for tbl in tables:
            # Each table has a <p> with "authors,<br><a ...><b>title</b></a>,<br>venue..."
            p = tbl.find("p")
            if not p:
                continue

            # 1) AUTHORS: text up to the first <br>
            #    p.contents often looks like [NavigableString(authors,...), <br>, <a>...</a>, ...]
            authors = None
            for child in p.children:
                if getattr(child, "name", None) == "br":
                    break
                # first text chunk before <br>
                if isinstance(child, str):
                    txt = child.strip()
                    if txt:
                        authors = txt.rstrip(",")  # drop trailing comma
            if not authors:
                # fallback: just take the p text before first line break
                authors = p.get_text(separator="\n").split("\n", 1)[0].rstrip(",")

            # 2) TITLE: bold text inside the anchor
            title_tag = p.find("b")
            if title_tag:
                title = title_tag.get_text(strip=True)
            else:
                # fallback: grab the first <a>
                a = p.find("a")
                title = a.get_text(strip=True) if a else ""

            results.append({
                "year": year,
                "authors": authors,
                "title": title,
            })

    return results


In [187]:
data = parse_ccl_papers(html)

In [None]:
print(data)

In [188]:
people = ['Thain', 'Tovar']
filtered_data = [entry for entry in data if any(name in entry["authors"] for name in people)]
print(filtered_data)

[{'year': 2025, 'authors': 'Liubov Kurafeeva, Alan Subedi, Ryan Hartung, Michael Fay, Avhishek Biswas, Shantenu Jha, Ozgur O. Kilic, Chandra Krintz, Andre Merzky, Douglas Thain, Mehmet C. Vuran, and Rich Wolski', 'title': 'xGFabric: Coupling Sensor Networks and HPC Facilities with Private 5G Wireless Networks for Real-Time Digital Agriculture'}, {'year': 2025, 'authors': 'Md Saiful Islam, Talha Azaz, Raza Ahmad, A D M Shahadat Hossain, Furqan Baig, Shaowen Wang, Kevin Lannon, Tanu Malik, and Douglas Thain', 'title': 'Backpacks for Notebooks: Enabling Containerized Notebook Workflows in Distributed Environments'}, {'year': 2025, 'authors': 'Colin Thomas and Douglas Thain', 'title': 'Liberating the Data Aware Scheduler to Achieve Locality in Layered Scientific Workflow Systems'}, {'year': 2024, 'authors': 'Barry Sly-Delgado, Ben Tovar, Jin Zhou, and Douglas Thain', 'title': 'Reshaping High Energy Physics Applications for Near-Interactive Execution Using TaskVine'}, {'year': 2024, 'author

In [189]:
# Add filtered The Cooperative Computing Lab publications to the dataframe
for entry in filtered_data:
    new_row = {
        'authors': entry['authors'],
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(filtered_data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 185 publications from The Cooperative Computing Lab
Total publications in dataframe: 1950


### Hardware-Software Codesign Lab - FAILED

In [96]:
# Recreate the variables used in previous cells
LABORATORY = "Hardware-Software Codesign Lab"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://codesignlab.nd.edu/Publications/"

In [97]:
response = requests.get(URL)
html = response.text

In [98]:
print(html)


<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"db51011748","applicationID":"9339","transactionName":"Il8IQ0sLDVVVR05QXQ9UE1RNCxMWXVQIXQ==","queueTime":0,"applicationTime":372,"agent":""}</script>
<script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"db51011748",applicationID:"9339",browserID:"1388084"};;/*! For license information please see nr-loader-rum-1.302.0.min.js.LICENSE.txt */
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="preconnect" href="https://static.nd.edu/" crossorigin>
<link rel="preconnect" href="https://emergency.nd.edu/">
<link rel="preconnect" href="https://ajax.googleapis.com/">
<link rel="preconnect" href="https://fonts.googleapis.com/">
<link r

In [101]:
from bs4 import BeautifulSoup
import re

def parse_publications(html: str):
    soup = BeautifulSoup(html, "html.parser")
    results = []

    # find the container where the lists live (optional, but keeps it scoped)
    main = soup.find("div", class_="page-primary") or soup

    # an <h2> whose text is a 4-digit year
    year_headers = [
        h2 for h2 in main.find_all("h2")
        if h2.get_text(strip=True).isdigit() and len(h2.get_text(strip=True)) == 4
    ]

    for h2 in year_headers:
        year = h2.get_text(strip=True)
        ul = h2.find_next_sibling("ul")
        if not ul:
            continue

        for li in ul.find_all("li"):
            # we'll look for the <em> first, since it marks where the venue starts
            em = li.find("em")

            # full text of the <li>
            li_text = li.get_text(" ", strip=True)

            # default values
            authors = None
            title = None

            if em:
                # text up to the <em> tag
                # e.g. "F.-X. Liang, S. Kumar, ... , A physics-based model ..."
                before_em = ""
                # collect everything up to <em>
                for node in li.contents:
                    if node is em:
                        break
                    # node can be NavigableString or Tag
                    before_em += node.get_text(" ", strip=True) if hasattr(node, "get_text") else str(node)

                # now split that string into "authors" and "title"
                # pattern in your HTML is: "authors, title"
                parts = [p.strip() for p in before_em.split(", ")]
                if len(parts) >= 2:
                    # heuristic: authors = all but last; title = last
                    authors = ", ".join(parts[:-1])
                    title = parts[-1]
                else:
                    # fallback: try to peel off quotes
                    title = before_em

            else:
                # no <em> – rare, but try to pull authors/title from the raw li text
                # try to extract the year at the end first
                m = re.search(r",\s*(\d{4})\s*$", li_text)
                if m:
                    li_text = li_text[:m.start()].strip()

                # same heuristic: last comma-separated chunk is title
                parts = [p.strip() for p in li_text.split(",")]
                if len(parts) >= 2:
                    authors = ", ".join(parts[:-1])
                    title = parts[-1]
                else:
                    title = li_text

            results.append(
                {
                    "title": title,
                    "authors": authors,
                    "year": int(year),
                }
            )

    return results


In [102]:
data = parse_ccl_papers(html)

In [None]:
print(data)

### Notre Dame Intelligent Microsystems Lab


In [190]:
LABORATORY = "Notre Dame Intelligent Microsystems Lab"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://siddharth-joshi.com/#research"

In [191]:
response = requests.get(URL)
html = response.text

In [105]:
print(html)

<!DOCTYPE html>
<html lang="en">
    <head>
    <!-- Global site tag (gtag.js) - Google Analytics -->
	<script async src="https://www.googletagmanager.com/gtag/js?id=UA-75997123-2"></script>
	<script>
	  window.dataLayer = window.dataLayer || [];
	  function gtag(){dataLayer.push(arguments);}
	  gtag('js', new Date());

	  gtag('config', 'UA-75997123-2');
	</script>


        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="description" content="">
        <meta name="author" content="">

        <title>Siddharth Joshi</title>

		<!-- Bootstrap core CSS -->
		<link href="styles/css/c3.css" rel="stylesheet">
		<link href="styles/css/labsite.css" rel="stylesheet">
		<!--
         <link href="https://maxcdn.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet">
         -->
		<link rel="stylesheet" href="https://use.fontawesome.com/r

In [192]:
from bs4 import BeautifulSoup

def extract_papers(html):
    """
    Parse the given HTML and return a list of dicts:
    [
      {
        "title": <str>,
        "year": <int>,
        "authors": []   # authors are not present in the HTML, so we return empty
      },
      ...
    ]
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    # Find the publications container
    pubs_container = soup.find(id="pubList")
    if not pubs_container:
        return results  # nothing to do

    # Strategy:
    # In this HTML, each year looks like:
    #   <h4 id="2019" class="2019">2019</h4>
    #   <ol id="pubList2019" class="2019" start="1"> ... <li>paper</li> ... </ol>
    #
    # So we can iterate over all h4s inside #pubList and pair each with the next <ol>.
    for year_header in pubs_container.find_all("h4"):
        # get the year text (strip to be safe)
        year_text = year_header.get_text(strip=True)
        try:
            year = int(year_text)
        except ValueError:
            # skip non-year headers
            continue

        # the ol with the papers is the next sibling that's an <ol>
        ol = year_header.find_next_sibling("ol")
        if not ol:
            continue

        # each li is a paper title (sometimes with children like <a> or <svg>)
        for li in ol.find_all("li"):
            # get the visible text for the title
            title = li.get_text(" ", strip=True)
            if not title:
                continue

            results.append({
                "title": title,
                "year": year,
                # authors not present in the HTML snippet
                "authors": []
            })

    return results


In [193]:
data = extract_papers(html)

In [None]:
print(data)

In [194]:
# Add Joshi as an author for each publication
for entry in data:
    entry["authors"] = ["Joshi"]

In [195]:
for entry in data:
    new_row = {
        'authors': ', '.join(entry['authors']) if isinstance(entry['authors'], list) else str(entry['authors']),
        'lab': LABORATORY,
        'title': entry['title'],
        'year': entry['year'],
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 37 publications from Notre Dame Intelligent Microsystems Lab
Total publications in dataframe: 1987


### Sustainable Computing Lab - FAILED

In [112]:
LABORATORY = "Sustainable Computing Lab"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://scl-nd.github.io/index.html#/publications"

In [113]:
response = requests.get(URL)
html = response.text

In [114]:
print(html)

<!doctype html>
<html lang="en">

<head>
  <link rel="preconnect" href="https://fonts.googleapis.com">
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
  <link href="https://fonts.googleapis.com/css2?family=Bitcount+Single:wght@100..900&display=swap" rel="stylesheet">
  <meta charset="UTF-8" />
  <link rel="icon" type="images/svg+xml" href="./scl-pins.svg" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Sustainable Computing Lab</title>
  <script type="module" crossorigin src="./assets/index-DBbmyJy4.js"></script>
  <link rel="stylesheet" crossorigin href="./assets/index-fRNSXH1R.css">
</head>

<body>
  <div id="root"></div>
</body>

</html>


### Wireless Institution - FAILED

In [115]:
LABORATORY = "Wireless Institution"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://wireless.nd.edu/publications/"

In [116]:
response = requests.get(URL)
html = response.text

In [117]:
print(html)

<!doctype html>
<html lang="en-US">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="profile" href="https://gmpg.org/xfn/11">
    <meta name='robots' content='noindex, follow' />
	<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
	
	<!-- This site is optimized with the Yoast SEO plugin v25.2 - https://yoast.com/wordpress/plugins/seo/ -->
	<title>Publications Archive - ND Wireless Institute</title>
	<meta property="og:locale" content="en_US" />
	<meta property="og:type" content="website" />
	<meta property="og:title" content="Publications Archive - ND Wireless Institute" />
	<meta property="og:url" content="https://wireless.nd.edu/publications/" />
	<meta property="og:site_name" content="ND Wireless Institute" />
	<meta name="twitter:card" content="summary_large_image" />
	<meta name="twitter:site" content="@ndwireless" />
	<script type="application/ld+json" class

In [121]:
from bs4 import BeautifulSoup
import re

def extract_publications(html: str):
    """
    Parse the ND Wireless publications HTML and return
    [{title, authors, year}, ...].
    This version is robust to layout changes.
    """
    soup = BeautifulSoup(html, "html.parser")
    pubs = []

    # grab every td that looks like a publication entry
    pub_tds = soup.select("td.publications, td[id^='post-']")
    for td in pub_tds:
        # 1) title
        title_tag = td.find("span", class_="pub-title")
        title = title_tag.get_text(strip=True) if title_tag else None

        # 2) year: last 4 digits in this cell's text
        whole_text = td.get_text(" ", strip=True)
        m = re.search(r"(\d{4})\s*$", whole_text)
        year = m.group(1) if m else None

        # 3) authors: everything before the <a> that holds the title
        authors = None
        a_with_title = title_tag.find_parent("a") if title_tag else None
        parts = []
        for child in td.children:
            if child == a_with_title:
                break
            if isinstance(child, str):
                parts.append(child)
            else:
                parts.append(child.get_text(" ", strip=True))
        if parts:
            authors = " ".join(p.strip() for p in parts if p.strip()).rstrip(", ")

        # only add if we found a title
        if title:
            pubs.append(
                {
                    "title": title,
                    "authors": authors,
                    "year": year,
                }
            )

    return pubs



In [122]:
data = extract_publications(html)

In [None]:
print(data)

### Human Computer Interaction - FAILED

In [123]:
# Recreate variables for Wireless Institution
LABORATORY = "Wireless Institution"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://hci.nd.edu/publications/"

In [126]:
response = requests.get(URL)
html = response.text

In [127]:
print(html)


<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<script type="text/javascript">window.NREUM||(NREUM={});NREUM.info={"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"db51011748","applicationID":"9339","transactionName":"Il8IQ0sLDVVVR05QXQ9UE1RNCxMWXVQIXQ==","queueTime":0,"applicationTime":259,"agent":""}</script>
<script type="text/javascript">(window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"db51011748",applicationID:"9339",browserID:"1388084"};;/*! For license information please see nr-loader-rum-1.302.0.min.js.LICENSE.txt */
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<link rel="preconnect" href="https://static.nd.edu/" crossorigin>
<link rel="preconnect" href="https://emergency.nd.edu/">
<link rel="preconnect" href="https://ajax.googleapis.com/">
<link rel="preconnect" href="https://fonts.googleapis.com/">
<link r

In [128]:
with open('hci_publications.html', 'w', encoding='utf-8') as f:
    f.write(html)
print("HTML file 'hci_publications.html' has been created")

HTML file 'hci_publications.html' has been created


In [131]:
import re
from typing import List, Dict

def extract_papers(text: str) -> List[Dict[str, str]]:
    """
    Extract papers from a block of text.
    
    Tries to find patterns like:
        "Smith, J., & Doe, A. (2019). Title of the paper. Journal ..."
        "Nguyen, T. 2021. Another title..."
    
    Returns a list of dicts with keys: 'title', 'authors', 'year', 'raw'
    """
    papers = []

    # split text into candidate reference lines/paragraphs
    # (references are often on separate lines)
    candidates = re.split(r"\n{1,}|\r{1,}", text)
    
    # regex patterns
    # 1) APA-ish: Authors (Year). Title.
    apa_like = re.compile(
        r"""
        ^\s*
        (?P<authors>.+?)               # authors at start
        \s*[\(\[](?P<year>\d{4})[\)\]] # (2021) or [2021]
        \s*\.?\s*
        (?P<title>[^.]+)               # title until the next period
        """,
        re.VERBOSE
    )
    
    # 2) Author Year. Title.  (no parentheses)
    author_year = re.compile(
        r"""
        ^\s*
        (?P<authors>.+?)               # authors at start
        \s+(?P<year>19\d{2}|20\d{2})   # 4-digit year
        \.?\s+
        (?P<title>[^.]+)
        """,
        re.VERBOSE
    )

    for cand in candidates:
        cand_clean = cand.strip()
        if not cand_clean:
            continue

        m = apa_like.match(cand_clean)
        if not m:
            m = author_year.match(cand_clean)

        if m:
            authors = m.group("authors").strip()
            year = m.group("year").strip()
            title = m.group("title").strip()

            papers.append(
                {
                    "authors": authors,
                    "year": year,
                    "title": title,
                    "raw": cand_clean,
                }
            )

    return papers


In [132]:
data = extract_papers(html)

In [None]:
print(data)

### SANDWICH LAB

In [196]:
LABORATORY = "SANDWICH LAB"
DEPARTMENT_NAME = "Computer Science and Engineering"
URL = "https://toby.li/publications/"

In [197]:
response = requests.get(URL)
html = response.text

In [198]:
print(html)

<!doctype html><html lang="en" class="no-js"><head><meta charset="utf-8"> <!-- begin SEO --><title>Publications - Toby Jia-Jun Li</title><meta property="og:locale" content="en-US"><meta property="og:site_name" content="Toby Jia-Jun Li"><meta property="og:title" content="Publications"><link rel="canonical" href="https://toby.li/publications/"><meta property="og:url" content="https://toby.li/publications/"> <script type="application/ld+json"> { "@context" : "http://schema.org", "@type" : "Person", "name" : "Toby Jia-Jun Li", "url" : "https://toby.li", "sameAs" : null } </script> <!-- end SEO --><link href="https://toby.li/feed.xml" type="application/atom+xml" rel="alternate" title="Toby Jia-Jun Li Feed"> <!-- http://t.co/dKP3o1e --><meta name="HandheldFriendly" content="True"><meta name="MobileOptimized" content="320"><meta name="viewport" content="width=device-width, initial-scale=1.0"> <script> document.documentElement.className = document.documentElement.className.replace(/\bno-js\b/g

In [199]:
import re
import json
from bs4 import BeautifulSoup

# -------- YEAR HELPERS --------

def _year_from_left_column(left_col):
    """Grab year from the left column label like 'CHI 2025' or 'OpenSym / WikiSym 2014'."""
    if not left_col:
        return None
    text = left_col.get_text(" ", strip=True)
    # look for 4-digit 19xx or 20xx near the end
    m = re.search(r'(?:19|20)\d{2}(?!.*(?:19|20)\d{2})', text)
    if m:
        return m.group(0)
    # sometimes you could have shorthand like ’25 here, but in your HTML left col is clean
    return None


def _year_from_four_digits(text):
    """Return the FIRST 4-digit year we find, not the last."""
    if not text:
        return None
    m = re.search(r'(?:19|20)\d{2}', text)
    if m:
        return m.group(0)
    return None


def _year_from_shorthand(text):
    """
    Catch things like:
      CHI EA ’23
      CHI EA '23
    and convert to 2023 (assume 2000–2039 window).
    """
    if not text:
        return None
    m = re.search(r"[’'](\d{2})\b", text)
    if not m:
        return None
    yy = int(m.group(1))
    # assume 2000-2039 is safe for this page
    if 0 <= yy <= 39:
        return f"20{yy:02d}"
    return None


def _best_year(left_col, right_text):
    """
    Priority:
      1. left column explicit year
      2. 4-digit year in right column
      3. shorthand year in right column (’23 → 2023)
    """
    y = _year_from_left_column(left_col)
    if y:
        return y

    y = _year_from_four_digits(right_text)
    if y:
        return y

    y = _year_from_shorthand(right_text)
    if y:
        return y

    return None


# -------- MAIN PARSER --------

def extract_publications(html: str):
    soup = BeautifulSoup(html, "html.parser")

    articles = soup.select("div.list__item article.archive__item")
    results = []

    for art in articles:
        left_col = art.select_one(".publication_column.left")

        # some entries (book sections, patents, workshops) don't have left/right split
        right_col = art.select_one(".publication_column.right")
        if right_col is None:
            right_col = art.select_one(".publication_column")
        if right_col is None:
            continue

        right_text = right_col.get_text(separator="\n").strip()
        lines = [ln.strip() for ln in right_text.split("\n") if ln.strip()]

        if not lines:
            continue

        # 1) TITLE
        title = lines[0]

        # 2) AUTHORS — everything after title until we hit venue-ish line
        authors = None
        for ln in lines[1:]:
            if re.search(
                r"(Proceedings|Chapter of|Extended Abstracts|The Network and Distributed System Security Symposium|U\.S\. Patent|Symposium|Workshop|Conference|ACM on Human-Computer Interaction)",
                ln,
                re.IGNORECASE,
            ):
                break
            if authors is None:
                authors = ln
            else:
                authors += " " + ln

        # 3) YEAR — improved
        year = _best_year(left_col, right_text)

        results.append(
            {
                "title": title,
                "authors": authors,
                "year": year,
            }
        )

    return results


In [200]:
data = extract_publications(html)

In [None]:
print(data)

In [201]:
# Add SANDWICH LAB publications to the dataframe
for entry in data:
    new_row = {
        'authors': entry['authors'],
        'lab': LABORATORY,
        'title': entry['title'],
        'year': int(entry['year']) if entry['year'] and entry['year'].isdigit() else None,
        'department': DEPARTMENT_NAME
    }
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

print(f"Added {len(data)} publications from {LABORATORY}")
print(f"Total publications in dataframe: {len(df)}")

Added 68 publications from SANDWICH LAB
Total publications in dataframe: 2055


In [202]:
# Save the dataframe to a CSV file
df.to_csv('computer_science_publications.csv', index=False)
print(f"Saved {len(df)} publications to 'computer_science_publications.csv'")

# Also save as JSON for better data preservation
df.to_json('computer_science_publications.json', orient='records', indent=2)
print(f"Saved {len(df)} publications to 'computer_science_publications.json'")

# Display summary statistics
print(f"\nSummary:")
print(f"Total publications: {len(df)}")
print(f"Number of labs: {df['lab'].nunique()}")
print(f"Year range: {df['year'].min()} - {df['year'].max()}")
print(f"\nPublications by lab:")
print(df['lab'].value_counts())

Saved 2055 publications to 'computer_science_publications.csv'
Saved 2055 publications to 'computer_science_publications.json'

Summary:
Total publications: 2055
Number of labs: 8
Year range: 1936 - 2026

Publications by lab:
lab
Data Interference, Analytics, and Learning Laboratory        941
Notre Dame Computer Vision Laboratory                        373
Machine Intelligence and kNowledge Engineering (MINE) lab    257
The Cooperative Computing Lab                                185
Prof. Weninger research team                                 128
SANDWICH LAB                                                  68
Natural Language Processing Group                             66
Notre Dame Intelligent Microsystems Lab                       37
Name: count, dtype: int64
