In [1]:
# Install the notebook's dependencies, but only when the IPython shell identifies
# itself as Google Colab. In any other environment this cell installs nothing.
# NOTE(review): plotly and kaleido are not version-pinned, unlike the other packages.
if "google.colab" in str(get_ipython()):
    %pip install --quiet polars[numpy,pandas,pyarrow]==1.7.1 requests==2.31.0 lxml==4.9.3 cssselect==1.2.0 plotly kaleido

In [2]:
import requests
from lxml import etree
import polars as pl
from typing import Iterator
from bisect import bisect_right


def etree_elements_to_dicts(elements: Iterator[etree._Element]) -> Iterator[dict]:
    """
    Lazily convert lxml elements into plain dictionaries.

    Each yielded dictionary holds the element's tag under "name", the
    concatenated text of the element and all of its descendants under "text",
    and the element's ``sourceline`` under "sourceline", followed by one entry
    per XML attribute. An attribute named like one of these three keys
    replaces the computed value.

    Args:
        elements: The elements to convert.

    Yields:
        dict: One dictionary per element, in input order.
    """
    for node in elements:
        record = dict(
            name=node.tag,
            text="".join(node.itertext()),
            sourceline=node.sourceline,
        )
        # Attributes are merged last, so they win on key collisions
        record.update(node.attrib)
        yield record


# Download the XML document that the rest of the notebook queries
url = "https://raw.githubusercontent.com/temporal-communities/carmen-nova/main/carmen_nova.xml"
# The timeout keeps the cell from hanging indefinitely on a stalled connection
res = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of parsing an error page as XML further down
res.raise_for_status()

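The document is parsed using [lxml](https://lxml.de/). It can be queried with either CSS selectors or XPath expressions; choose one with `query_type` in the "Run query" cell below.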

## CSS

Use [CSS selector syntax](https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors) for querying.

For example, to get all `<persName>` and `<author>` elements within `<text>`, use:

```
text persName, text author
```

## XPath

Use [XPath query syntax](<https://en.wikipedia.org/wiki/XPath#Syntax_and_semantics_(XPath_1.0)>) for querying.

For example, to get all `<persName>` and `<author>` elements within `<text>`, use:

```
//text//persName | //text//author
```

- [CSS → XPath syntax cheat sheet](https://devhints.io/xpath)


In [3]:
class PageBreakLookup:
    """
    Map source line numbers of the parsed XML document to page numbers.

    The lookup is built once from the document's <pb> elements. A line belongs
    to the page of the last <pb> whose source line is less than or equal to it.
    """

    def __init__(self, dom):
        """
        Initialise the PageBreakLookup with a DOM object.

        Args:
            dom: The DOM object containing <pb> elements. Each <pb> must have
                an ``n`` attribute that can be converted with ``int()``.

        Raises:
            KeyError: If a <pb> element has no ``n`` attribute.
            ValueError: If an ``n`` attribute is not an integer.
        """
        # List of (page number, source line) tuples
        self.page_breaks = [
            (int(pb["n"]), pb["sourceline"])
            for pb in etree_elements_to_dicts(dom.xpath("//pb"))
        ]
        self.page_breaks.sort(key=lambda x: x[1])  # Ensure sorted by sourceline
        # Source lines only, extracted once so that every lookup is a plain
        # binary search instead of rebuilding this list per call.
        # It is a snapshot: later changes to page_breaks are not reflected.
        self._source_lines = [source_line for _, source_line in self.page_breaks]

    def from_source_line(self, line: int) -> int:
        """
        Get the page number for a given source code line.

        Args:
            line (int): The source code line number.

        Returns:
            int: The page number corresponding to the source code line.

        If no page break is found before the given line, returns -1.
        """
        # Index of the last page break at or before `line` (-1 if there is none)
        idx = bisect_right(self._source_lines, line) - 1
        if idx >= 0:
            return self.page_breaks[idx][0]
        return -1

In [None]:
# @title Run query
query_type = "css"  # @param ["css", "xpath"]
query = "text persName, text author"  # @param ["text persName, text author", "//text//persName | //text//author"] {allow-input: true}

# Rename the first default-namespace declaration to an ordinary attribute so
# that elements can be queried by their bare tag names
xml_without_namespaces = res.text.replace(
    " xmlns=", " xmlnamespace=", 1
)  # Remove namespace to make XPath more convenient
dom = etree.XML(xml_without_namespaces.encode())

# Initialise page breaks lookup
page_breaks = PageBreakLookup(dom)

# Run the query; `data` is the list of matching elements
if query_type == "css":
    data = dom.cssselect(query)
elif query_type == "xpath":
    data = dom.xpath(query, namespaces=dom.nsmap)
else:
    raise ValueError(f"Unknown query type: {query_type}")

if len(data) == 0:
    raise ValueError("No results")

# One row per matching element, with normalised text and the page it occurs on
df = pl.DataFrame(etree_elements_to_dicts(data)).with_columns(
    # replace_all collapses every whitespace run; str.replace would only
    # touch the first one and leave line breaks inside longer texts
    text=pl.col("text").str.replace_all(r"\s+", " "),
    page=pl.col("sourceline").map_elements(
        page_breaks.from_source_line, return_dtype=pl.Int64
    ),
)
df.to_pandas()

In [None]:
# Create summary table: Number of occurrences by reference. Use the mode of the text value.
summary = (
    df.group_by("ref")
    .agg(
        pl.len(),
        # mode() can return several equally frequent values in no guaranteed
        # order; sorting them first makes the chosen text reproducible
        pl.col("text").mode().sort().first(),
    )
    .sort("len", "text", descending=True)
)
summary.to_pandas()

In [None]:
# Create summary table: Number of occurrences by text value (identical strings
# are counted together), most frequent first.
summary = (
    df.group_by("text")
    .agg(pl.len())
    .sort("len", "text", descending=True)
)
summary.to_pandas()

## Adding Wikidata labels

Because the text content of the same reference may take many forms, we may wish to fetch a more canonical label from Wikidata.

The following `get_labels` function implements this using the Wikidata API.
It accepts a Polars series as input and outputs a Polars series.
It is later applied using the `map_batches` method.


In [7]:
# Function to get labels from Wikidata
def get_labels(qids: pl.Series) -> pl.Series:
    """
    Look up a label for each Wikidata Q-ID using the Wikidata API.

    The German label is preferred; the English label is the fallback.

    Args:
        qids: Series of Q-ID strings such as "Q42". May contain nulls,
            duplicates and values that are not Q-IDs.

    Returns:
        A String series with one label per input value, in input order. The
        entry is null where the input is null or not a Q-ID, or where the
        entity has neither a German nor an English label.

    Raises:
        requests.RequestException: On a network failure, a timeout or an HTTP
            error status.
        RuntimeError: If the API reports an error in its JSON response.
    """
    url = "https://www.wikidata.org/w/api.php"
    max_items = 50  # Only request max_items at once
    timeout = 30  # Seconds to wait for the API before giving up

    # Request every distinct Q-ID only once, and skip values that are not
    # Q-IDs ("Q" followed by digits) so that a stray reference cannot make
    # the request for a whole chunk fail.
    unique_qids = [
        qid
        for qid in qids.drop_nulls().unique(maintain_order=True).to_list()
        if isinstance(qid, str)
        and qid[:1] == "Q"
        and qid[1:].isascii()
        and qid[1:].isdigit()
    ]

    # Initialize an empty dictionary to store the results
    entities_dict = {}

    # Loop through the Q-IDs in chunks of max_items
    for start in range(0, len(unique_qids), max_items):
        chunk = unique_qids[start : start + max_items]
        params = {
            "action": "wbgetentities",
            "format": "json",
            "props": "labels",
            "languages": "de|en",
            "ids": "|".join(chunk),
        }

        # Make the API request
        res = requests.get(url, params=params, timeout=timeout)
        res.raise_for_status()
        data = res.json()

        # The API can report failures in the response body
        if "error" in data:
            raise RuntimeError(f"Wikidata API error: {data['error']}")

        # Update the dictionary with the new data
        entities_dict.update(data.get("entities", {}))

    # Collect the labels in the original order (one entry per input value)
    entities_list = []
    for qid in qids:
        labels = entities_dict.get(qid, {}).get("labels", {})
        # Get German label, fallback to English label
        label = labels.get("de", {}).get("value")
        if label is None:
            label = labels.get("en", {}).get("value")

        entities_list.append(label)

    # Explicit dtype keeps the result a String series even if no label was found
    return pl.Series(values=entities_list, dtype=pl.String)

In [None]:
# Add a "wikidata_label" column with the Wikidata label of each reference
df = df.with_columns(
    wikidata_label=pl.col("ref")
    # literal=True: the prefix is plain text, not a regular expression
    # (otherwise every "." in the URL would match any character)
    .str.replace(
        "http://www.wikidata.org/entity/", "", literal=True
    )  # Remove URL, Q-ID remains
    .map_batches(function=get_labels)  # Apply get_labels
)
df.to_pandas()

## Plotting TEI elements and their pages


In [9]:
# Prepare the data for the tag-occurrence histogram
import plotly.express as px

# <head> elements without type="sub", minus rows whose text is exactly "ANHANG".
# NOTE: despite the variable name these are <head> elements, not <pb> elements.
pb_elements = pl.DataFrame(
    etree_elements_to_dicts(dom.cssselect("head:not([type='sub'])"))
).filter(pl.col("text").ne("ANHANG"))

# All persName, quote, sic, rs and placeName elements inside <text>, each with
# the page it occurs on.
# NOTE: despite the variable name this is not limited to persons.
persons = pl.DataFrame(
    etree_elements_to_dicts(
        dom.cssselect("text persName, text quote, text sic, text rs, text placeName")
    )
).with_columns(
    pl.col("sourceline")
    .map_elements(page_breaks.from_source_line, return_dtype=pl.Int64)
    .alias("page")
)

# One fixed colour per tag, picked by index from Plotly's qualitative palette
tag_colours = {
    tag: px.colors.qualitative.Plotly[palette_index]
    for tag, palette_index in [
        ("persName", 9),
        ("quote", 0),
        ("sic", 1),
        ("rs", 3),
        ("placeName", 2),
    ]
}

In [None]:
# Histogram of element occurrences over page numbers, coloured by tag
fig = px.histogram(
    persons,
    x="page",
    color="name",
    nbins=100,
    labels={"name": "Element", "page": "Page", "count": "Count"},
    title="Occurrences of various TEI tags",
    template="plotly_dark",
    color_discrete_map=tag_colours,
)

# Add a labelled vertical line for every heading in pb_elements, positioned at
# the page the heading occurs on (shifted left by half a page)
for heading in pb_elements.iter_rows(named=True):
    heading_page = page_breaks.from_source_line(heading["sourceline"])
    fig.add_vline(
        x=heading_page - 0.5,
        line_width=1,
        label={"text": heading["text"], "yanchor": "top"},
    )

fig.show()

In [11]:
# Export the figure as a PNG image
width, height = 1920, 1080
vh = height / 100  # One percent of the image height; unit for the font size

# Restyle the figure for the export: font size relative to the image height,
# black paper and plot backgrounds
fig.update_layout(
    font={"size": 3.2 * vh},
    paper_bgcolor="black",
    plot_bgcolor="black",
)

fig.write_image("tag-occurrences.png", width=width, height=height, scale=1)