In [1]:
# One rendered bibliography item: plain reference text with inline element
# markup around each field. The literal is split into adjacent pieces purely
# for readability; Python joins them into a single string.
bib = (
    "(<citation-number>6</citation-number>) "
    "<author><family>Gabbay</family>, <given>D. M.</given></author>. "
    "<title>Basic Notions</title>. "
    "In <container-title>Investigations in Modal & Tense Logics with Applications "
    "to Problems in Philosophy and Linguistics</container-title>; "
    "<publisher>Springer Netherlands</publisher>, "
    "<issued><year>1976</year></issued>; "
    "pp <page>27\u201335</page>.\n"
)
# Wrap the item in a single <bib> element so the whole reference has one parent.
bib = "<bib>" + bib + "</bib>"
bib

'<bib>(<citation-number>6</citation-number>) <author><family>Gabbay</family>, <given>D. M.</given></author>. <title>Basic Notions</title>. In <container-title>Investigations in Modal & Tense Logics with Applications to Problems in Philosophy and Linguistics</container-title>; <publisher>Springer Netherlands</publisher>, <issued><year>1976</year></issued>; pp <page>27–35</page>.\n</bib>'

In [2]:
import lxml
from lxml import etree


In [3]:
# Parse with lxml's HTML parser rather than the default XML one.
# NOTE(review): presumably chosen because the sample contains a bare "&",
# which is not well-formed XML - confirm.
parser = etree.HTMLParser()
root = etree.fromstring(bib, parser=parser)

In [4]:
# Concatenate every text chunk of the parsed tree: the reference with all markup stripped.
"".join(root.itertext())

'(6) Gabbay, D. M.. Basic Notions. In Investigations in Modal & Tense Logics with Applications to Problems in Philosophy and Linguistics; Springer Netherlands, 1976; pp 27–35.\n'

In [5]:
def textAndElement(node):
    """
    Walk ``node`` depth-first and yield ``(event, payload)`` pairs in document order.

    Events:
      * ``("start", element)`` - an element is entered
      * ``("text", string)``   - a non-empty ``.text`` or ``.tail`` string
      * ``("end", element)``   - an element is left

    The tail of an element is yielded right after that element's "end" event;
    this includes the tail of ``node`` itself. Empty or missing text/tail
    values produce no event.

    Idea: https://stackoverflow.com/questions/24071072/iterate-over-both-text-and-elements-in-lxml-etree
    """
    yield "start", node

    # text that sits between the opening tag and the first child
    if node.text:
        yield "text", node.text

    # recurse into the children, re-emitting their events unchanged
    for child in node:
        for item in textAndElement(child):
            yield item

    yield "end", node

    # text that follows the closing tag (it belongs to the parent's content)
    if node.tail:
        yield "text", node.tail


### Let's try to convert a bibliography to spaCy displaCy annotations

In [6]:
def annotations(bib):
    """
    Yield ``(tag, start, end)`` character spans for the annotated elements of ``bib``.

    ``bib`` is an already parsed element (not a string), e.g.::

        bib = '<bib>(<citation-number>6</citation-number>) <author>...</author>. ...</bib>'
        root = etree.fromstring(bib)
        spans = list(annotations(root))

    ``start``/``end`` are offsets into the concatenation of all "text" events
    produced by ``textAndElement(bib)``; ``end`` is exclusive. Only elements
    whose tag is listed in the module-level ``ner_tags`` are reported. A span
    is emitted when its element closes, so nested elements are yielded before
    the element that contains them.

    Raises:
        AssertionError: if start/end events do not pair up, or - once the
            generator has been fully consumed - if the collected text differs
            from the text content of ``bib``.
    """
    chunks = []
    offset = 0           # running length of the text seen so far
    open_elements = []   # stack of (tag, start offset) for elements not yet closed
    for event, t in textAndElement(bib):
        if event == "text":
            chunks.append(t)
            offset += len(t)
        elif event == "start":
            open_elements.append((t.tag, offset))
        elif event == "end":
            tag, start = open_elements.pop()
            assert tag == t.tag
            if tag in ner_tags:
                yield (tag, start, offset)

    # Sanity check against the element that was actually passed in (not a
    # global): the offsets are only valid if they index the same text that
    # itertext() produces. textAndElement also emits the tail of `bib` itself,
    # which itertext() leaves out, hence the explicit tail term.
    assert "".join(chunks) == "".join(bib.itertext()) + (bib.tail or "")


### For Bib Item

In [7]:
from spacy import displacy

# Element tags that are turned into entity spans; every other element is skipped.
ner_tags = [
    "citation-number",
    "author",
    "title",
    "container-title",
    "publisher",
    "issued",
    "volume",
    "page"
]


root = etree.fromstring(bib, parser)

# Pre-computed entities in the dict format displaCy accepts with manual=True:
# the plain text plus one {"label", "start", "end"} record per span.
ents = [{"label": tag, "start": start, "end": end} for tag, start, end in annotations(root)]
doc = {"text": "".join(root.itertext()), "ents": ents}

displacy.render(doc, style="ent", manual=True)

### For Bibliography 

In [9]:
import json

# NOTE(review): the file name looks misspelled ("biblioraphy"); it is left
# unchanged because it has to match the file on disk - confirm.
with open("rendered_biblioraphy.json", encoding="utf-8") as f:
    # load the list of bibliographies - it contains the same bibliography rendered in different styles
    bibliographies = json.load(f)


for bibliography in bibliographies:
    # at the moment only the references section is needed (parsing citations is another task)
    if "references" not in bibliography:
        # report and skip this entry: indexing "references" below would raise KeyError
        print("Oops: a problem for style ", bibliography.get("style"))
        continue

    # list of strings. Each string is a rendered bib item with the citation number (if supported by a CSL style)
    references = bibliography["references"]

    if not references:
        continue

    # wrap every item in its own <bib> element under a single <references> parent
    xml = f"<references><bib>{'</bib><bib>'.join(references)}</bib></references>"

    root = etree.fromstring(xml, parser)

    ents = [{"label": tag, "start": start, "end": end} for tag, start, end in annotations(root)]
    displacy.render({"text": "".join(root.itertext()), "ents": ents},
                    style="ent", manual=True)
    
        
    
        

