# Overview

Data exploration to check our assumptions about the annotated text samples used for testing.

# Setup

## Imports

In [1]:
import os

In [2]:
from collections import Counter

In [3]:
from pathlib import Path

In [4]:
import re

In [5]:
import xml.etree.ElementTree as ET

In [6]:
from xml.sax import make_parser, parse

In [7]:
import label_alignment.sax2spans as sax2spans

In [8]:
import importlib

In [9]:
importlib.reload(sax2spans)

<module 'label_alignment.sax2spans' from '/Users/dcf/development/python/utils/label-alignment/src/label_alignment/sax2spans.py'>

In [10]:
import tokenizers

In [11]:
tokenizers.Offsets

typing.Tuple[int, int]

In [12]:
from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

In [13]:
from label_alignment.simple_tokenizers import PretokenizerWrapper, ws_tokenizer, wss_tokenizer

In [14]:
from label_alignment.alignment import align_tokens_and_annotations_bilou as ata

In [15]:
from label_alignment.tok2spans import iob2spans

# Load Data

In [16]:
project_root = Path('..')

In [17]:
project_root.is_dir()

True

In [18]:
project_root.resolve()

PosixPath('/Users/dcf/development/python/utils/label-alignment')

In [19]:
test_data = project_root / 'tests' / 'data'

In [20]:
list(test_data.glob('*'))

[PosixPath('../tests/data/annotated_texts')]

In [21]:
anno_dir = test_data / 'annotated_texts'

In [22]:
list(anno_dir.glob('*.txt'))

[PosixPath('../tests/data/annotated_texts/verne_20000_leagues.ch5.txt')]

In [23]:
list(anno_dir.glob('*.xml'))

[PosixPath('../tests/data/annotated_texts/verne_20000_leagues.ch5.xml')]

In [24]:
verne = 'verne_20000_leagues.ch5'

In [25]:
# Read the raw chapter text.  The file contains non-ASCII punctuation (e.g. ’),
# so decode it explicitly as UTF-8 instead of relying on the platform default.
with open(anno_dir / (verne + '.txt'), encoding='utf-8') as vin:
    vtext = vin.read()
    

In [26]:
vtext

'The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed\nthe wonderful dexterity of Ned Land, and proved what confidence we might place in him.\n\nThe 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But\none of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in\nchasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board\nthe Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one\nstraight to the heart, and catching the other after some minutes’ pursuit.\n\nDecidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.\n\nThe frigate skirted the south-east coast of America with great rapidity. The 3rd of July we

In [27]:
re.search('  ', vtext)

In [28]:
re.sub(r'\n{2}', 'FOO', vtext)

'The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed\nthe wonderful dexterity of Ned Land, and proved what confidence we might place in him.FOOThe 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But\none of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in\nchasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board\nthe Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one\nstraight to the heart, and catching the other after some minutes’ pursuit.FOODecidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.FOOThe frigate skirted the south-east coast of America with great rapidity. The 3rd of July we we

In [29]:
vx = ET.parse(anno_dir / (verne + '.xml'))

# Data Exploration

## Via ElementTree

In [30]:
root = vx.getroot()

In [31]:
root.tag

'doc'

In [32]:
list(root)

[<Element 'p' at 0x10b30be20>,
 <Element 'p' at 0x10b3204f0>,
 <Element 'p' at 0x10b320810>,
 <Element 'p' at 0x10b3208b0>]

In [33]:
root[0]

<Element 'p' at 0x10b30be20>

In [34]:
root.findall('p')

[<Element 'p' at 0x10b30be20>,
 <Element 'p' at 0x10b3204f0>,
 <Element 'p' at 0x10b320810>,
 <Element 'p' at 0x10b3208b0>]

In [35]:
p1 = root.find('p')

In [36]:
list(p1)

[<Element 'vessel' at 0x10b320400>, <Element 'person' at 0x10b3204a0>]

In [37]:
p1.text

'The voyage of the '

In [38]:
list(p1.iter())

[<Element 'p' at 0x10b30be20>,
 <Element 'vessel' at 0x10b320400>,
 <Element 'person' at 0x10b3204a0>]

In [39]:
[(x.text, x.tail) for x in p1.iter()]

[('The voyage of the ', '\n\n'),
 ('Abraham Lincoln',
  ' was for a long time marked by no special incident. But one circumstance happened which showed the wonderful dexterity of '),
 ('Ned Land', ', and proved what confidence we might place in him.')]

## Via Sax

In [40]:
sparser = make_parser()

In [41]:
rep = sax2spans.Reporter()

In [42]:
rep

<label_alignment.sax2spans.Reporter at 0x10b330fe0>

In [43]:
rep.locator

In [44]:
rep._locator is rep.locator

True

In [45]:
parse(anno_dir / (verne + '.xml'), rep)

<xml.sax.expatreader.ExpatLocator object at 0x10b2bc3b0>
starting doc
1 characters
starting p
18 characters
starting vessel
15 characters
ending vessel
122 characters
starting person
8 characters
ending person
51 characters
ending p
1 characters
1 characters
starting p
4 characters
starting date
13 characters
ending date
24 characters
starting nationality
8 characters
ending nationality
109 characters
starting vessel
6 characters
ending vessel
15 characters
starting person
8 characters
ending person
26 characters
starting vessel
15 characters
ending vessel
60 characters
starting person
18 characters
ending person
21 characters
starting person
8 characters
ending person
246 characters
ending p
1 characters
1 characters
starting p
46 characters
starting person
8 characters
ending person
42 characters
ending p
1 characters
1 characters
starting p
44 characters
starting country
7 characters
ending country
26 characters
starting date
11 characters
ending date
31 characters
starting place
19

In [46]:
ch = sax2spans.SpanAndText(verbose=1)

In [47]:
parse(anno_dir / (verne + '.xml'), ch)

starting doc
starting paragraph
starting vessel
ending vessel
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting date
ending date
starting nationality
ending nationality
starting vessel
ending vessel
starting person
ending person
starting vessel
ending vessel
starting person
ending person
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting country
ending country
starting date
ending date
starting place
ending place
starting place
ending place
starting person
ending person
starting place
ending place
ending paragraph
ignoring text outside paragraphs
ending doc
4 paragraphs


In [48]:
text, annos = sax2spans.text_and_spans(ch)

In [49]:
len(text)

1140

In [50]:
print(text)

The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed the wonderful dexterity of Ned Land, and proved what confidence we might place in him.
The 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But one of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in chasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board the Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one straight to the heart, and catching the other after some minutes’ pursuit.
Decidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.
The frigate skirted the south-east coast of America with great rapidity. The 3rd of July we were at the op

In [51]:
len(annos)

16

In [52]:
annos[0]

SpanAnnotation(vessel, 18, 33)

In [53]:
for anno in annos:
    print(anno.label)
    print(anno.start, anno.end)
    print(text[anno.start:anno.end])


vessel
18 33
Abraham Lincoln
person
155 163
Ned Land
date
219 232
30th of June,
nationality
256 264
American
vessel
373 379
Monroe
person
394 402
Ned Land
vessel
428 443
Abraham Lincoln
person
503 521
Commander Farragut
person
542 550
Ned Land
person
843 851
Ned Land
country
938 945
America
date
971 982
3rd of July
place
1013 1032
Straits of Magellan
place
1045 1057
Cape Vierges
person
1063 1081
Commander Farragut
place
1129 1138
Cape Horn


In [54]:
def summarize_annos(text, annos):
    """Count the surface strings covered by the annotations, grouped by label.

    Args:
        text: the string the annotation offsets index into.
        annos: iterable of objects exposing ``label``, ``start`` and ``end``.

    Returns:
        dict mapping each label to a ``Counter`` of the text slices
        ``text[start:end]`` annotated with that label.
    """
    summary = {}
    for span in annos:
        label = span.label
        if label not in summary:
            summary[label] = Counter()
        surface = text[span.start:span.end]
        summary[label][surface] += 1
    return summary


In [55]:
summarize_annos(text, annos)

{'vessel': Counter({'Abraham Lincoln': 2, 'Monroe': 1}),
 'person': Counter({'Ned Land': 4, 'Commander Farragut': 2}),
 'date': Counter({'30th of June,': 1, '3rd of July': 1}),
 'nationality': Counter({'American': 1}),
 'country': Counter({'America': 1}),
 'place': Counter({'Straits of Magellan': 1,
          'Cape Vierges': 1,
          'Cape Horn': 1})}

In [56]:
cws = sax2spans.find_consec_whitespace(text)

In [57]:
len(cws)

0

# Tokenization Testing

## Recovering Token Index for char2token

In [58]:
from bisect import bisect_left, bisect_right

In [59]:
a = [1, 5, 6, 13, 14, 19, 25]

In [60]:
bisect_left(a, 7)

3

In [61]:
text

'The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed the wonderful dexterity of Ned Land, and proved what confidence we might place in him.\nThe 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But one of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in chasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board the Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one straight to the heart, and catching the other after some minutes’ pursuit.\nDecidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.\nThe frigate skirted the south-east coast of America with great rapidity. The 3rd of July we were at th

### Simple Tokenizers

create tokenizers from Hugging Face pre-tokenizers

**Note:** The Hugging Face `Tokenizer` class requires a trainable model, so it won't actually do simple
tokenization.  However, we can use the Hugging Face pre-tokenizers to build a tokenizer whose output
conforms to the output of a `Tokenizer`.

#### Whitespace


"Whitespace" pre-tokenizer is a misnomer - it actually splits on '\w+|[^\w\s]+' which also separates punctuation (except for underscore) from words.  For simple splitting on whitespace, use WhitespaceSplit

In [62]:
wss = WhitespaceSplit()

In [63]:
help(wss.pre_tokenize_str)

Help on built-in function pre_tokenize_str:

pre_tokenize_str(self, sequence) method of tokenizers.pre_tokenizers.WhitespaceSplit instance
    Pre tokenize the given string

    This method provides a way to visualize the effect of a
    :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
    alignment, nor does it provide all the capabilities of the
    :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
    :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

    Args:
        sequence (:obj:`str`):
            A string to pre-tokeize

    Returns:
        :obj:`List[Tuple[str, Offsets]]`:
            A list of tuple with the pre-tokenized parts and their offsets



In [64]:
spaced = wss.pre_tokenize_str(text)

In [65]:
import itertools

In [66]:
def flatten(list_of_lists):
    """Lazily chain the items of each inner iterable into one flat iterator.

    Only the outermost level of nesting is removed.
    """
    chain_each = itertools.chain.from_iterable
    return chain_each(list_of_lists)

In [67]:
bounds = [x[1] for x in spaced]

In [68]:
bounds[:15]

[(0, 3),
 (4, 10),
 (11, 13),
 (14, 17),
 (18, 25),
 (26, 33),
 (34, 37),
 (38, 41),
 (42, 43),
 (44, 48),
 (49, 53),
 (54, 60),
 (61, 63),
 (64, 66),
 (67, 74)]

Token bounds follow the Python slice convention: token 0 -> characters 0 up to (but not including) 3.

In [69]:
fb = list(flatten(bounds))

In [70]:
fb[:30]

[0,
 3,
 4,
 10,
 11,
 13,
 14,
 17,
 18,
 25,
 26,
 33,
 34,
 37,
 38,
 41,
 42,
 43,
 44,
 48,
 49,
 53,
 54,
 60,
 61,
 63,
 64,
 66,
 67,
 74]

In [71]:
for i in range(13, 20):
    br = bisect_right(fb, i)
    bl = bisect_left(fb, i)
    print(i, bl, br, f'({fb[bl]}, {fb[br]})')

13 5 6 (13, 14)
14 6 7 (14, 17)
15 7 7 (17, 17)
16 7 7 (17, 17)
17 7 8 (17, 18)
18 8 9 (18, 25)
19 9 9 (25, 25)


In [72]:
wss_tok = wss_tokenizer()

In [73]:
toked = wss_tok.tokenize(text)

In [74]:
for i in range (13, 20):
    ti = toked.char_to_token(i)
    tc = '?'
    if ti is not None:
        tc = toked.tokens[ti]
    print(i, text[i], ti, tc)

13   None ?
14 t 3 the
15 h 3 the
16 e 3 the
17   None ?
18 A 4 Abraham
19 b 4 Abraham


In [75]:
spans = [a.to_labeled_span() for a in annos]
spans

[{'start': 18, 'label': 'vessel', 'end': 33},
 {'start': 155, 'label': 'person', 'end': 163},
 {'start': 219, 'label': 'date', 'end': 232},
 {'start': 256, 'label': 'nationality', 'end': 264},
 {'start': 373, 'label': 'vessel', 'end': 379},
 {'start': 394, 'label': 'person', 'end': 402},
 {'start': 428, 'label': 'vessel', 'end': 443},
 {'start': 503, 'label': 'person', 'end': 521},
 {'start': 542, 'label': 'person', 'end': 550},
 {'start': 843, 'label': 'person', 'end': 851},
 {'start': 938, 'label': 'country', 'end': 945},
 {'start': 971, 'label': 'date', 'end': 982},
 {'start': 1013, 'label': 'place', 'end': 1032},
 {'start': 1045, 'label': 'place', 'end': 1057},
 {'start': 1063, 'label': 'person', 'end': 1081},
 {'start': 1129, 'label': 'place', 'end': 1138}]

In [76]:
def check_char_to_token(spans, source_text=None, tokenized=None):
    """Sanity-check ``char_to_token`` against whitespace splitting for each span.

    Every character of every span is looked up with ``char_to_token``.  Two
    kinds of findings are reported via ``print``:

    * characters whose whitespace status disagrees with the lookup, i.e.
      whitespace that maps to a token or non-whitespace that maps to none;
    * token indices whose number of mapped characters differs from the length
      of the corresponding whitespace-separated word of the span.

    Args:
        spans: iterable of dicts with 'start', 'end' and 'label' keys, where
            'start'/'end' are character offsets into the text.
        source_text: the text the offsets refer to.  Defaults to the
            notebook-level ``text``.
        tokenized: object exposing ``char_to_token(index)``, which returns a
            token index or ``None``.  Defaults to the notebook-level ``toked``.

    Returns:
        None; all findings are printed.
    """
    # Fall back to the notebook globals so existing one-argument calls still work.
    if source_text is None:
        source_text = text
    if tokenized is None:
        tokenized = toked

    for span in spans:
        label = span['label']
        # Bound to a local so the f-string below needs no nested quotes
        # (reusing the outer quote type is a SyntaxError before Python 3.12).
        start = span['start']
        span_text = source_text[start:span['end']]
        span_words = span_text.split()
        print(f'{label}: "{span_text}" at {start}')

        til = []   # token index of every character that maps to a token
        errs = []  # (char, index in span, is whitespace, has no token)
        for i, c in enumerate(span_text):
            ti = tokenized.char_to_token(i + start)
            tin = ti is None
            is_ws = not c.strip()
            # Exactly the whitespace characters should lack a token.
            if tin ^ is_ws:
                errs.append((c, i, is_ws, tin))
            if not tin:
                til.append(ti)

        if errs:
            print(f'Inconsistent ws and token labels in ({label} "{span_text}")')
            for c, i, is_ws, tin in errs:
                msgs = [f'\tchar "{c}" at index {i} is']
                if not is_ws:
                    msgs.append('not')
                msgs.append('whitespace but had')
                # ``tin`` is True when the lookup returned no token.
                msgs.append('no' if tin else 'a')
                msgs.append('token id')
                # Report every inconsistency, not just the last one.
                print(' '.join(msgs))

        if not til:
            # No character mapped to a token, so there is no first token index
            # to anchor the expected counts on; any non-whitespace characters
            # were already reported above.
            continue

        tic = Counter(til)
        i_min = min(til)
        # Assume the span's words map to consecutive tokens starting at i_min.
        expected = {i + i_min: len(word) for i, word in enumerate(span_words)}
        for i in sorted(set(tic) | set(expected)):
            want = expected.get(i, 0)  # token index seen but not expected -> 0
            if tic[i] != want:
                print(f'expected {want} counts for {i}, but saw {tic[i]}')
                
        

# Alignment Testing

In [77]:
iob_labels = ata(toked, spans)

In [78]:
def new_span():
    """Return a fresh, empty accumulator with parallel 'tokens' and 'labels' lists."""
    return dict(tokens=[], labels=[])

def show_span(span):
    """Print one "label token" line per entry of the span, then a separator.

    Nothing is printed, separator included, when the span holds no tokens.
    """
    tokens = span['tokens']
    labels = span['labels']
    if not tokens:
        return
    for pair in zip(labels, tokens):
        print(*pair)
    print('---------')

def update_span(span, tok, lab):
    """Append a token and its label to the span's parallel lists, in place."""
    for key, value in (('tokens', tok), ('labels', lab)):
        span[key].append(value)
    
def analyze_labels(toked, iob_labels):
    """Print the labelled tokens grouped into one block per entity.

    Tokens labelled 'O' are skipped and close the current group.  A label
    beginning with 'B-' or 'U-' also closes the current group before being
    added, so two entities that directly follow each other (with no 'O'
    token in between) are printed as separate groups instead of one.

    Args:
        toked: object whose ``tokens`` attribute is the token sequence.
        iob_labels: one label per token, aligned with ``toked.tokens``.

    Returns:
        None; groups are printed via ``show_span``.
    """
    current_span = new_span()
    for tok, lab in zip(toked.tokens, iob_labels):
        if lab == 'O':
            show_span(current_span)
            current_span = new_span()
            continue
        if lab.startswith(('B-', 'U-')):
            # A new entity starts here: flush whatever was collected so far.
            # show_span prints nothing for an empty span, so this is harmless
            # when the previous token was 'O'.
            show_span(current_span)
            current_span = new_span()
        update_span(current_span, tok, lab)
    # Flush a trailing entity that runs to the end of the text.
    show_span(current_span)
        
            
        

In [79]:
analyze_labels(toked, iob_labels)

B-vessel Abraham
L-vessel Lincoln
---------
B-person Ned
L-person Land,
---------
B-date 30th
I-date of
L-date June,
---------
U-nationality American
---------
U-vessel Monroe,
---------
B-person Ned
L-person Land
---------
B-vessel Abraham
L-vessel Lincoln,
---------
B-person Commander
L-person Farragut,
---------
B-person Ned
L-person Land
---------
B-person Ned
L-person Land’s
---------
U-country America
---------
B-date 3rd
I-date of
L-date July
---------
B-place Straits
I-place of
L-place Magellan,
---------
B-place Cape
L-place Vierges.
---------
B-person Commander
L-person Farragut
---------
B-place Cape
L-place Horn.
---------


# Test Tok2spans

In [80]:
gen_annos = iob2spans(toked.tokens, iob_labels)

In [81]:
roundtrip = list(gen_annos)

In [82]:
len(roundtrip)

16

In [83]:
len(annos)

16

In [84]:
roundtrip[0]

SpanAnnotation(vessel, 18, 33)

In [85]:
def compare_annos(orig, rt):
    """Compare two annotation sequences position by position.

    A warning is printed when the sequences differ in length; only the
    positions present in both are compared.

    Args:
        orig: the original annotations.
        rt: the round-tripped annotations.

    Returns:
        A ``(matches, diffs)`` tuple: ``matches`` lists the indices whose
        items compare equal, ``diffs`` lists ``(index, original, roundtrip)``
        triples for the rest.
    """
    if len(orig) != len(rt):
        print('warning: lengths do not match')
    same, different = [], []
    for position, (original, regenerated) in enumerate(zip(orig, rt)):
        if original == regenerated:
            same.append(position)
        else:
            different.append((position, original, regenerated))
    return same, different

In [86]:
matches, diffs = compare_annos(annos, roundtrip)

In [87]:
len(matches)

8

In [88]:
len(diffs)

8

In [89]:
diffs[0]

(1, SpanAnnotation(person, 155, 163), SpanAnnotation(person, 155, 164))

In [90]:
diffs

[(1, SpanAnnotation(person, 155, 163), SpanAnnotation(person, 155, 164)),
 (4, SpanAnnotation(vessel, 373, 379), SpanAnnotation(vessel, 373, 380)),
 (6, SpanAnnotation(vessel, 428, 443), SpanAnnotation(vessel, 428, 444)),
 (7, SpanAnnotation(person, 503, 521), SpanAnnotation(person, 503, 522)),
 (9, SpanAnnotation(person, 843, 851), SpanAnnotation(person, 843, 853)),
 (12, SpanAnnotation(place, 1013, 1032), SpanAnnotation(place, 1013, 1033)),
 (13, SpanAnnotation(place, 1045, 1057), SpanAnnotation(place, 1045, 1058)),
 (15, SpanAnnotation(place, 1129, 1138), SpanAnnotation(place, 1129, 1139))]

In [93]:
for diff in diffs:
    print(diff)
    for sa in diff[1:]:
        print(text[sa.start:sa.end])

(1, SpanAnnotation(person, 155, 163), SpanAnnotation(person, 155, 164))
Ned Land
Ned Land,
(4, SpanAnnotation(vessel, 373, 379), SpanAnnotation(vessel, 373, 380))
Monroe
Monroe,
(6, SpanAnnotation(vessel, 428, 443), SpanAnnotation(vessel, 428, 444))
Abraham Lincoln
Abraham Lincoln,
(7, SpanAnnotation(person, 503, 521), SpanAnnotation(person, 503, 522))
Commander Farragut
Commander Farragut,
(9, SpanAnnotation(person, 843, 851), SpanAnnotation(person, 843, 853))
Ned Land
Ned Land’s
(12, SpanAnnotation(place, 1013, 1032), SpanAnnotation(place, 1013, 1033))
Straits of Magellan
Straits of Magellan,
(13, SpanAnnotation(place, 1045, 1057), SpanAnnotation(place, 1045, 1058))
Cape Vierges
Cape Vierges.
(15, SpanAnnotation(place, 1129, 1138), SpanAnnotation(place, 1129, 1139))
Cape Horn
Cape Horn.


Expected: the original annotations did not include trailing punctuation, but we are aligning to whitespace-tokenized text,
so the round-trip spans do.