# Overview

Data exploration to check our assumptions about the annotated text samples used as test data.

# Setup

## Imports

In [1]:
import os

In [2]:
from pathlib import Path

In [3]:
import re

In [4]:
import xml.etree.ElementTree as ET

In [5]:
from xml.sax import make_parser, parse

In [6]:
import label_alignment.sax2spans

In [7]:
import importlib

In [8]:
# Reload the module so that edits made to sax2spans.py since it was first
# imported are picked up without restarting the kernel.
importlib.reload(label_alignment.sax2spans)

<module 'label_alignment.sax2spans' from '/Users/dcf/development/python/utils/label-alignment/src/label_alignment/sax2spans.py'>

In [9]:
from label_alignment.sax2spans import Reporter, SpanAndText

# Load Data

In [10]:
project_root = Path('..')

In [11]:
project_root.is_dir()

True

In [12]:
project_root.resolve()

PosixPath('/Users/dcf/development/python/utils/label-alignment')

In [13]:
test_data = project_root / 'tests' / 'data'

In [14]:
list(test_data.glob('*'))

[PosixPath('../tests/data/annotated_texts')]

In [15]:
# Directory holding the annotated samples (a .txt file and a matching .xml file).
anno = test_data / 'annotated_texts'

In [16]:
list(anno.glob('*.txt'))

[PosixPath('../tests/data/annotated_texts/verne_20000_leagues.ch5.txt')]

In [17]:
list(anno.glob('*.xml'))

[PosixPath('../tests/data/annotated_texts/verne_20000_leagues.ch5.xml')]

In [18]:
# Shared file stem: the plain-text and XML versions differ only by extension.
verne = 'verne_20000_leagues.ch5'

In [19]:
# Read the raw (un-annotated) chapter text.
# The encoding is given explicitly because the sample contains non-ASCII
# characters (e.g. the curly apostrophe ’); relying on the platform default
# could mis-decode the file or raise on systems whose default is not UTF-8.
with open(anno / (verne + '.txt'), encoding='utf-8') as vin:
    vtext = vin.read()
    

In [20]:
vtext

'The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed\nthe wonderful dexterity of Ned Land, and proved what confidence we might place in him.\n\nThe 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But\none of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in\nchasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board\nthe Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one\nstraight to the heart, and catching the other after some minutes’ pursuit.\n\nDecidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.\n\nThe frigate skirted the south-east coast of America with great rapidity. The 3rd of July we

In [21]:
# Look for two consecutive spaces in the raw text; no output (None) means
# there are none.
re.search('  ', vtext)

In [22]:
# Replace every pair of consecutive newlines with a visible marker to see
# where the paragraph breaks are; single newlines (line wraps) are left alone.
re.sub(r'\n{2}', 'FOO', vtext)

'The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed\nthe wonderful dexterity of Ned Land, and proved what confidence we might place in him.FOOThe 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But\none of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in\nchasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board\nthe Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one\nstraight to the heart, and catching the other after some minutes’ pursuit.FOODecidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.FOOThe frigate skirted the south-east coast of America with great rapidity. The 3rd of July we we

In [23]:
# Parse the annotated XML version of the same chapter into an ElementTree.
vx = ET.parse(anno / (verne + '.xml'))

# Data Exploration

## Via ElementTree

In [24]:
root = vx.getroot()

In [25]:
root.tag

'doc'

In [26]:
list(root)

[<Element 'p' at 0x10e046cf0>,
 <Element 'p' at 0x10e046de0>,
 <Element 'p' at 0x10e047100>,
 <Element 'p' at 0x10e0471a0>]

In [27]:
root[0]

<Element 'p' at 0x10e046cf0>

In [28]:
root.findall('p')

[<Element 'p' at 0x10e046cf0>,
 <Element 'p' at 0x10e046de0>,
 <Element 'p' at 0x10e047100>,
 <Element 'p' at 0x10e0471a0>]

In [29]:
# find() returns only the first matching child, i.e. the first paragraph
# (the same element as root[0] above).
p1 = root.find('p')

In [30]:
list(p1)

[<Element 'vessel' at 0x10e046d40>, <Element 'person' at 0x10e046d90>]

In [31]:
p1.text

'The voyage of the '

In [32]:
list(p1.iter())

[<Element 'p' at 0x10e046cf0>,
 <Element 'vessel' at 0x10e046d40>,
 <Element 'person' at 0x10e046d90>]

In [33]:
# For the paragraph and each nested element, pair the text inside the element
# (.text) with the text that follows its closing tag (.tail).
[(node.text, node.tail) for node in p1.iter()]

[('The voyage of the ', '\n\n'),
 ('Abraham Lincoln',
  ' was for a long time marked by no special incident. But one circumstance happened which showed the wonderful dexterity of '),
 ('Ned Land', ', and proved what confidence we might place in him.')]

## Via Sax

In [34]:
# NOTE(review): `sparser` is not referenced by any later cell shown here; the
# parse() calls below are given only a path and a handler.
sparser = make_parser()

In [35]:
rep = Reporter()

In [36]:
rep

<label_alignment.sax2spans.Reporter at 0x10e065520>

In [37]:
# Stream the annotated XML through the Reporter handler; judging by its output
# it logs each element start/end and the length of every character chunk.
parse(anno / (verne + '.xml'), rep)

<xml.sax.expatreader.ExpatLocator object at 0x10e0283b0>
starting doc
1 characters
starting p
18 characters
starting vessel
15 characters
ending vessel
122 characters
starting person
8 characters
ending person
51 characters
ending p
1 characters
1 characters
starting p
4 characters
starting date
13 characters
ending date
24 characters
starting nationality
8 characters
ending nationality
109 characters
starting vessel
6 characters
ending vessel
15 characters
starting person
8 characters
ending person
26 characters
starting vessel
15 characters
ending vessel
60 characters
starting person
18 characters
ending person
21 characters
starting person
8 characters
ending person
246 characters
ending p
1 characters
1 characters
starting p
46 characters
starting person
8 characters
ending person
42 characters
ending p
1 characters
1 characters
starting p
44 characters
starting country
7 characters
ending country
26 characters
starting date
11 characters
ending date
31 characters
starting place
19

In [38]:
ch = SpanAndText()

In [39]:
# Run the same file through the SpanAndText handler; its log shows that text
# outside paragraphs is ignored and it finishes with a paragraph count.
parse(anno / (verne + '.xml'), ch)

starting doc
ignoring text outside paragraphs
starting paragraph
starting vessel
ending vessel
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting date
ending date
starting nationality
ending nationality
starting vessel
ending vessel
starting person
ending person
starting vessel
ending vessel
starting person
ending person
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting person
ending person
ending paragraph
ignoring text outside paragraphs
ignoring text outside paragraphs
starting paragraph
starting country
ending country
starting date
ending date
starting place
ending place
starting place
ending place
starting person
ending person
starting place
ending place
ending paragraph
ignoring text outside paragraphs
ending doc
4 paragraphs


In [40]:
# Unpack the handler's result: the extracted text and the collection of span
# annotations (each exposing label/start/end, as used further down).
text, annos = ch.text_and_spans()

In [41]:
len(text)

1140

In [42]:
print(text)

The voyage of the Abraham Lincoln was for a long time marked by no special incident. But one circumstance happened which showed the wonderful dexterity of Ned Land, and proved what confidence we might place in him.
The 30th of June, the frigate spoke some American whalers, from whom we learned that they knew nothing about the narwhal. But one of them, the captain of the Monroe, knowing that Ned Land had shipped on board the Abraham Lincoln, begged for his help in chasing a whale they had in sight. Commander Farragut, desirous of seeing Ned Land at work, gave him permission to go on board the Monroe. And fate served our Canadian so well that, instead of one whale, he harpooned two with a double blow, striking one straight to the heart, and catching the other after some minutes’ pursuit.
Decidedly, if the monster ever had to do with Ned Land’s harpoon, I would not bet in its favour.
The frigate skirted the south-east coast of America with great rapidity. The 3rd of July we were at the op

In [43]:
len(annos)

16

In [44]:
annos[0]

<label_alignment.span_annotation.SpanAnnotation at 0x10e066bd0>

In [45]:
# Print each annotation's label, its offsets, and the text those offsets select,
# to confirm that the spans line up with the extracted text.
# The loop variable is `span`, not `anno`: `anno` is the Path to the
# annotated_texts directory set earlier, and rebinding it here would break any
# earlier cell (file reads, XML parsing) that is re-run after this one.
for span in annos:
    print(span.label)
    print(span.start, span.end)
    print(text[span.start:span.end])


vessel
18 33
Abraham Lincoln
person
155 163
Ned Land
date
219 232
30th of June,
nationality
256 264
American
vessel
373 379
Monroe
person
394 402
Ned Land
vessel
428 443
Abraham Lincoln
person
503 521
Commander Farragut
person
542 550
Ned Land
person
843 851
Ned Land
country
938 945
America
date
971 982
3rd of July
place
1013 1032
Straits of Magellan
place
1045 1057
Cape Vierges
person
1063 1081
Commander Farragut
place
1129 1138
Cape Horn
