In [42]:
from cas_visualizer.visualizer import SpacySpanVisualiser
from IPython.display import display, HTML

import spacy
import cassis
from rwse import RWSE_Checker

In [43]:
# Build the checker. The log printed below this cell shows it initialising a
# BertForMaskedLM from the bert-base-cased checkpoint and running on CPU.
rwse = RWSE_Checker()
# Hand the checker the confusion-set CSV it should work with.
rwse.set_confusion_sets('input/confusion_sets_modified.csv')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [44]:
# Fully qualified names of the annotation types used throughout the notebook.
T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'

# Load the type system description (opened in binary mode) and resolve the
# Sentence / Token types that are instantiated below.
ts_file = 'input/TypeSystem.xml'
with open(ts_file, 'rb') as f:
    ts = cassis.load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

# Fresh CAS whose document text is the first line of the input file.
cas = cassis.Cas(ts)
input_file = 'input/use_case_example_modified.txt'
with open(input_file, 'r') as f:
    cas.sofa_string = f.readlines()[0]

# Disabled alternative: use an inline sample sentence instead of the file.
#cas.sofa_string = "If you want to embed an HTML page with JavaScript on your page now, the easiest thing to do is to save your HTML file to the directory with your notebook and then load the HTML as follows:"

# Segment the text with spaCy and mirror its sentences and tokens into the CAS
# using character offsets into the sofa string.
nlp = spacy.load("en_core_web_sm")
doc = nlp(cas.sofa_string)

for sent in doc.sents:
    cas.add(S(begin=sent.start_char, end=sent.end_char))

for token in doc:
    token_begin = token.idx
    token_end = token_begin + len(token.text)
    cas.add(T(begin=token_begin, end=token_end, id=token.i))

def check_for_rwse(magnitude):
    """Re-run the RWSE check on the notebook-level ``cas`` and render the hits.

    Any RWSE annotations already present in ``cas`` are removed first, then
    ``rwse.check_cas`` is invoked with the given magnitude. The number of RWSE
    annotations found and their sorted ``(begin, end, covered text)`` triples
    are printed.

    Args:
        magnitude: Forwarded unchanged as the third argument of
            ``rwse.check_cas``.

    Returns:
        The result of ``SpacySpanVisualiser.visualise()`` for the RWSE
        annotations; callers in this notebook wrap it in ``HTML`` for display.
    """
    # Drop the results of any previous run so the counts reflect this run only.
    for stale in cas.select(T_RWSE):
        cas.remove(stale)
    rwse.check_cas(cas, ts, magnitude)

    hits = cas.select(T_RWSE)
    print('magnitude:', magnitude, 'found:', len(hits))
    print(sorted((hit.begin, hit.end, hit.get_covered_text()) for hit in hits))

    visualiser = SpacySpanVisualiser(cas, [])
    visualiser.set_selected_annotations_to_types({'RWSE': T_RWSE})
    visualiser.set_annotations_to_colors({'RWSE': 'lightgreen'})
    visualiser.set_span_type(SpacySpanVisualiser.SPAN_STYLE_HIGHLIGHTING)
    visualiser.set_allow_highlighting_overlap(True)
    return visualiser.visualise()

In [45]:
# One rendered visualisation per magnitude, in the order the magnitudes are listed.
html_list = [check_for_rwse(magnitude) for magnitude in (1, 10, 20, 100)]

magnitude: 1 found: 5
[(59, 63, 'from'), (261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your'), (928, 932, 'live')]
magnitude: 10 found: 3
[(261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your')]
magnitude: 20 found: 3
[(261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your')]
magnitude: 100 found: 3
[(261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your')]


In [46]:
# Show the visualisation of the first run in html_list (magnitude 1).
display(HTML(html_list[0]))

### Modify CAS annotations

In [47]:
# RWSE hits reported for magnitude 1 as (begin, end, covered text); the offset
# thresholds used for bucketing further down were picked from this list:
#[(59, 63, 'from'), (261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your'), (928, 932, 'live')]
ts_file = 'input/TypeSystem.xml'
with open(ts_file, 'rb') as f:
    mod_ts = cassis.load_typesystem(f)


T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

# Resolve Sentence/Token from mod_ts itself so every annotation added to
# mod_cas belongs to the type system the CAS was created with. Previously the
# S/T objects bound to the separate `ts` instance were reused here.
mod_S = mod_ts.get_type(T_SENTENCE)
mod_T = mod_ts.get_type(T_TOKEN)

# Three subtypes of RWSE, one per highlight bucket in the visualisation.
mod_ts.create_type('custom_1', T_RWSE)
C1 = mod_ts.get_type('custom_1')
mod_ts.create_type('custom_st100', T_RWSE)
C_st100 = mod_ts.get_type('custom_st100')
mod_ts.create_type('custom_all', T_RWSE)
C_all = mod_ts.get_type('custom_all')
mod_cas = cassis.Cas(mod_ts)

# Document text is the first line of the input file.
input_file = 'input/use_case_example_modified.txt'
with open(input_file, 'r') as f:
    mod_cas.sofa_string = f.readlines()[0]

# Disabled alternative: use an inline sample sentence instead of the file.
#mod_cas.sofa_string = "If you want to embed an HTML page with JavaScript on your page now, the easiest thing to do is to save your HTML file to the directory with your notebook and then load the HTML as follows:"

# Segment with spaCy and mirror sentences and tokens into the CAS using
# character offsets into the sofa string.
nlp = spacy.load("en_core_web_sm")
doc = nlp(mod_cas.sofa_string)
for sent in doc.sents:
    cas_sentence = mod_S(begin=sent.start_char, end=sent.end_char)
    mod_cas.add(cas_sentence)
for token in doc:
    cas_token = mod_T(begin=token.idx, end=token.idx + len(token.text), id=token.i)
    mod_cas.add(cas_token)

# Run the check against the type system mod_cas was built from (was `ts`).
rwse.check_cas(mod_cas, mod_ts, 1)

# Hand-picked begin offsets separating the buckets: in the hit list at the top
# of this cell, 'from' (59) lies below the first limit and 'live' (928) above
# the second; everything in between goes to the third bucket.
EARLY_OFFSET_LIMIT = 70
LATE_OFFSET_LIMIT = 900

# list(...) makes it explicit that we iterate over a snapshot: the custom types
# are subtypes of RWSE and are added to the same CAS inside the loop.
for annotation in list(mod_cas.select(T_RWSE)):
    if annotation.begin < EARLY_OFFSET_LIMIT:
        bucket_type = C1
    elif annotation.begin > LATE_OFFSET_LIMIT:
        bucket_type = C_st100
    else:
        bucket_type = C_all
    custom = bucket_type(begin=annotation.begin, end=annotation.end)
    mod_cas.add(custom)

spacy_span_vis = SpacySpanVisualiser(mod_cas, [])

# NOTE(review): the legend labels presumably name the magnitudes at which a hit
# is reported. The run output printed earlier in this notebook lists 'live'
# (928) only for magnitude 1, so confirm that '1,10,20' is the intended label
# for the C_st100 bucket.
spacy_span_vis.set_selected_annotations_to_types({
    '1': C1,
    '1,10,20': C_st100,
    'ALL': C_all,
})

spacy_span_vis.set_annotations_to_colors({
    '1': 'palegreen',#'#0DF5B3',
    '1,10,20': 'lightgreen',#'#0AC590',
    'ALL': 'limegreen',#'#078A65',
})
spacy_span_vis.set_span_type(SpacySpanVisualiser.SPAN_STYLE_HIGHLIGHTING)
spacy_span_vis.set_allow_highlighting_overlap(True)
html = spacy_span_vis.visualise()

In [48]:
# Show the visualisation of the custom highlight types assigned to `html` in the previous cell.
display(HTML(html))