In [15]:
from cas_visualizer.visualizer import SpacySpanVisualiser
from IPython.display import display, HTML

import spacy
import cassis
from rwse import RWSE_Checker

### Build CAS

In [16]:
# --- Type system -----------------------------------------------------------
# Load the UIMA type system and look up the two segmentation types that are
# instantiated below.
ts_file = 'input/TypeSystem.xml'
with open(ts_file, 'rb') as f:
    ts = cassis.load_typesystem(f)

T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

# --- Document text ---------------------------------------------------------
cas = cassis.Cas(ts)

input_file = 'input/use_case_example_bawe_0246b_modified.txt'
# Decode explicitly instead of relying on the platform's locale encoding: all
# character offsets stored in the CAS below are computed on the decoded
# string, so the decoding must not vary between machines.
# NOTE(review): assumes the input file is UTF-8 encoded -- confirm.
with open(input_file, 'r', encoding='utf-8') as f:
    # Only the FIRST line of the file becomes the document text (including
    # its trailing newline, if present); any further lines are ignored.
    first_line = f.readline()
if not first_line:
    # Fail with a clear message rather than running the pipeline on nothing.
    raise ValueError(f'{input_file} is empty; there is no text to analyse')
cas.sofa_string = first_line

# --- Segmentation ----------------------------------------------------------
# Run spaCy on the document text and mirror its sentences and tokens into the
# CAS as Sentence / Token annotations with character offsets.
nlp = spacy.load("en_core_web_sm")
doc = nlp(cas.sofa_string)
for sent in doc.sents:
    cas_sentence = S(begin=sent.start_char, end=sent.end_char)
    cas.add(cas_sentence)
for token in doc:
    # token.idx is the token's start character offset; token.i (its index in
    # the document) is stored in the Token's `id` feature.
    cas_token = T(begin=token.idx, end=token.idx + len(token.text), id=token.i)
    cas.add(cas_token)

# --- RWSE check ------------------------------------------------------------
# Run the checker over the CAS with the given confusion sets. The next
# notebook section selects RWSE annotations from `cas`, so check_cas
# presumably adds them here -- verify against RWSE_Checker.
# NOTE(review): the meaning of the third argument (1) is defined by
# RWSE_Checker.check_cas and is not visible in this notebook.
rwse = RWSE_Checker()
rwse.set_confusion_sets('input/confusion_sets_modified.csv')
rwse.check_cas(cas, ts, 1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


### Visualize CAS

In [17]:
# Reference hits (begin, end, token) recorded for use_case_bawe_0246b.txt:
# [(59, 63, 'from'), (261, 265, 'Your'), (488, 491, 'you'), (579, 583, 'your'), (928, 932, 'live')]
# NOTE(review): these offsets were recorded for a differently named file than
# the one loaded in this notebook -- confirm they still apply.
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'

# Three helper types, each a subtype of the RWSE type, so that every group of
# hits can be given its own label and colour in the visualisation.
# NOTE: create_type is not guarded here, so this cell is meant to run once
# per freshly built type system / CAS.
ts.create_type('custom_1', T_RWSE)
C1 = ts.get_type('custom_1')
ts.create_type('custom_st100', T_RWSE)
C_st100 = ts.get_type('custom_st100')
ts.create_type('custom_all', T_RWSE)
C_all = ts.get_type('custom_all')

# Offsets that split the hits into three groups by their begin position.
# In the reference list above, only the hit at 59 lies below the first bound
# and only the hit at 928 lies above the second.
FIRST_GROUP_MAX_BEGIN = 70   # begin <  70 -> C1
LAST_GROUP_MIN_BEGIN = 900   # begin > 900 -> C_st100; everything else -> C_all

# Snapshot the selection before adding anything: the new annotations are
# subtypes of T_RWSE, so adding them must not be able to feed back into the
# sequence being iterated (this holds whether or not select() is lazy).
for annotation in list(cas.select(T_RWSE)):
    if annotation.begin < FIRST_GROUP_MAX_BEGIN:
        custom_type = C1
    elif annotation.begin > LAST_GROUP_MIN_BEGIN:
        custom_type = C_st100
    else:
        custom_type = C_all
    # Same span as the original hit, re-typed for display.
    custom = custom_type(begin=annotation.begin, end=annotation.end)
    cas.add(custom)

spacy_span_vis = SpacySpanVisualiser(cas, [])

# Display label -> annotation type. The same labels are used as keys in the
# colour mapping below.
spacy_span_vis.set_selected_annotations_to_types({
    '1': C1,
    '1,10,20': C_st100,
    'ALL': C_all,
})

# Display label -> highlight colour (earlier hex alternatives kept as notes).
spacy_span_vis.set_annotations_to_colors({
    '1': 'palegreen',         # alternative: '#0DF5B3'
    '1,10,20': 'lightgreen',  # alternative: '#0AC590'
    'ALL': 'limegreen',       # alternative: '#078A65'
})
spacy_span_vis.set_span_type(SpacySpanVisualiser.SPAN_STYLE_HIGHLIGHTING)
spacy_span_vis.set_allow_highlighting_overlap(True)
html = spacy_span_vis.visualise()

In [18]:
# Wrap the markup returned by the visualiser in an HTML object and render it
# inline as this cell's output.
rendered_spans = HTML(html)
display(rendered_spans)

### Analyze sentence

In [19]:
from transformers import pipeline

# Fill-mask pipeline on bert-base-cased; device=-1 selects the CPU.
pipe = pipeline(
    "fill-mask",
    model="bert-base-cased",
    device=-1,
)

# The sentence under inspection, with the questioned token replaced by [MASK].
masked_sentence = (
    "On examination , such patients may have enlarged , non-tender lymph nodes , "
    "an enlarged [MASK] and/or spleen ."
)

# Score only these candidate fillers (both casings of "life" / "live").
candidate_tokens = ["Life", "Live", "life", "live"]
results = pipe(masked_sentence, targets=candidate_tokens)

# Reference scores for the masked position:
#   0.82254                 liver    (best prediction overall)
#   0.06170                 kidney
#   0.04681                 lung
#   0.02685                 stomach
#   0.00831                 heart
#   1.406122720482017e-07   life     (best prediction from confusion set)
#   1.796062853998137e-08   live     (original token)
# The first five are not among the restricted targets above, so they
# presumably come from a separate, unrestricted run -- verify if needed.
results

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 1.406122720482017e-07,
  'token': 1297,
  'token_str': 'life',
  'sequence': 'On examination, such patients may have enlarged, non - tender lymph nodes, an enlarged life and / or spleen.'},
 {'score': 1.796062853998137e-08,
  'token': 1686,
  'token_str': 'live',
  'sequence': 'On examination, such patients may have enlarged, non - tender lymph nodes, an enlarged live and / or spleen.'},
 {'score': 1.086198464506083e-09,
  'token': 3374,
  'token_str': 'Live',
  'sequence': 'On examination, such patients may have enlarged, non - tender lymph nodes, an enlarged Live and / or spleen.'},
 {'score': 4.452583024505685e-10,
  'token': 2583,
  'token_str': 'Life',
  'sequence': 'On examination, such patients may have enlarged, non - tender lymph nodes, an enlarged Life and / or spleen.'}]