In [1]:
# Automatically reload imported modules so that edits to external .py files take effect immediately
%load_ext autoreload
%autoreload 2

In [44]:
from analyze import TextlineTagger

"""
Define TextlineTagger subclasses based on your analysis needs.
Tag each textline you are interested in with the appropriate tags.
Return None if you don't care about this textline.
"""
class DeleteErrorTagger(TextlineTagger):
    """Tag textlines whose only errors are deletions and whose hypothesis is blank.

    A matching line is tagged with whichever of ``DETECTOR_DELETE``,
    ``REJECTED`` or ``EARLY_REJ`` appears in ``textline.tags``, or with
    ``UNK`` when none of them is present.
    """

    def tag_line(self, textline):
        """Return a one-element tag list for a pure-deletion line, else ``None``."""
        # skip rejected/filtered/early_rejected lines
        if textline.is_debug_line:
            return None

        # The line must contain at least one deletion error ...
        if not textline.del_err > 0:
            return None
        # ... its hypothesis must consist solely of '*' and ' ' characters ...
        if not all(ch in ('*', ' ') for ch in textline.hyp):
            return None
        # ... and it must have no insertion or substitution errors.
        if textline.insert_err != 0 or textline.subs_err != 0:
            return None

        candidate_tags = {"DETECTOR_DELETE", "REJECTED", "EARLY_REJ"}
        matched = [name for name in candidate_tags if name in textline.tags]
        # At most one of the candidate tags is expected on any given line.
        assert len(matched) <= 1, matched
        return [matched[0] if matched else 'UNK']


class CharErrorPairTagger(TextlineTagger):
    """Tag textlines with their mismatching ``ref -> hyp`` character pairs."""

    def tag_line(self, textline):
        """Return one ``'<ref char> -> <hyp char>'`` tag per mismatching position.

        Positions where the two characters are equal, or where either one is
        ``'*'``, produce no tag. Returns ``None`` for debug lines.
        """
        # skip rejected/filtered/early_rejected lines
        if textline.is_debug_line:
            return None

        expected, actual = textline.ref, textline.hyp
        # Both strings are compared position by position, so they must line up.
        assert len(expected) == len(actual), textline

        return [
            f'{ref_ch} -> {hyp_ch}'
            for ref_ch, hyp_ch in zip(expected, actual)
            if ref_ch != hyp_ch and ref_ch != '*' and hyp_ch != '*'
        ]

In [77]:
from analyze import EvalAnalyzer

# Run identifiers and evaluation settings for this analysis.
baseline_id = 'e8efe255-c62f-4dd2-8a58-8fc4eb417553'
alldata_id = '1d48bf6d-d4dc-4cb6-b04a-3df6ef5326e6'
script = 'latin_hw'
doc_only = False
entity = 'TextAnalyticsAPI_Quantity_EntityGroup'

# Each record pairs an alias name with the id of the run it refers to.
records = [
    # ('baseline', baseline_id),  # uncomment to include the baseline run
    ('alldata', alldata_id),
]

analyzer = EvalAnalyzer()

# Register the taggers defined above.
analyzer.register_textline_tagger(DeleteErrorTagger())
analyzer.register_textline_tagger(CharErrorPairTagger())

analysis = analyzer.analyze(records, script, doc_only, entity, 'lv1')

# Format the result and print one table per tagger, each followed by a blank line.
formatted = EvalAnalyzer.format(analysis, max_tag_count=10)
for tagger, fmt in formatted.items():
    print(tagger, fmt, sep='\n', end='\n\n')

92280it [00:02, 39673.05it/s]

DeleteErrorTagger
key      count alldata    percentage alldata
-----  ---------------  --------------------
UNK               4624                   100

CharErrorPairTagger
key       count alldata    percentage alldata
------  ---------------  --------------------
. -> ,              636                  5.82
, -> .              607                  5.56
1 -> /              427                  3.91
0 -> .              264                  2.42
0 -> O              252                  2.31
0 -> o              224                  2.05
1 -> -              223                  2.04
1 -> .              213                  1.95
. -> -              164                  1.5
5 -> S              162                  1.48




