### RWSE-Checker: false-positives (false alarm) statistics from filtered corpus

In [1]:
from rwse import RWSE_Checker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
rwse = RWSE_Checker()
rwse.set_confusion_sets('input/confusion_sets_modified.csv')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

#### Load sentences by confusion sets

In [3]:
file_name = 'input/eng_news_2023-balanced-sentences.csv'

sentences_by_confusion_sets = dict()
with open(file_name, 'r') as f:
    lines = f.readlines()
    for line in lines:
        confusion_set, sentence = line.strip().split('\t')
        if sentences_by_confusion_sets.get(confusion_set) is None:
            sentences_by_confusion_sets[confusion_set] = []
        sentences_by_confusion_sets[confusion_set].append(sentence)

total = 0

for key, value in sorted(sentences_by_confusion_sets.items()):
    total += len(value)
    print(key, '=', len(value))

print('total =', total)

Being,begin,being = 100
Capital,Capitol,capital = 100
Country,County,country,county = 100
Desert,desert,dessert = 100
Easy,ease,easy = 100
Effect,affect,effect = 100
Effects,affects,effects = 100
Except,accept,except = 100
Few,View,few,view = 100
Form,From,form,from = 100
Found,Fund,found,fund = 100
Fourth,forth,fourth = 100
Hole,Whole,hole,whole = 100
Lead,Led,lead,led = 100
Life,Live,life,live = 100
Mad,Made,mad,made = 100
Or,or,ore = 94
Past,passed,past = 100
Peace,Piece,peace,piece = 100
Plain,plain,plane = 100
Principal,principal,principle = 100
Provence,Province,province = 12
Quiet,Quite,quiet,quite = 100
Rise,raise,rise = 100
Safe,Save,safe,save = 100
Site,sight,site = 100
Split,spit,split = 59
Than,Then,than,then = 100
Their,There,They,their,there,they = 100
Them,Theme,them,theme = 100
Things,things,thinks = 100
Three,Tree,three,tree = 100
To,Too,Two,to,too,two = 100
Trail,Trial,trail,trial = 100
Weather,Whether,weather,whether = 100
Week,weak,week = 100
Were,Where,were,where =

#### Determine RWSEs

In [4]:
from cassis import Cas, load_typesystem
import spacy

T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

nlp = spacy.load('en_core_web_sm')

path = 'input/TypeSystem.xml'

with open(path, 'rb') as f:
    ts = load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

In [5]:
file_name = 'output/false_positives_balanced.csv'

with open(file_name, 'w') as f:

    result = dict()

    for confusion_set, sentences in sentences_by_confusion_sets.items():
        matches = 0
        rwse.set_confusion_sets([set(confusion_set.split(','))])
        for sentence in sentences:
            cas = Cas(ts)
            # TODO clean sentence?
            cas.sofa_string = sentence
            doc = nlp(cas.sofa_string)
            cas_sentence = S(begin=0, end=len(sentence))
            cas.add(cas_sentence)
            for token in doc:
                cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
                cas.add(cas_token)
            rwse.check_cas(cas, ts)
            false_positives = cas.select(T_RWSE)
            if len(false_positives) != 0:
                matches += 1
                for item in false_positives:
                    before = cas.sofa_string[item.begin:item.end]
                    modified_string = cas.sofa_string[:item.begin] + ' [[' + cas.sofa_string[item.begin:item.end] + ']] ' +cas.sofa_string[item.end:]
                    print(f'{before} => {item.suggestion}' ,f'({item.certainty:.5f})' , modified_string, sep='\t', file=f)
        result[confusion_set] = {
            'num_sentences':len(sentences),
            'num_matches':matches,
        }

#### Determine false-positive rate

In [6]:
total = sum(item['num_sentences'] for item in result.values())
total_matches = sum(item['num_matches'] for item in result.values())
print(f'false positive rate: {total_matches/total:.2f}')
print(f'falsely identified {total_matches} out of {total}')

false positive rate: 0.00
falsely identified 16 out of 4671


In [7]:
file_name = 'output/report_false_positives_balanced.csv'

with open(file_name, 'w') as f:
    print('confusion_set', 'num_matches', 'num_sentences', sep=';', file=f)
    for key, value in result.items():
        print(key, value['num_matches'], value['num_sentences'], sep=';', file=f)
        print(key, value['num_matches'], value['num_sentences'], sep='\t')

Except,accept,except	0	100
advice,advise	0	100
Effect,affect,effect	2	100
Being,begin,being	0	100
bitch,pitch	0	36
brakes,breaks	1	60
Capital,Capitol,capital	0	100
Site,sight,site	0	100
Country,County,country,county	1	100
crab,crap	0	59
Desert,desert,dessert	1	100
Easy,ease,easy	1	100
Effects,affects,effects	1	100
extend,extent	0	100
feat,feet	0	100
Few,View,few,view	0	100
Form,From,form,from	0	100
Fourth,forth,fourth	0	100
forms,forums	1	86
Found,Fund,found,fund	0	100
Lead,Led,lead,led	0	100
Life,Live,life,live	0	100
loose,lose	1	100
Mad,Made,mad,made	0	100
Or,or,ore	0	94
Past,passed,past	0	100
Peace,Piece,peace,piece	0	100
Plain,plain,plane	1	100
Principal,principal,principle	1	100
Provence,Province,province	0	12
Quiet,Quite,quiet,quite	0	100
Rise,raise,rise	1	100
Safe,Save,safe,save	1	100
Split,spit,split	0	59
Than,Then,than,then	0	100
Their,There,They,their,there,they	1	100
Them,Theme,them,theme	0	100
Things,things,thinks	0	100
Trail,Trial,trail,trial	1	100
Three,Tree,three,tree	0	