### RWSE-Checker: false-positive (false-alarm) statistics from a filtered corpus

In [1]:
from rwse import RWSE_Checker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the confusion-set definitions handed to the checker.
CONFUSION_SETS_CSV = '../data/confusion_sets_modified.csv'

# Create the checker first, then point it at the confusion sets file.
rwse = RWSE_Checker()
rwse.set_confusion_sets(CONFUSION_SETS_CSV)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


#### Read from corpus and collect sentences by confusion sets

In [3]:
# Read the raw corpus lines. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('../data/eng_news_2023_10K-sentences.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# The sentence text is taken from the second tab-separated column of each line.
# Blank lines (e.g. a trailing newline at the end of the file) are ignored;
# a non-blank line without a tab still raises so malformed data is noticed.
sentences_cleaned = [line.split('\t')[1].strip() for line in sentences if line.strip()]

from util import collect_sentences_by_confusion_sets

# Group the cleaned sentences under each confusion set known to the checker.
sentences_by_confusion_sets = collect_sentences_by_confusion_sets(rwse.confusion_sets.values(), sentences_cleaned)

# Report how many sentences were collected per confusion set.
for key, value in sentences_by_confusion_sets.items():
    print(key, '=', len(value))

# NOTE(review): the total can exceed the number of corpus sentences,
# presumably because one sentence may be collected under several sets - confirm.
total = sum(len(value) for value in sentences_by_confusion_sets.values())
print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 31
begin,being = 213
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
sight,site = 29
cords,chords = 0
country,county = 100
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 21
few,view = 103
form,from = 839
forth,fourth = 33
forums,forms = 3
fund,found = 115
lead,led = 96
life,live = 147
loose,lose = 20
mad,made = 154
or,ore = 413
passed,past = 80
peace,piece = 28
plane,plain = 12
principal,principle = 13
quite,quiet = 37
raise,rise = 32
safe,save = 48
spit,split = 9
than,then = 419
their,there,they = 1421
theme,them = 246
things,thinks = 69
trail,trial = 27
tree,three = 174
two,too,to = 4563
weak,week = 121
weather,whether = 56
weed,wheat = 1
where,were = 598
which,witch = 424
whole,hole = 33
with,width = 1383
world,word = 119
you,your = 698
total = 13060


#### Determine RWSEs

In [4]:
from cassis import Cas, load_typesystem
import spacy

# Fully qualified names of the annotation types looked up in the type system
# (sentence, token) and selected from the CAS after checking (RWSE).
T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

nlp = spacy.load('en_core_web_sm')

path = '../data/TypeSystem.xml'

with open(path, 'rb') as f:
    ts = load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

# The report is opened once, in write mode, for the whole run. Re-opening it
# in append mode on every iteration made the cell non-idempotent: each re-run
# added the same rows to the file again.
with open('../experiments/data/false_positives.csv', 'w', encoding='utf-8') as f:
    # `set_sentences` (not `sentences`) so the corpus line list created by the
    # loading cell is not overwritten by this loop.
    for confusion_set, set_sentences in sentences_by_confusion_sets.items():
        # One CAS per confusion set, holding all of its sentences as one text.
        cas = Cas(ts)
        cas.sofa_string = ' '.join(set_sentences)

        # Sentence and token annotations come from spaCy's analysis of the
        # joined text, using character offsets into that text.
        doc = nlp(cas.sofa_string)
        for sent in doc.sents:
            cas.add(S(begin=sent.start_char, end=sent.end_char))
        for token in doc:
            cas.add(T(begin=token.idx, end=token.idx + len(token.text), id=token.i))

        # Restrict the checker to the current confusion set before checking.
        # NOTE(review): this replaces the checker's confusion sets, so after
        # the loop `rwse` presumably holds only the last set - confirm before
        # re-running cells that read `rwse.confusion_sets`.
        rwse.set_confusion_sets([set(confusion_set.split(','))])
        rwse.check_cas(cas, ts)

        # Every RWSE annotation found is recorded as a false positive.
        false_positives = cas.select(T_RWSE)
        false_positives_transformed = [
            (fp.begin, fp.end, cas.sofa_string[fp.begin:fp.end]) for fp in false_positives
        ]

        # Write one row per (sentence, contained false positive) pair; a
        # sentence with several flagged tokens is written several times.
        for sent in doc.sents:
            for fp_begin, fp_end, _ in false_positives_transformed:
                if sent.start_char <= fp_begin and fp_end <= sent.end_char:
                    print(confusion_set, sent.text, sep='\t', file=f)

        print(confusion_set, ":", len(false_positives), 'from', len(set_sentences))

accept,except : 0 from 20
advise,advice : 0 from 21
affect,effect : 0 from 31
begin,being : 0 from 213
bitch,pitch : 0 from 7
brakes,breaks : 0 from 7
burrows,borrows : 0 from 0
sight,site : 0 from 29
cords,chords : 0 from 0
d.t.u.d.c.a.a.t.RWSE(begin=14100, end=14106)
d.t.u.d.c.a.a.t.RWSE(begin=14188, end=14194)
d.t.u.d.c.a.a.t.RWSE(begin=14923, end=14929)
country,county : 3 from 100
crap,crab : 0 from 2
dessert,desert : 0 from 6
ease,easy : 0 from 30
effects,affects : 0 from 23
extend,extent : 0 from 16
feet,feat : 0 from 21
few,view : 0 from 103
d.t.u.d.c.a.a.t.RWSE(begin=119955, end=119959)
form,from : 1 from 839
forth,fourth : 0 from 33
forums,forms : 0 from 3
fund,found : 0 from 115
lead,led : 0 from 96
life,live : 0 from 147
loose,lose : 0 from 20
mad,made : 0 from 154
or,ore : 0 from 413
passed,past : 0 from 80
peace,piece : 0 from 28
plane,plain : 0 from 12
principal,principle : 0 from 13
quite,quiet : 0 from 37
raise,rise : 0 from 32
safe,save : 0 from 48
spit,split : 0 from 