### RWSE-Checker: false-positives (false alarm) statistics from filtered corpus

In [6]:
from rwse import RWSE_Checker

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
rwse = RWSE_Checker()
rwse.set_confusion_sets('input/confusion_sets_modified.csv')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


#### Read from corpus and collect sentences by confusion sets

In [9]:
from util import collect_sentences_by_confusion_sets

with open('input/eng_news_2023_10K-sentences.txt', 'r') as f:
    sentences = f.readlines()

sentences_cleaned = [sentence.split('\t')[1].strip() for sentence in sentences]

sentences_by_confusion_sets = collect_sentences_by_confusion_sets(rwse.confusion_sets.values(), sentences_cleaned)

total = 0

for key, value in sorted(sentences_by_confusion_sets.items()):
    total += len(value)
    print(key, '=', len(value))

print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 31
begin,being = 213
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
cords,chords = 0
country,county = 100
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 21
few,view = 103
form,from = 839
forth,fourth = 33
forums,forms = 3
fund,found = 115
lead,led = 96
life,live = 147
loose,lose = 20
mad,made = 154
or,ore = 413
passed,past = 80
peace,piece = 28
plane,plain = 12
principal,principle = 13
quite,quiet = 37
raise,rise = 32
safe,save = 48
sight,site = 29
spit,split = 9
than,then = 419
their,there,they = 1421
theme,them = 246
things,thinks = 69
trail,trial = 27
tree,three = 174
two,too,to = 4563
weak,week = 121
weather,whether = 56
weed,wheat = 1
where,were = 598
which,witch = 424
whole,hole = 33
with,width = 1383
world,word = 119
you,your = 698
total = 13060


#### Determine RWSEs

In [10]:
from cassis import Cas, load_typesystem
import spacy

T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

nlp = spacy.load('en_core_web_sm')

path = 'TypeSystem.xml'

with open(path, 'rb') as f:
    ts = load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

result = dict()

In [11]:
for confusion_set, sentences in sentences_by_confusion_sets.items():
    matches = 0
    rwse.set_confusion_sets([set(confusion_set.split(','))])
    for sentence in sentences:
        cas = Cas(ts)
        # TODO clean sentence?
        cas.sofa_string = sentence
        doc = nlp(cas.sofa_string)
        cas_sentence = S(begin=0, end=len(sentence))
        cas.add(cas_sentence)
        for token in doc:
            cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
            cas.add(cas_token)
        rwse.check_cas(cas, ts)
        false_positives = cas.select(T_RWSE)
        if len(false_positives) != 0:
            matches += 1
            with open('output/false_positives.csv', 'a') as f:
                print(confusion_set, cas.sofa_string, sep='\t', file=f)
    result[confusion_set] = {
        'num_sentences':len(sentences),
        'num_matches':matches,
    }

KeyboardInterrupt: 

#### Determine false-positive rate

In [10]:
total = sum(item['num_sentences'] for item in result.values())
total_matches = sum(item['num_matches'] for item in result.values())
print(f'false positive rate: {total_matches/total:.2f}')
print(f'falsely identified {total_matches} out of {total}')

false positive rate: 0.00
falsely identified 13 out of 13060


In [9]:
for key, value in result.items():
    if value['num_matches'] > 0:
        print(key, value, sep='\t')

country,county	{'num_sentences': 100, 'num_matches': 3}
form,from	{'num_sentences': 839, 'num_matches': 1}
life,live	{'num_sentences': 147, 'num_matches': 1}
their,there,they	{'num_sentences': 1421, 'num_matches': 3}
theme,them	{'num_sentences': 246, 'num_matches': 1}
two,too,to	{'num_sentences': 4563, 'num_matches': 1}
whole,hole	{'num_sentences': 33, 'num_matches': 1}
you,your	{'num_sentences': 698, 'num_matches': 2}


In [9]:
with open('output/report_false_positives.csv', 'a') as f:
    print('confusion_set', 'num_matches', 'num_sentences', sep=';', file=f)
    for key, value in result.items():
        print(key, value['num_matches'], value['num_sentences'], sep=';', file=f)
        print(key, value['num_matches'], value['num_sentences'], sep='\t')

accept,except	0	20
advise,advice	0	21
affect,effect	0	31
begin,being	0	213
bitch,pitch	0	7
brakes,breaks	0	7
burrows,borrows	0	0
sight,site	0	29
cords,chords	0	0
country,county	3	100
crap,crab	0	2
dessert,desert	0	6
ease,easy	0	30
effects,affects	0	23
extend,extent	0	16
feet,feat	0	21
few,view	0	103
form,from	1	839
forth,fourth	0	33
forums,forms	0	3
fund,found	0	115
lead,led	0	96
life,live	1	147
loose,lose	0	20
mad,made	0	154
or,ore	0	413
passed,past	0	80
peace,piece	0	28
plane,plain	0	12
principal,principle	0	13
quite,quiet	0	37
raise,rise	0	32
safe,save	0	48
spit,split	0	9
than,then	0	419
their,there,they	3	1421
theme,them	1	246
things,thinks	0	69
trail,trial	0	27
tree,three	0	174
two,too,to	1	4563
weak,week	0	121
weather,whether	0	56
weed,wheat	0	1
where,were	0	598
which,witch	0	424
whole,hole	1	33
with,width	0	1383
world,word	0	119
you,your	2	698
