### RWSE-Checker: true-positives statistics from filtered corpus

In [1]:
from rwse import RWSE_Checker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the checker and point it at the project's (modified) confusion-set list.
confusion_sets_csv = '../data/confusion_sets_modified.csv'

rwse = RWSE_Checker()
rwse.set_confusion_sets(confusion_sets_csv)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


#### Read from corpus and collect sentences by confusion sets

In [3]:
from util import collect_sentences_by_confusion_sets

# Load the raw corpus lines (tab-separated; only the second field is used below).
with open('../data/eng_news_2023_10K-sentences.txt', 'r') as corpus_file:
    sentences = corpus_file.readlines()

# Keep the second tab-separated field of every line, stripped of surrounding whitespace.
sentences_cleaned = []
for line in sentences:
    fields = line.split('\t')
    sentences_cleaned.append(fields[1].strip())

# Group the cleaned sentences under the confusion sets known to the checker.
sentences_by_confusion_sets = collect_sentences_by_confusion_sets(
    rwse.confusion_sets.values(),
    sentences_cleaned,
)

# Report the number of collected sentences per confusion set, then the grand total.
for name, collected in sentences_by_confusion_sets.items():
    print(name, '=', len(collected))

total = sum(len(collected) for collected in sentences_by_confusion_sets.values())
print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 31
begin,being = 213
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
sight,site = 29
cords,chords = 0
country,county = 100
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 21
few,view = 103
form,from = 839
forth,fourth = 33
forums,forms = 3
fund,found = 115
lead,led = 96
life,live = 147
loose,lose = 20
mad,made = 154
or,ore = 413
passed,past = 80
peace,piece = 28
plane,plain = 12
principal,principle = 13
quite,quiet = 37
raise,rise = 32
safe,save = 48
spit,split = 9
than,then = 419
their,there,they = 1421
theme,them = 246
things,thinks = 69
trail,trial = 27
tree,three = 174
two,too,to = 4563
weak,week = 121
weather,whether = 56
weed,wheat = 1
where,were = 598
which,witch = 424
whole,hole = 33
with,width = 1383
world,word = 119
you,your = 698
total = 13060


#### Switch confusion words in sentences

In [4]:
from util import replace_confusion_set_words_in_sentences

# Produce the sentences with confusion-set words switched
# (presumably one variant per alternative word -- see util for the exact rule).
modified_sentences_by_confusion_sets = replace_confusion_set_words_in_sentences(
    sentences_by_confusion_sets
)

# Report the number of modified sentences per confusion set, then the grand total.
for name, variants in modified_sentences_by_confusion_sets.items():
    print(name, '=', len(variants))

total = sum(len(variants) for variants in modified_sentences_by_confusion_sets.values())
print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 32
begin,being = 230
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
sight,site = 29
cords,chords = 0
country,county = 104
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 22
few,view = 104
form,from = 898
forth,fourth = 34
forums,forms = 3
fund,found = 118
lead,led = 100
life,live = 153
loose,lose = 20
mad,made = 156
or,ore = 443
passed,past = 80
peace,piece = 28
plane,plain = 13
principal,principle = 13
quite,quiet = 39
raise,rise = 32
safe,save = 50
spit,split = 9
than,then = 435
their,there,they = 3730
theme,them = 259
things,thinks = 69
trail,trial = 27
tree,three = 176
two,too,to = 12638
weak,week = 128
weather,whether = 56
weed,wheat = 1
where,were = 671
which,witch = 437
whole,hole = 33
with,width = 1491
world,word = 122
you,your = 1149
total = 24264


#### Determine RWSEs

In [5]:
from cassis import Cas, load_typesystem
import spacy

T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

# Confusion sets with more sentences than this are skipped in this cell (with a
# warning) and have to be processed separately in smaller batches.
MAX_SENTENCES_PER_CAS = 6000

# Tab-separated log of every detected RWSE: confusion set, "found->suggestion", sentence.
TRUE_POSITIVES_PATH = '../experiments/data/true_positives.csv'

nlp = spacy.load('en_core_web_sm')

path = '../data/TypeSystem.xml'

with open(path, 'rb') as f:
    ts = load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

# Per confusion set: number of modified sentences and number of RWSE annotations found.
result = dict()

# `result` is reset above, so reset the output file as well. The loop below only
# appends; without this truncation every re-run of the cell would duplicate rows.
with open(TRUE_POSITIVES_PATH, 'w'):
    pass

for confusion_set, sentences in modified_sentences_by_confusion_sets.items():
    if len(sentences) > MAX_SENTENCES_PER_CAS:
        print('WARNING: too many sentences:', len(sentences), 'for', confusion_set)
        continue

    # One CAS per confusion set: all of its sentences joined into a single document.
    cas = Cas(ts)
    cas.sofa_string = ' '.join(sentences)
    doc = nlp(cas.sofa_string)

    # Mirror spaCy's sentence and token segmentation as CAS annotations.
    for sent in doc.sents:
        cas_sentence = S(begin=sent.start_char, end=sent.end_char)
        cas.add(cas_sentence)
    for token in doc:
        cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
        cas.add(cas_token)

    # Restrict the checker to the current confusion set, run it, then read back
    # the RWSE annotations found in the CAS.
    rwse.set_confusion_sets([set(confusion_set.split(','))])
    rwse.check_cas(cas, ts)
    true_positives = cas.select(T_RWSE)

    # (begin, end, covered text, suggestion) for every RWSE annotation.
    true_positive_rows = [
        (annotation.begin, annotation.end, cas.sofa_string[annotation.begin:annotation.end], annotation.suggestion)
        for annotation in true_positives
    ]

    with open(TRUE_POSITIVES_PATH, 'a') as f:
        for sent in doc.sents:
            for begin, end, covered_text, suggestion in true_positive_rows:
                # Log the annotation together with the sentence that contains it.
                if sent.start_char <= begin and sent.end_char >= end:
                    print(confusion_set, f'{covered_text}->{suggestion}', sent.text, sep='\t', file=f)

    print(confusion_set, ":", len(true_positives), 'from', len(sentences))

    diff = len(sentences)-len(true_positives)
    if diff > 0:
        print(f"WARNING: {diff} missed matches for {confusion_set}")
    elif diff < 0:
        # More annotations than sentences: not a miss, so report it as a surplus
        # instead of printing a negative "missed" count.
        print(f"WARNING: {-diff} more matches than sentences for {confusion_set}")

    result[confusion_set] = {
        'num_sentences': len(sentences),
        'num_matches': len(true_positives)
    }

accept,except : 19 from 20
advise,advice : 21 from 21
affect,effect : 30 from 32
begin,being : 230 from 230
bitch,pitch : 7 from 7
brakes,breaks : 7 from 7
burrows,borrows : 0 from 0
sight,site : 28 from 29
cords,chords : 0 from 0
country,county : 93 from 104
crap,crab : 1 from 2
dessert,desert : 6 from 6
ease,easy : 29 from 30
effects,affects : 23 from 23
extend,extent : 16 from 16
feet,feat : 21 from 22
few,view : 104 from 104
form,from : 893 from 898
forth,fourth : 34 from 34
forums,forms : 3 from 3
fund,found : 116 from 118
lead,led : 96 from 100
life,live : 147 from 153
loose,lose : 19 from 20
mad,made : 156 from 156
or,ore : 444 from 443
passed,past : 79 from 80
peace,piece : 27 from 28
plane,plain : 12 from 13
principal,principle : 12 from 13
quite,quiet : 39 from 39
raise,rise : 27 from 32
safe,save : 52 from 50
spit,split : 9 from 9
than,then : 426 from 435
their,there,they : 3621 from 3730
theme,them : 257 from 259
things,thinks : 68 from 69
trail,trial : 24 from 27
tree,thre

#### Separate computation of confusion set two,too,to

In [12]:
# This confusion set is processed on its own, in batches, rather than as one document.
confusion_set = 'two,too,to'
org_sentences = modified_sentences_by_confusion_sets[confusion_set]

# Maximum number of sentences put into a single CAS.
CHUNK_SIZE = 6500
chunks = [
    org_sentences[start:start + CHUNK_SIZE]
    for start in range(0, len(org_sentences), CHUNK_SIZE)
]

# Totals over all chunks. Storing per-chunk counts directly in `result` would let
# the last chunk overwrite the earlier ones and skew the overall rate.
num_sentences_total = 0
num_matches_total = 0

for sentences in chunks:
    cas = Cas(ts)
    cas.sofa_string = ' '.join(sentences)
    doc = nlp(cas.sofa_string)

    # Mirror spaCy's sentence and token segmentation as CAS annotations.
    for sent in doc.sents:
        cas_sentence = S(begin=sent.start_char, end=sent.end_char)
        cas.add(cas_sentence)
    for token in doc:
        cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
        cas.add(cas_token)

    # Restrict the checker to this confusion set, run it, then read back the
    # RWSE annotations found in the CAS.
    rwse.set_confusion_sets([set(confusion_set.split(','))])
    rwse.check_cas(cas, ts)
    true_positives = cas.select(T_RWSE)

    # (begin, end, covered text, suggestion) for every RWSE annotation.
    true_positive_rows = [
        (annotation.begin, annotation.end, cas.sofa_string[annotation.begin:annotation.end], annotation.suggestion)
        for annotation in true_positives
    ]

    # NOTE: append mode -- re-running this cell adds the same rows to the file again.
    with open('../experiments/data/true_positives.csv', 'a') as f:
        for sent in doc.sents:
            for begin, end, covered_text, suggestion in true_positive_rows:
                # Log the annotation together with the sentence that contains it.
                if sent.start_char <= begin and sent.end_char >= end:
                    print(confusion_set, f'{covered_text}->{suggestion}', sent.text, sep='\t', file=f)

    print(confusion_set, ":", len(true_positives), 'from', len(sentences))

    diff = len(sentences)-len(true_positives)
    if diff > 0:
        print(f"WARNING: {diff} missed matches for {confusion_set}")
    elif diff < 0:
        # More annotations than sentences: report a surplus, not a negative miss count.
        print(f"WARNING: {-diff} more matches than sentences for {confusion_set}")

    num_sentences_total += len(sentences)
    num_matches_total += len(true_positives)

# Store the counts for the whole confusion set once, after all chunks are done.
result[confusion_set] = {
    'num_sentences': num_sentences_total,
    'num_matches': num_matches_total
}

two,too,to : 6490 from 6500
two,too,to : 6130 from 6138


In [11]:
# Aggregate sentence and match counts over every confusion set recorded in `result`.
total, total_matches = 0, 0
for counts in result.values():
    total += counts['num_sentences']
    total_matches += counts['num_matches']

# Share of modified sentences for which the checker reported an RWSE.
true_positive_rate = total_matches / total
print(f'true positive rate: {true_positive_rate:.2f}')

true positive rate: 0.97
