### RWSE-Checker: true-positives statistics from filtered corpus

In [3]:
from rwse import RWSE_Checker

In [4]:
# Location of the confusion-set definitions used for the corpus filtering below.
CONFUSION_SETS_PATH = '../data/confusion_sets_modified.csv'

# Create the checker and point it at the confusion sets stored in the CSV file.
rwse = RWSE_Checker()
rwse.set_confusion_sets(CONFUSION_SETS_PATH)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


#### Read from corpus and collect sentences by confusion sets

In [5]:
from util import collect_sentences_by_confusion_sets

# Read the raw corpus lines. The encoding is given explicitly so that the
# result does not depend on the platform's default locale encoding.
with open('../data/eng_news_2023_10K-sentences.txt', 'r', encoding='utf-8') as f:
    sentences = f.readlines()

# Each line is tab-separated and the sentence text is the second field.
# Lines without a tab (e.g. a blank trailing line) are skipped instead of
# aborting the whole cell with an IndexError.
sentences_cleaned = [
    fields[1].strip()
    for fields in (line.split('\t') for line in sentences)
    if len(fields) > 1
]

# Group the corpus sentences by confusion set.
# NOTE(review): the grouping criterion lives in util.collect_sentences_by_confusion_sets;
# presumably a sentence is assigned to every set it contains a word of - confirm there.
sentences_by_confusion_sets = collect_sentences_by_confusion_sets(rwse.confusion_sets.values(), sentences_cleaned)

# Print the number of collected sentences per confusion set and the grand total.
total = 0

for key, value in sentences_by_confusion_sets.items():
    total += len(value)
    print(key, '=', len(value))

print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 31
begin,being = 213
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
sight,site = 29
cords,chords = 0
country,county = 100
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 21
few,view = 103
form,from = 839
forth,fourth = 33
forums,forms = 3
fund,found = 115
lead,led = 96
life,live = 147
loose,lose = 20
mad,made = 154
or,ore = 413
passed,past = 80
peace,piece = 28
plane,plain = 12
principal,principle = 13
quite,quiet = 37
raise,rise = 32
safe,save = 48
spit,split = 9
than,then = 419
their,there,they = 1421
theme,them = 246
things,thinks = 69
trail,trial = 27
tree,three = 174
two,too,to = 4563
weak,week = 121
weather,whether = 56
weed,wheat = 1
where,were = 598
which,witch = 424
whole,hole = 33
with,width = 1383
world,word = 119
you,your = 698
total = 13060


#### Switch confusion words in sentences

In [6]:
from util import replace_confusion_set_words_in_sentences

# Build the modified sentences from the sentences collected per confusion set.
# NOTE(review): the per-set counts printed below can exceed the counts of the
# input sentences, so the helper presumably emits more than one variant for
# some sentences - confirm in util.
modified_sentences_by_confusion_sets = replace_confusion_set_words_in_sentences(
    sentences_by_confusion_sets
)

# Report how many modified sentences exist per confusion set, plus the sum.
total = 0
for confusion_set_key, modified in modified_sentences_by_confusion_sets.items():
    count = len(modified)
    print(confusion_set_key, '=', count)
    total += count

print('total =', total)

accept,except = 20
advise,advice = 21
affect,effect = 32
begin,being = 230
bitch,pitch = 7
brakes,breaks = 7
burrows,borrows = 0
sight,site = 29
cords,chords = 0
country,county = 104
crap,crab = 2
dessert,desert = 6
ease,easy = 30
effects,affects = 23
extend,extent = 16
feet,feat = 22
few,view = 104
form,from = 898
forth,fourth = 34
forums,forms = 3
fund,found = 118
lead,led = 100
life,live = 153
loose,lose = 20
mad,made = 156
or,ore = 443
passed,past = 80
peace,piece = 28
plane,plain = 13
principal,principle = 13
quite,quiet = 39
raise,rise = 32
safe,save = 50
spit,split = 9
than,then = 435
their,there,they = 3730
theme,them = 259
things,thinks = 69
trail,trial = 27
tree,three = 176
two,too,to = 12638
weak,week = 128
weather,whether = 56
weed,wheat = 1
where,were = 671
which,witch = 437
whole,hole = 33
with,width = 1491
world,word = 122
you,your = 1149
total = 24264


#### Determine RWSEs

In [14]:
from cassis import Cas, load_typesystem
import spacy

# Fully qualified names of the annotation types used in this notebook.
T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'

# Load the type system definition and resolve the sentence and token types
# that are instantiated for every CAS in the evaluation loop.
path = '../data/TypeSystem.xml'
with open(path, 'rb') as f:
    ts = load_typesystem(f)

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

# spaCy pipeline; the evaluation loop uses it to tokenize each sentence.
nlp = spacy.load('en_core_web_sm')

# Per-confusion-set statistics, filled in by the evaluation loop.
result = {}

In [15]:
# Run the checker on every modified sentence, one confusion set at a time, and
# count the sentences for which at least one RWSE annotation is found.
# Sentences without any RWSE annotation are logged to MISSED_PATH as
# "<confusion set>\t<sentence>".
MISSED_PATH = '../experiments/data/true_positives_missed.csv'

result = dict()

# The log is opened once and truncated. Re-opening it in append mode for every
# miss made each re-run of this cell add duplicate rows to the file.
with open(MISSED_PATH, 'w', encoding='utf-8') as missed_file:
    # NOTE: `sentences` rebinds the name that held the raw corpus lines earlier
    # in the notebook; the name is kept unchanged here on purpose.
    for confusion_set, sentences in modified_sentences_by_confusion_sets.items():
        # Restrict the checker to the confusion set under test. This replaces
        # the sets loaded from the CSV file, so `rwse` is left configured with
        # only the last confusion set once the loop has finished.
        rwse.set_confusion_sets([set(confusion_set.split(','))])
        matches = 0
        for sentence in sentences:
            cas = Cas(ts)
            # TODO clean sentence?
            cas.sofa_string = sentence
            doc = nlp(cas.sofa_string)
            # One sentence annotation spanning the whole text.
            cas_sentence = S(begin=0, end=len(sentence))
            cas.add(cas_sentence)
            # One token annotation per spaCy token, using character offsets.
            for token in doc:
                cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
                cas.add(cas_token)
            # check_cas is expected to add RWSE annotations to the CAS - see rwse module.
            rwse.check_cas(cas, ts)
            true_positives = cas.select(T_RWSE)
            if len(true_positives) == 0:
                print(confusion_set, cas.sofa_string, sep='\t', file=missed_file)
            else:
                matches += 1
        result[confusion_set] = {
            'num_sentences':len(sentences),
            'num_matches':matches,
        }

#### Determine true-positive rate

In [16]:
# Sum the per-confusion-set counters into corpus-wide totals.
per_set_stats = list(result.values())
total = sum(stats['num_sentences'] for stats in per_set_stats)
total_matches = sum(stats['num_matches'] for stats in per_set_stats)

# Share of modified sentences in which the checker reported at least one RWSE.
print(f'true positive rate: {total_matches/total:.2f}')
print(f'correctly identified {total_matches} out of {total}')

true positive rate: 0.99
correctly identified 23917 out of 24264


In [17]:
# Show only the confusion sets for which the number of matched sentences
# differs from the number of sentences checked.
for confusion_set_key, stats in result.items():
    if stats['num_matches'] == stats['num_sentences']:
        continue
    print(confusion_set_key, stats, sep='\t')

accept,except	{'num_sentences': 20, 'num_matches': 19}
affect,effect	{'num_sentences': 32, 'num_matches': 30}
sight,site	{'num_sentences': 29, 'num_matches': 28}
country,county	{'num_sentences': 104, 'num_matches': 90}
crap,crab	{'num_sentences': 2, 'num_matches': 1}
ease,easy	{'num_sentences': 30, 'num_matches': 29}
feet,feat	{'num_sentences': 22, 'num_matches': 21}
form,from	{'num_sentences': 898, 'num_matches': 893}
fund,found	{'num_sentences': 118, 'num_matches': 116}
lead,led	{'num_sentences': 100, 'num_matches': 96}
life,live	{'num_sentences': 153, 'num_matches': 147}
loose,lose	{'num_sentences': 20, 'num_matches': 19}
passed,past	{'num_sentences': 80, 'num_matches': 79}
peace,piece	{'num_sentences': 28, 'num_matches': 27}
plane,plain	{'num_sentences': 13, 'num_matches': 12}
principal,principle	{'num_sentences': 13, 'num_matches': 12}
raise,rise	{'num_sentences': 32, 'num_matches': 27}
safe,save	{'num_sentences': 50, 'num_matches': 49}
than,then	{'num_sentences': 435, 'num_matche