# Analyze News Corpus

In [3]:
from rwse_checker.rwse import RWSE_Checker

import os
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Shared checker instance; the analysis cells below reconfigure its confusion
# sets per input line and call `rwse.check(...)` on each masked sentence.
rwse = RWSE_Checker()
# spaCy pipeline loaded by package name.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

# False-Alarm Rate

#### Determine RWSEs

In [5]:
# Cached per-confusion-set report and the tab-separated input corpus
# (columns per line: confusion set, target word, masked sentence).
report_file_name = 'output/report_false_positives.csv'
input_file_name = 'input/eng_news_2023_10K-masked-sentences.csv'

# Maps the comma-joined confusion set to its counters:
# {'num_sentences': sentences checked, 'num_matches': suggestions that differ from the target}.
result = dict()

if not os.path.exists(report_file_name):
    # No cached report: run the checker over the corpus. Every suggestion that
    # differs from the target word is counted and logged to false_positives.csv.
    with open('output/false_positives.csv', 'w') as false_positives_file:
        # Context manager guarantees the corpus file is closed, even on error.
        with open(input_file_name, 'r') as input_file:
            next(input_file, None)  # skip header
            for line in input_file:
                line = line.strip()
                if not line:  # tolerate blank lines instead of failing on unpacking
                    continue
                confusion_set, target, sentence = line.split('\t')
                # Restrict the checker to the confusion set of the current line.
                rwse.set_confusion_sets([set(confusion_set.split(','))])
                suggestion, certainty = rwse.check(target, sentence)

                counts = result.setdefault(confusion_set, {'num_sentences':0, 'num_matches':0})
                counts['num_sentences'] += 1
                if suggestion.lower() != target.lower(): # no case discrimination in RWSE result
                    counts['num_matches'] += 1
                    print(f'{target} => {suggestion}' ,f'({certainty:.5f})' , sentence, sep='\t', file=false_positives_file)

    # Persist the counters so later runs can skip the expensive checking step.
    with open(report_file_name, 'w') as report_file:
        print('confusion_set', 'num_matches', 'num_sentences', sep=';', file=report_file)
        for key in sorted(result):
            print(key, result[key]['num_matches'], result[key]['num_sentences'], sep=';', file=report_file)
else:
    # Cached report exists: rebuild `result` from it instead of re-running the checker.
    with open(report_file_name, 'r') as report_file:
        next(report_file, None)  # skip header
        for line in report_file:
            line = line.strip()
            if not line:
                continue
            confusion_set, num_matches, num_sentences = line.split(';')
            result[confusion_set] = {'num_matches': int(num_matches), 'num_sentences': int(num_sentences)}


#### Determine false-alarm rate

In [6]:
# Fold the per-confusion-set counters into corpus-wide totals.
total = 0
total_matches = 0
for counters in result.values():
    total += counters['num_sentences']
    total_matches += counters['num_matches']

# Share of checked sentences for which the checker proposed a different word.
print(f'false-alarm rate: {total_matches/total:.3f}')
print(f'falsely identified {total_matches} out of {total}')

false-alarm rate: 0.001
falsely identified 13 out of 15960


# Miss Rate

#### Determine RWSEs

In [11]:
# Cached per-confusion-set report and the tab-separated input corpus
# (columns per line: confusion set, expected word, masked sentence).
report_file_name = 'output/report_true_positives.csv'
input_file_name = 'input/eng_news_2023_10K-masked-sentences.csv'

# Maps the comma-joined confusion set to its counters:
# {'num_sentences': wrong-word checks performed, 'num_matches': checks that recovered the expected word}.
result = dict()

if not os.path.exists(report_file_name):

    header = ['confusion_set', 'expected', 'target', 'suggestion', 'masked_sentence']

    # One `with` owns all three files so they are closed even if the checker raises.
    with open('output/false_negatives.csv', 'w') as file_false_negatives, \
            open('output/misclassified_positives.csv', 'w') as file_misclassified_positives, \
            open(input_file_name, 'r') as input_file:
        print(*header, sep='\t', file=file_false_negatives)
        print(*header, sep='\t', file=file_misclassified_positives)

        next(input_file, None)  # skip header
        for line in input_file:
            line = line.strip()
            if not line:  # tolerate blank lines instead of failing on unpacking
                continue
            confusion_set, expected, sentence = line.split('\t')
            candidates = confusion_set.split(',')
            # Restrict the checker to the confusion set of the current line.
            rwse.set_confusion_sets([set(candidates)])
            counts = result.setdefault(confusion_set, {'num_sentences':0, 'num_matches':0})
            for target in candidates:
                # NOTE(review): this comparison is case-sensitive while the suggestion
                # checks below are not — confirm `expected` always uses the same casing
                # as the confusion-set entries.
                if target != expected: # analyze mistakes only
                    suggestion, certainty = rwse.check(target, sentence)
                    counts['num_sentences'] += 1

                    if suggestion.lower() == expected.lower(): # no case discrimination in RWSE result, true positive
                        counts['num_matches'] += 1
                    elif suggestion.lower() == target.lower(): # false negative
                        print(confusion_set, expected, target, suggestion, sentence, sep='\t', file=file_false_negatives)
                    else: # complete misclassification
                        print(confusion_set, expected, target, suggestion, sentence, sep='\t', file=file_misclassified_positives)

    # Persist the counters so later runs can skip the expensive checking step.
    with open(report_file_name, 'w') as report_file:
        print('confusion_set', 'num_matches', 'num_sentences', sep=';', file=report_file)
        for key in sorted(result):
            print(key, result[key]['num_matches'], result[key]['num_sentences'], sep=';', file=report_file)
else:
    # Cached report exists: rebuild `result` from it instead of re-running the checker.
    with open(report_file_name, 'r') as report_file:
        next(report_file, None)  # skip header
        for line in report_file:
            line = line.strip()
            if not line:
                continue
            confusion_set, num_matches, num_sentences = line.split(';')
            result[confusion_set] = {'num_matches': int(num_matches), 'num_sentences': int(num_sentences)}

#### Determine miss rate

In [9]:
# Fold the per-confusion-set counters into corpus-wide totals.
total = 0
total_matches = 0
for counters in result.values():
    total += counters['num_sentences']
    total_matches += counters['num_matches']

# A "match" is a wrong word that was corrected to the expected one; the rest were missed.
print(f'miss rate: {(1 - total_matches/total):.3f}')
print(f'missed {total - total_matches} out of {total}')

miss rate: 0.005
missed 312 out of 60632


# Total Results

In [1]:
from helper import calculate_accuracy, calculate_miss_rate, calculate_false_alarm_rate

# Paths of the cached reports produced by the false-alarm and miss-rate cells.
file_false_positives = 'output/report_false_positives.csv'
file_true_positives = 'output/report_true_positives.csv'

# The `helper` functions receive only the report paths.
# NOTE(review): presumably they re-read the ';'-separated reports and aggregate the
# counters the same way as the cells above — confirm against helper.py.
far = calculate_false_alarm_rate(file_false_positives)
mr = calculate_miss_rate(file_true_positives)
acc = calculate_accuracy(file_false_positives, file_true_positives)

# Trailing comments record the values printed by the saved run of this cell.
print(f"false alarm rate: {far:.3f}")  # expected output: false alarm rate: 0.001
print(f"miss rate: {mr:.3f}")  # expected output: miss rate: 0.005
print(f"accuracy: {acc:.3f}")  # expected output: accuracy: 0.996

  from .autonotebook import tqdm as notebook_tqdm


false alarm rate: 0.001
miss rate: 0.005
accuracy: 0.996
