# Analyze English WIKIPEDIA Corpus

In [3]:
from rwse_checker.rwse import RWSE_Checker
from transformers import AutoTokenizer
from helper import check_token

import os
import pandas as pd
import spacy

In [4]:
# Shared objects for the rest of the notebook:
# - `rwse` is the checker queried in the analysis section (set_confusion_sets / check).
# - `nlp` is the spaCy English pipeline used during cleaning to split texts into sentences.
rwse = RWSE_Checker()
nlp = spacy.load('en_core_web_sm')

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architect

## Cleaning

In [5]:
# Parse the raw dataset into {record number: list of stripped field lines}.
# Records are separated by a line that is exactly "\n"; every other line is one
# field of the current record.
collection = dict()
with open('input/dataset_semantic_en_simple.txt', 'r') as f:
    result = []
    for line in f:
        if line == "\n":
            # Only store non-empty records, so repeated separator lines do not
            # create empty entries that would fail on field access later.
            if result:
                collection[len(collection)] = result
                result = []
        else:
            result.append(line.strip())
    # Flush the final record when the file does not end with a separator line;
    # otherwise the last record would be silently lost.
    if result:
        collection[len(collection)] = result

len(collection)

448

In [6]:
# Peek at the first parsed record to see the field layout. The cleaning step
# reads field 1 as the original index, field 2 as the token, field 3 as the
# suggested replacement, field 4 as the integer token start, and joins
# fields 5+ into the text; field 0 is not used in this notebook.
collection[0]

['27',
 '1298315',
 'goals',
 'jails',
 '223',
 'Often they were sent away to the British Colonies in America. But in 1770, the colonies in America became the United States. They were free from British rule and would not take England s convicts any more. By the 1780s the goals of England were so full that convicts were often chained up in rotting old ships.The Government decided to make a settlement in New South Wales and send some of the convicts there. In 1788 the First Fleet of eleven ships set sail from Portsmouth carrying convicts, sailors, marines, a few free settlers and enough food to last for two years. Their leader was Captain Arthur Phillip.']

In [7]:
# Cache path for the cleaned dataset (written tab-separated despite the .csv
# extension). When this file already exists the cleaning step is skipped and
# the file is loaded instead.
file_name = 'input/cleaned_dataset_semantic_en.csv'

def clean_text(text):
    """Normalise the spacing around '.' and '?' in ``text``.

    For each of the two marks, the text is split on the mark, every piece is
    stripped of surrounding whitespace, and the pieces are re-joined with the
    same mark followed by a single space. This puts exactly one space after
    each mark (e.g. ``"ships.The"`` -> ``"ships. The"``) and removes
    whitespace before it. The result is stripped at both ends.

    Previously the '?' pass re-joined the pieces with '. ', which turned every
    question mark into a period; each mark is now preserved.

    :param text: raw text to normalise.
    :return: the normalised text.
    """
    for mark in ('.', '?'):
        text = f'{mark} '.join([part.strip() for part in text.split(mark)]).strip()
    return text

def find_sentence(token_text, token_start, text):
    """Return the sentence containing the target token, with the token masked.

    ``token_start`` is a character offset into the *raw* ``text``, but the
    sentence boundaries come from spaCy run on ``clean_text(text)``, whose
    offsets differ whenever cleaning inserts or removes whitespace. The offset
    is therefore translated into the cleaned text before use.

    The translation relies on this property of ``clean_text``: when the token
    has no surrounding whitespace and contains neither '.' nor '?', cleaning
    the raw text up to the end of the token yields a prefix of the fully
    cleaned text that ends with the token. If the raw text does not hold
    ``token_text`` at ``token_start`` (or the token cannot be mapped), the raw
    offset is used unchanged, as before.

    :param token_text: surface form of the token to mask.
    :param token_start: character offset of the token in the raw ``text``.
    :param text: raw text that contains the token.
    :return: the sentence text with the token replaced by ``[MASK]``, or
        ``None`` if the sentence at the offset contains no token equal to
        ``token_text``.
    """
    token_end = token_start + len(token_text)
    offset_is_mappable = (
        token_start >= 0
        and token_text != ''
        and token_text == token_text.strip()
        and '.' not in token_text
        and '?' not in token_text
        and text[token_start:token_end] == token_text
    )
    if offset_is_mappable:
        cleaned_start = len(clean_text(text[:token_end])) - len(token_text)
    else:
        cleaned_start = token_start

    tmp_doc = nlp(clean_text(text))
    for sent in tmp_doc.sents:
        if sent.start_char <= cleaned_start < sent.end_char:
            matches = [tok for tok in sent if tok.text == token_text]
            if matches:
                # Prefer the token sitting exactly at the translated offset so a
                # repeated word earlier in the sentence is not masked instead;
                # fall back to the first same-text token otherwise.
                target = next((tok for tok in matches if tok.idx == cleaned_start), matches[0])
                tmp_start = target.idx - sent.start_char
                tmp_end = tmp_start + len(target.text)
                return sent.text[:tmp_start] + '[MASK]' + sent.text[tmp_end:]
    return None

# Build the cleaned dataset once and cache it at `file_name`; later runs load the cache.
if not os.path.exists(file_name):

    checkpoint = 'bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    cleaned_records = []
    bad_words = []    # tokens/suggestions for which check_token returned None
    bad_indices = []  # original indices whose masked sentence could not be built
    for value in collection.values():
        org_index = int(value[1])
        org_token, org_suggestion = value[2], value[3]

        # Check both words first so the spaCy-based sentence lookup is only
        # run for records that can still be kept.
        if check_token(tokenizer, org_token) is None:
            bad_words.append(org_token)
            continue
        if check_token(tokenizer, org_suggestion) is None:
            bad_words.append(org_suggestion)
            continue

        masked_sentence = find_sentence(org_token, int(value[4]), ' '.join(value[5:]))
        if masked_sentence is None:
            bad_indices.append(org_index)
            continue

        cleaned_records.append({
            'org_index': org_index,
            'org_token': org_token,
            'org_suggestion': org_suggestion,
            'confusion_set': ','.join(sorted([org_token, org_suggestion])),
            'text': masked_sentence,
        })

    # Explicit columns keep the frame (and the CSV header) well-formed even
    # when no record survives the filters.
    cleaned_collection_semantic = pd.DataFrame(
        cleaned_records,
        columns=['org_index', 'org_token', 'org_suggestion', 'confusion_set', 'text'],
    )

    # For confusion sets that occur exactly twice, drop the earlier row.
    # NOTE(review): sets occurring three or more times are left untouched, as
    # in the original logic - confirm this is intended.
    set_sizes = cleaned_collection_semantic['confusion_set'].map(
        cleaned_collection_semantic['confusion_set'].value_counts()
    )
    first_of_set = ~cleaned_collection_semantic.duplicated('confusion_set', keep='first')
    # Reset the index so a fresh build matches what the cached-read path returns.
    cleaned_collection_semantic = cleaned_collection_semantic[
        ~((set_sizes == 2) & first_of_set)
    ].reset_index(drop=True)

    cleaned_collection_semantic.to_csv(file_name, index=False, sep='\t')
    pd.DataFrame(bad_words).to_csv(file_name+'-bad_words', index=False, header=None, sep='\t')
    pd.DataFrame(bad_indices).to_csv(file_name+'-bad_indices', index=False, header=None, sep='\t')
else:
    cleaned_collection_semantic = pd.read_csv(file_name, sep='\t')

cleaned_collection_semantic

Unnamed: 0,org_index,org_token,org_suggestion,confusion_set,text
0,208822,were,wear,"wear,were","When dancing ballet, you are required to [MASK..."
1,524374,affect,effect,"affect,effect",The Atlantic has a large [MASK] in the north a...
2,1137524,website,websites,"website,websites",Google is one of the biggest and most famous [...
3,16804,work,word,"word,work","Its name comes from the Greek [MASK] for Sun, ..."
4,321273,main,major,"main,major","In many countries throughout history, religion..."
...,...,...,...,...,...
101,1002353,easy,east,"east,easy",The [MASK] mainly spoke the Greek language.
102,1150628,honey,money,"honey,money",People who had bad things happen to them or wh...
103,502090,park,part,"park,part",The electrode is made of the same kind of meta...
104,1109051,saxophone,saxophonist,"saxophone,saxophonist","Charlie Parker born August 29, 1920 in Kansas ..."


## Analysis

In [8]:
file_name = 'output/report_dataset_semantic_en.csv'
input_file_name = 'input/modified_dataset_semantic_en.csv'

# Classify every sentence twice and cache the outcome as a tab-separated report.
# Rows are collected in memory and written in one step, so an exception
# mid-loop cannot leave a truncated report that later runs would treat as
# complete because the file exists.
if not os.path.exists(file_name):
    data = pd.read_csv(input_file_name, sep='\t')
    report_rows = []
    for _index, item in data.iterrows():
        org_token = item['org_token']
        org_suggestion = item['org_suggestion']
        sentence = item['text']

        rwse.set_confusion_sets([item['confusion_set'].split(',')])

        # Forward pass (miss-rate analysis): check the original token and
        # count a hit when the checker proposes the recorded suggestion.
        suggestion, _certainty = rwse.check(org_token, sentence)
        result_fw = 'TP' if suggestion == org_suggestion else 'FN'

        # Backward pass (false-alarm analysis): check the suggestion instead
        # and count a false alarm when the checker proposes the original token.
        suggestion, _certainty = rwse.check(org_suggestion, sentence)
        result_bw = 'FP' if suggestion == org_token else 'TN'

        report_rows.append({
            'result_fw': result_fw,
            'result_bw': result_bw,
            'index': item['org_index'],
            'org_token': org_token,
            'org_suggestion': org_suggestion,
            'masked_sentence': sentence,
        })

    report = pd.DataFrame(
        report_rows,
        columns=['result_fw', 'result_bw', 'index', 'org_token', 'org_suggestion', 'masked_sentence'],
    )
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    # to_csv quotes fields containing tabs, quotes or newlines, so the report
    # round-trips through read_csv below.
    report.to_csv(file_name, sep='\t', index=False)

classification_results = pd.read_csv(file_name, sep='\t')
classification_results

Unnamed: 0,result_fw,result_bw,index,org_token,org_suggestion,masked_sentence
0,TP,TN,208822,were,wear,"When dancing ballet, you are required to [MASK..."
1,TP,TN,524374,affect,effect,The Atlantic has a large [MASK] in the north a...
2,TP,TN,16804,work,word,"Its name comes from the Greek [MASK] for Sun, ..."
3,FN,TN,341582,bitch,pitch,There are four things which music often has Mu...
4,FN,TN,1250703,weed,wheat,One person who owned many cows could trade wit...
5,TP,TN,343890,art,part,Leaving Canada Quebec was [MASK] of New France...
6,FN,TN,656168,mile,milk,But still keeping the chocloate [MASK] away.
7,TP,TN,708056,crack,back,They also have a dark line down their [MASK].
8,TP,TN,103691,seal,meal,"There is also informal education, for example,..."
9,TP,TN,350785,monkeys,monks,History The first people who lived on Iceland ...


In [9]:
# Tally the labels: the forward pass yields TP/FN, the backward pass yields TN/FP.
forward_labels = classification_results['result_fw']
backward_labels = classification_results['result_bw']
summarized_results = {
    'TP': int((forward_labels == 'TP').sum()),
    'FN': int((forward_labels == 'FN').sum()),
    'TN': int((backward_labels == 'TN').sum()),
    'FP': int((backward_labels == 'FP').sum()),
}
print(summarized_results)

# Each row contributes one forward and one backward decision, so the row count
# is the denominator for both rates and twice that for accuracy.
n_sentences = len(classification_results)

false_alarm_rate = summarized_results['FP'] / n_sentences
print(f'false-alarm rate: {false_alarm_rate:.3f}')

miss_rate = summarized_results['FN'] / n_sentences
print(f'miss rate: {miss_rate:.3f}')

correct_decisions = summarized_results['TP'] + summarized_results['TN']
accuracy = correct_decisions / (n_sentences * 2)
print(f'accuracy: {accuracy:.3f}')

{'TP': 41, 'FN': 8, 'TN': 45, 'FP': 4}
false-alarm rate: 0.082
miss rate: 0.163
accuracy: 0.878
