# Analyze Educational Corpus

In [77]:
from rwse_checker.rwse import RWSE_Checker
from cas_visualizer.visualizer import SpacySpanVisualiser
from IPython.display import display, HTML

import cassis
import os
import pandas as pd
import spacy

### Analyze FCE dataset

In [2]:
# Path of the TSV report. Later cells skip the RWSE check and the report
# generation when this file already exists, so delete it to force a re-run.
report_file_name = 'output/report_use_case_fce.tsv'

In [3]:
# Load the confusion sets: one comma-separated set per line.
#   confusion_set_dict  maps every word (original casing) to the full line of
#                       its confusion set, used as a label in the report.
#   confusion_set_words is the set of all words, lower-cased.
confusion_set_file = 'input/confusion_sets_modified.csv'
confusion_set_dict = dict()
confusion_set_words = set()
with open(confusion_set_file, 'r') as f:
    for line in f:
        entry = line.strip()
        if not entry:
            # A blank line would otherwise register '' as a confusion word,
            # which matches any sentence that contains consecutive spaces.
            continue
        words = entry.split(',')
        for word in words:
            confusion_set_dict[word] = entry
        confusion_set_words.update(word.lower() for word in words)

In [4]:
# Convert the token-per-line FCE TSV into one whitespace-joined sentence per
# line. The result is cached: nothing happens when `file_name` already exists.
file_name = 'input/fce_cleaned.txt'

if not os.path.exists(file_name):
    # Both handles are managed by `with`, so neither leaks if an error occurs.
    with open('input/fce-public.train.original.tsv') as source, open(file_name, 'w') as f:
        # The first line is skipped, as before.
        # NOTE(review): confirm the TSV really starts with a header row;
        # otherwise the first token of the corpus is dropped here.
        next(source, None)
        sentence = ''
        for line in source:
            if line.strip() == '':
                # A blank line terminates the current sentence.
                print(sentence.strip(), file=f, end='\n')
                sentence = ''
            else:
                # Only the first column (the token) is kept.
                sentence += line.split('\t')[0] + ' '
        # Flush the last sentence when the file does not end with a blank line.
        if sentence.strip():
            print(sentence.strip(), file=f, end='\n')

In [5]:
# Build a CAS holding every cleaned FCE sentence that contains a confusion-set
# word, then add spaCy sentence and token annotations to it.
ts_file = 'input/TypeSystem.xml'
with open(ts_file, 'rb') as f:
    ts = cassis.load_typesystem(f)


T_SENTENCE = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'

S = ts.get_type(T_SENTENCE)
T = ts.get_type(T_TOKEN)

cas = cassis.Cas(ts)

input_file = 'input/fce_cleaned.txt'
with open(input_file, 'r') as f:
    # A line is kept when at least one of its space-separated tokens is in
    # confusion_set_words. The disjointness test selects exactly the same
    # lines as scanning every confusion word against a re-split line, in a
    # single pass per line.
    # NOTE(review): the match is case-sensitive against the lower-cased word
    # set, and the last token of each line still carries its newline, so
    # capitalised or line-final confusion words do not select a sentence.
    # Confirm this is intended before changing it; the recorded outputs below
    # depend on the current selection.
    cas_input = ''.join(
        line + ' '
        for line in f
        if not confusion_set_words.isdisjoint(line.split(' '))
    )
    cas.sofa_string = cas_input

nlp = spacy.load("en_core_web_sm")
# spaCy rejects texts longer than nlp.max_length. Derive the limit from the
# actual text instead of a hard-coded constant so a larger corpus still runs.
nlp.max_length = max(nlp.max_length, len(cas.sofa_string))
doc = nlp(cas.sofa_string)
for sent in doc.sents:
    cas_sentence = S(begin=sent.start_char, end=sent.end_char)
    cas.add(cas_sentence)
for token in doc:
    # token.idx is the character offset of the token in the document text.
    cas_token = T(begin=token.idx, end=token.idx+len(token.text), id=token.i)
    cas.add(cas_token)

len(cas.sofa_string)

1752223

In [6]:
# Number of sentences spaCy found in the filtered corpus.
sum(1 for _ in doc.sents)

18984

In [7]:
# Skip the checker when a report file already exists; the report cell below
# uses the same guard.
if not os.path.exists(report_file_name):
    rwse = RWSE_Checker()
    rwse.set_confusion_sets(confusion_set_file)
    # Presumably adds RWSE annotations to `cas` (the report cell selects them
    # from it afterwards) -- verify against the rwse_checker package.
    rwse.check_cas(cas, ts)

### Report RWSE

In [12]:
# Write one TSV row per RWSE annotation: the confusion set of the suggestion,
# the flagged word, the suggested word, and the covering sentence with the
# flagged word wrapped in [brackets].
T_RWSE = 'de.tudarmstadt.ukp.dkpro.core.api.anomaly.type.RWSE'

if not os.path.exists(report_file_name):
    # The sentences do not change while reporting, so select them once.
    sentences = list(cas.select(T_SENTENCE))

    # Collect every row before touching the file: if anything fails, no
    # truncated report is left behind for the existence guard to mistake for
    # a complete one.
    rows = []
    for annotation in cas.select(T_RWSE):
        suggestion = annotation.suggestion
        target = annotation.get_covered_text()
        confusion_set = confusion_set_dict[suggestion]
        begin = annotation.begin
        end = annotation.end
        # Reset per annotation so a missing covering sentence yields an empty
        # field instead of repeating the previous annotation's sentence.
        sentence_text = ''
        for sentence in sentences:
            if sentence.begin <= begin and sentence.end >= end:
                covered = sentence.get_covered_text()
                # Annotation offsets are document-based; shift them to
                # sentence-local offsets before slicing.
                sentence_text = covered[:(begin-sentence.begin)] + f'[{target}]' + covered[(end-sentence.begin):]
                # Newlines and tabs would break the one-row-per-line layout.
                sentence_text = sentence_text.replace('\n', ' ')
                sentence_text = sentence_text.replace('\t', ' ')
                break
        rows.append((confusion_set, target, suggestion, sentence_text))

    # pandas reads the report back as UTF-8, so write it as UTF-8 explicitly
    # rather than relying on the platform default encoding.
    with open(report_file_name, 'w', encoding='utf-8') as f:
        print('confusion_set', 'target', 'suggestion', 'sentence', sep='\t', end='\n', file=f)
        for row in rows:
            print(*row, sep='\t', end='\n', file=f)

In [15]:
# Load the tab-separated report (read_table uses '\t' as its default separator).
data = pd.read_table(report_file_name)
data.head()

Unnamed: 0,confusion_set,target,suggestion,sentence
0,"To,Too,Two,to,too,two",to,too,"And at the end , my opinion is that one weeken..."
1,"Effect,affect,effect",effect,affect,"Shopping does not just [effect] us mentally , ..."
2,"Plain,plain,plane",plane,plain,"If we came back to the past , we felt everythi..."
3,"Country,County,country,county",county,country,I used to be play in school team when I was in...
4,"Their,There,They,their,there,they",their,there,so I had an argument with the people Who worke...


In [16]:
# How many reported RWSE fall into each confusion set.
data['confusion_set'].value_counts()

confusion_set
You,Your,you,your                    94
To,Too,Two,to,too,two                48
Their,There,They,their,there,they    47
Than,Then,than,then                  32
Life,Live,life,live                  24
Were,Where,were,where                19
Quiet,Quite,quiet,quite              14
Hole,Whole,hole,whole                12
advice,advise                         8
Things,things,thinks                  8
loose,lose                            6
Word,World,word,world                 6
Country,County,country,county         6
Which,Witch,which,witch               5
Mad,Made,mad,made                     4
Effects,affects,effects               4
Weather,Whether,weather,whether       4
Peace,Piece,peace,piece               3
Except,accept,except                  3
Form,From,form,from                   3
Week,weak,week                        3
Past,passed,past                      2
Them,Theme,them,theme                 2
Effect,affect,effect                  2
Plain,plain,plane         

### Limitation of Confusion Set Approach

In [1]:
from transformers import pipeline

# Score only the members of the Then/Than confusion set at the masked position.
pipe = pipeline("fill-mask", model="bert-base-cased", device=-1)
masked_sentence = "[MASK] you for reading."
results = pipe(masked_sentence, targets=["Then", "Than", "then", "than"])
# Scores noted by the author (the "Thanks" score is not produced by this cell,
# because `targets` restricts the candidates to the confusion set):
#   0.99769                 Thanks (best prediction overall)
#   3.5702942113857716e-07  Then   (best prediction from confusion set)
#   2.8134087060038837e-08  Than   (original token)
results

  from .autonotebook import tqdm as notebook_tqdm
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model

[{'score': 3.5702942113857716e-07,
  'token': 1599,
  'token_str': 'Then',
  'sequence': 'Then you for reading.'},
 {'score': 2.8134087060038837e-08,
  'token': 16062,
  'token_str': 'Than',
  'sequence': 'Than you for reading.'},
 {'score': 1.3301545642718793e-08,
  'token': 1190,
  'token_str': 'than',
  'sequence': 'than you for reading.'},
 {'score': 9.532203470996592e-09,
  'token': 1173,
  'token_str': 'then',
  'sequence': 'then you for reading.'}]

### Mocked Visualization

In [75]:
# Build a small hand-annotated CAS to mock how detected RWSE and their
# corrections would be highlighted.
# Note: this rebinds `ts` and `cas`, replacing the corpus objects built earlier.
ts_file = 'input/TypeSystem.xml'
with open(ts_file, 'rb') as f:
    ts = cassis.load_typesystem(f)

cas = cassis.Cas(ts)
cas.sofa_string = "My advise advice for you is: Do not put to too much subjects, just put a few subject and make them look interesting."

T_TOKEN = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token'
RWSE = ts.get_type(T_RWSE)
ts.create_type('CORR')
CORR = ts.get_type('CORR')


def add_first_occurrence(cas, annotation_type, token):
    """Annotate the first occurrence of `token` in the CAS text.

    The lookup is a plain substring search, so `token` must be chosen such
    that its first occurrence is the intended one.

    Raises:
        ValueError: if `token` does not occur in the text. Without this check
            str.find's -1 would silently produce a bogus span.
    """
    begin = cas.sofa_string.find(token)
    if begin == -1:
        raise ValueError(f'{token!r} does not occur in the CAS text')
    cas.add(annotation_type(begin=begin, end=begin + len(token)))


# (annotation type, surface form): the misused word, then its correction.
for annotation_type, token in [
    (RWSE, 'to'),
    (CORR, 'too'),
    (RWSE, 'advise'),
    (CORR, 'advice'),
]:
    add_first_occurrence(cas, annotation_type, token)

spacy_span_vis = SpacySpanVisualiser(cas, [])

spacy_span_vis.set_selected_annotations_to_types({
    'RWSE': RWSE,
    'CORR': CORR,
})

spacy_span_vis.set_annotations_to_colors({
    'RWSE': 'palegreen',#'#0DF5B3',
    'CORR': 'limegreen',#'#078A65',
})
spacy_span_vis.set_span_type(SpacySpanVisualiser.SPAN_STYLE_HIGHLIGHTING)
spacy_span_vis.set_allow_highlighting_overlap(True)
html = spacy_span_vis.visualise()
display(HTML(html))