In [219]:
import spacy
import spacy_transformers
from nltk.tokenize import sent_tokenize
import os
import re
import xlsxwriter
import pandas

In [220]:
# Load the spaCy pipeline once; search() and subjunctive() read this module-level `nlp`.
nlp = spacy.load("en_core_web_trf")

In [221]:
def open_file(filename):
    """Return the whole content of ``filename``, decoded as UTF-8."""
    with open(filename, encoding='utf-8') as handle:
        return handle.read()

In [222]:
def writeln(errors, filename):
    """Write a table of rows to an .xlsx workbook.

    Parameters
    ----------
    errors : sequence of sequences
        Rows to write. The first row holds the column names; every
        following row is written below it, one value per cell.
    filename : str
        Path of the workbook to create.

    An empty ``errors`` produces a workbook with a single empty worksheet
    instead of failing on the missing header row.
    """
    # The context manager closes (and thereby saves) the workbook even when
    # one of the writes raises.
    with xlsxwriter.Workbook(filename) as workbook:
        worksheet = workbook.add_worksheet()
        for row_num, row in enumerate(errors):
            for col_num, value in enumerate(row):
                worksheet.write(row_num, col_num, value)

In [223]:
def find_subj(pred):
    """Collect the subject tokens of a predicate, in sentence order.

    Parameters
    ----------
    pred : token
        The predicate. The attributes read are ``dep_``, ``lemma_``,
        ``children`` and ``head``; subjects additionally need ``text``,
        ``conjuncts`` and ``i``.

    Returns
    -------
    list
        The subject tokens, each listed once and ordered by their index
        ``i``. Empty when no subject is found.
    """
    def _subjects_of(token):
        # Direct nominal or clausal subjects attached to ``token``.
        return [child for child in token.children
                if child.dep_.startswith(('nsubj', 'csubj'))]

    # simple cases
    subjects = _subjects_of(pred)

    # A conjunct or an auxiliary borrows the subject of its head, but only
    # when it has none of its own: in 'He runs and she walks' the subject of
    # 'walks' is 'she' alone, not 'He' and 'she'.
    if not subjects and (pred.dep_ == 'conj' or pred.dep_.startswith('aux')):
        subjects += _subjects_of(pred.head)

    # handling 'there is' and 'there are' cases
    if pred.lemma_ == 'be' and any(child.text.lower() == 'there' for child in pred.children):
        subjects += [child for child in pred.children if child.dep_ == 'attr']

    # handling conjuncts: multiple subjects as in
    # 'Mother and father are key figures in a child's life'.
    # Iterate over a snapshot so that the conjuncts being appended are not
    # expanded again.
    for subject in list(subjects):
        subjects += list(subject.conjuncts)

    # The collection order may differ from sentence order and the same token
    # may have been reached twice, so keep one token per index and sort.
    by_index = {}
    for subject in subjects:
        by_index.setdefault(subject.i, subject)

    return [by_index[i] for i in sorted(by_index)]

In [224]:
def find_pred_subj(doc):
    """Pair the predicates found in ``doc`` with their subjects.

    Every token tagged as VERB or AUX is examined. The token that ends up
    representing the predicate is passed to ``find_subj`` and the pair
    ``(predicate, subjects)`` is appended to the result.

    Returns a list shaped like
    ``[(predicate1, [subject1, subject2]), (predicate2, [subject1])]``.
    """
    verbal_pos = ('VERB', 'AUX')
    clause_deps = ('ROOT', 'ccomp', 'xcomp', 'acl', 'relcl')
    pairs = []

    for tok in doc:
        if tok.pos_ not in verbal_pos:
            continue

        # Negation, as in "He doesn't scare me": the first 'aux' child of the
        # negated verb carries the agreement, so it stands in for the verb.
        support = None
        if any(left.dep_ == 'neg' for left in tok.lefts):
            for child in tok.children:
                if not support and child.dep_ == 'aux':
                    support = child
            if support:
                pairs.append((support, find_subj(support)))
                continue
        # From here on either there is no negation, or it is expressed
        # without auxiliary support ('never' etc.).

        if tok.tag_ in ('VBN', 'VBG'):
            # Analytical predicate such as 'I have done': spaCy makes the
            # participle the head, but the grammatical information sits on
            # the auxiliary, so the auxiliary is used as the predicate.
            support = None
            for child in tok.children:
                if child.pos_ not in verbal_pos:
                    continue
                if child.dep_ == 'aux' and child.tag_ != 'VBN':
                    support = child
                elif not support and child.dep_ == 'auxpass':
                    support = child
            if support:
                pairs.append((support, find_subj(support)))
        elif tok.dep_ in clause_deps or (tok.dep_ == 'conj' and tok.head.dep_ in clause_deps):
            # A clause head, or a predicate coordinated with one.
            pairs.append((tok, find_subj(tok)))

    return pairs

In [225]:
def errors(ps):
    """Return the (predicate, subjects) pairs whose grammatical number disagrees.

    Parameters
    ----------
    ps : iterable of (predicate, subjects) pairs
        A predicate token and the list of its subject tokens, shaped like
        ``[(predicate1, [subject1, subject2]), (predicate2, [subject1])]``.

    Returns
    -------
    list
        The pairs from ``ps`` for which both the number required by the
        subject(s) and the number shown by the predicate could be resolved
        and the two differ. Pairs where either side stays undecided are
        never reported.

    Reads the module-level sets ``ambiguous``, ``sing_only`` and
    ``plur_only``.
    """
    res = []

    for pair in ps:
        # What the subject requires and what the predicate shows:
        # 'sg', 'pl' or None (undecided). They must coincide in the end.
        subj_agr, pred_agr = None, None
        pred = pair[0]
        subj = pair[1]

        if len(subj) == 1:
            subject = subj[0]
            s = subject.text.lower()
            # Subjects preceded by a numeral are not checked at all.
            if any(child.pos_ == 'NUM' for child in subject.lefts):
                continue
            if s not in ambiguous:
                children = list(subject.children)
                children_text = [ch.text.lower() for ch in children]

                # singular only pronouns
                if s in sing_only:
                    subj_agr = 'sg'

                # either singular or plural pronouns:
                # with an 'of N, N, N...' after them the verb must agree
                # with the last noun of that chain
                elif s in {'some', 'any', 'none', 'all', 'most'} and 'of' in children_text:
                    of = [ch for ch in children if ch.text.lower() == 'of'][0]
                    nouns = [ch for ch in of.children if ch.pos_ == 'NOUN']
                    if nouns:
                        noun = nouns[0]
                        # Walk down the coordination chain to its last noun.
                        conj_nouns = [ch for ch in noun.children
                                      if ch.dep_ == 'conj' and ch.pos_ == 'NOUN']
                        while conj_nouns:
                            noun = conj_nouns[-1]
                            conj_nouns = [ch for ch in noun.children
                                          if ch.dep_ == 'conj' and ch.pos_ == 'NOUN']
                        if noun.tag_ in ['NNS', 'NNPS'] or noun.text.lower() in plur_only:
                            subj_agr = 'pl'
                        elif noun.tag_ in ['NN', 'NNP'] or noun.text.lower() in sing_only:
                            subj_agr = 'sg'

                # plural only pronouns
                elif s in plur_only and not children:
                    subj_agr = 'pl'

                # 'I' is grouped with the plural pronouns because it takes
                # the non-3rd-person present form ('I go'); 'was'/'were' is
                # corrected for further down.
                elif s in {'i', 'we', 'you', 'they'}:
                    subj_agr = 'pl'
                elif s in {'he', 'she', 'it'}:
                    subj_agr = 'sg'

                # 'a number of N' is plural, 'the number of N' is singular
                elif s == 'number':
                    if 'a' in children_text and 'of' in children_text:
                        subj_agr = 'pl'
                    else:
                        subj_agr = 'sg'

                # predicates in non-head clauses with 'who', 'that' agree
                # with the noun in the head clause
                elif s in ['who', 'that']:
                    if pred.dep_ == 'relcl':
                        # Why relcl? It is the only tag for which we can be
                        # sure this is the case we are looking for. Other
                        # possible predicate tags include '...comp', but
                        # these also apply in cases like "I asked the boys
                        # who was the winner", which, although with incorrect
                        # word order, are clearly present in Russian essays
                        # and will be parsed by spacy as 'ccomp'.
                        head = pred.head
                        if not head.conjuncts:
                            if head.tag_ in ['NNS', 'NNPS'] or head.text.lower() in plur_only:
                                subj_agr = 'pl'
                            elif head.tag_ in ['NN', 'NNP'] or head.text.lower() in sing_only:
                                subj_agr = 'sg'
                        else:
                            conjuncts = list(head.conjuncts) + [head]
                            for conjunct in conjuncts:
                                if 'and' in [child.text.lower() for child in conjunct.children]:
                                    subj_agr = 'pl'
                elif subject.tag_ in ['NNS', 'NNPS']:
                    subj_agr = 'pl'
                elif subject.tag_ in ['NN', 'NNP', 'VBG']:
                    subj_agr = 'sg'

        elif len(subj) > 1:
            # Words attached to the first and to the second-to-last subject;
            # the conjunction joining the subjects is looked for among them.
            linking_words = [child.text.lower()
                             for child in list(subj[0].children) + list(subj[-2].children)]

            # 'Mother, father and brother were present.'
            # If conjuncts are connected by 'and', the predicate is plural.
            # Exception: 'Every man, woman and child participates in the tournament.'
            if 'and' in linking_words:
                left_subj_children = [child.text.lower() for child in subj[0].lefts]
                left_pred_children = [child.text.lower() for child in pred.lefts]
                # Don't check: 'Jones and Sons is a respectable company.'
                if all(sub.tag_ == 'NNP' for sub in subj):
                    continue
                # Don't check: 'There is Tom and Mary as a perfect example.'
                # (past forms included, now that 'was'/'were' are evaluated)
                if pred.text.lower() in ['is', 'are', 'was', 'were'] and 'there' in left_pred_children:
                    continue
                if 'every' in left_subj_children or 'each' in left_subj_children:
                    subj_agr = 'sg'
                else:
                    subj_agr = 'pl'
            # 'Mother, father or brother comes to pick up the kid.'
            # If conjuncts are connected by 'or', the verb agrees with the last one.
            elif any(word in linking_words for word in ['or', 'nor']):
                if subj[-1].tag_ in ['NNS', 'NNPS'] or subj[-1].text.lower() in plur_only:
                    subj_agr = 'pl'
                elif subj[-1].tag_ in ['NN', 'NNP', 'VBG'] or subj[-1].text.lower() in sing_only:
                    subj_agr = 'sg'

        if pred.tag_ == 'VBZ':
            pred_agr = 'sg'
        elif pred.tag_ == 'VBP':
            pred_agr = 'pl'
        # `lemma_` is the string form; `lemma` is spaCy's integer hash and
        # never equals 'be', which used to leave this branch unreachable.
        elif pred.lemma_ == 'be':
            form = pred.text.lower()
            if form == 'was':
                pred_agr = 'sg'
            elif form == 'were':
                pred_agr = 'pl'
            # 'I' patterns with the plural in the present tense but takes
            # 'was', so for past 'be' it has to count as singular.
            # NOTE: subjunctive 'were' ('if I were you') is not recognised
            # here and will be reported.
            if len(subj) == 1 and subj[0].text.lower() == 'i' and subj_agr == 'pl':
                subj_agr = 'sg'

        if subj_agr != pred_agr and subj_agr and pred_agr:
            res += [pair]

    return res

In [34]:
def search(directory = 'test', only_errors=True): #If used with excel files, should be only used with similar format files, otherwise additional info can be messy
    final = []
    text_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames if os.path.splitext(f)[1] == '.txt']
    excel_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(directory) for f in filenames if os.path.splitext(f)[1] == '.xlsx']

    if text_files: #If there is at least one txt file, we assume that the format is default
        if only_errors:
            final.append(("Sentence", "Errors", "Filename"))
        else:
            final.append(("Sentence", "Errors", "Filename", "Has Error"))

    for file in text_files:
        text = open_file(file)
        sentences = sent_tokenize(text)
        for sent in sentences:
            sent = re.sub(r'<.*?>', '', sent)
            sent = re.sub(r' +', ' ', sent)
            doc = nlp(sent)
            ps = find_pred_subj(doc)
            # ps looks like [(predicate1, [subject1, subject2]), (predicate2, [subject1])]
            er = errors(ps)
#             er = [error for error in er if not subjunctive(sent, error[0])]
            
            if only_errors:
                if er:
                    final.append((sent,
                                  ",\n".join(["{} {}".format(*er[x]) for x in range(len(er))]), # Wrong subjs and preds,
                                 file))
            else:
                final.append((sent,
                              ", ".join(["({} {})".format(*er[x]) for x in range(len(er))]),
                              file,
                              int(len(er) != 0)))  #Whether or not errors were found

    for file in excel_files:
        df = pandas.read_excel(file)
        if not final:
            additional_columns = tuple([x for x in df.columns if x != "sentence"])
            if only_errors:
                final.append(("Sentence", "Errors", "Filename") + additional_columns)
            else:
                final.append(("Sentence", "Errors", "Filename", "Has Error") + additional_columns)

        for row_num in range(df.shape[0]):
            sent = df["sentence"].iat[row_num]
            sent = re.sub(r'<.*?>', '', sent)
            sent = re.sub(r' +', ' ', sent)
            doc = nlp(sent)
            ps = find_pred_subj(doc)
            # ps looks like [(predicate1, [subject1, subject2]), (predicate2, [subject1])]
            er = errors(ps)
#             er = [error for error in er if not subjunctive(sent, error[0])]
            
            if only_errors:
                if er:
                    final.append((sent,
                                  ",\n".join(["{} {}".format(*er[x]) for x in range(len(er))]), # Wrong subjs and preds,
                                 file,
                                 *[df[column].iat[row_num] for column in additional_columns]))
            else:
                final.append((sent,
                              ", ".join(["({} {})".format(*er[x]) for x in range(len(er))]),
                              file,
                              int(len(er) != 0), #Whether or not errors were found
                              *[df[column].iat[row_num] for column in additional_columns]))
        
    return final

In [227]:
def subjunctive(sent, pred):
    """Tell whether ``pred`` heads a 'that'-clause that calls for the subjunctive.

    ``sent`` is the sentence text and ``pred`` the predicate token under
    question. The sentence is parsed again here, and ``pred`` is matched
    against the new parse through its index ``i``. Returns True when some
    'that' is attached to ``pred``, ``pred`` is a ``ccomp`` and the word
    governing it is one of the listed verbs, nouns, adjectives or
    participles; False otherwise.
    """
    trigger_adjectives = {'advisable', 'best', 'crucial', 'desirable', 'vital',
                          'essential', 'imperative', 'important', 'moved',
                          'necessary', 'prohibited', 'unthinkable', 'urgent',
                          'required', 'stipulated', 'requested', 'recommended',
                          'advised', 'proposed', 'adamant', 'anxious',
                          'determined', 'eager', 'insistent', 'keen'}
    trigger_lemmas = {'advise', 'ask', 'command', 'demand', 'desire',
                      'insist', 'move', 'order', 'prefer', 'propose',
                      'recommend', 'request', 'stipulate', 'suggest', 'urge',
                      'motion', 'order', 'preference', 'proposal', 'recommendation',
                      'request', 'stipulation', 'suggestion'}

    # Without a 'that' there is nothing to look at, and no parse is needed.
    if 'that' not in sent.lower():
        return False

    for word in nlp(sent):
        if word.text.lower() != 'that':
            continue
        # Only a 'that' introducing the clause headed by `pred` counts.
        if word.head.i != pred.i or pred.dep_ != 'ccomp':
            continue

        governor = pred.head

        # VERB/NOUN + that + CONJUNCTIVE
        if governor.pos_ in ['VERB', 'NOUN'] and governor.lemma_ in trigger_lemmas:
            return True

        # SUBJECT + [be] ADJ/PRTC + that + CONJUNCTIVE
        be_with_trigger = (governor.lemma_ == 'be'
                           and any(dep.text.lower() in trigger_adjectives
                                   for dep in governor.children))
        trigger_itself = (governor.tag_ in ['VBN', 'JJ']
                          and governor.text in trigger_adjectives)
        if be_with_trigger or trigger_itself:
            return True

    return False

In [10]:
# Subjects listed here are left out of the agreement check in errors().
ambiguous = (
    {'bison', 'cod', 'deer', 'fish', 'moose', 'boar', 'salmon', 'sheep',
     'shrimp', 'swine', 'trout', 'buffalo', 'grouse', 'elk', 'fruit',
     'reindeer', 'offspring', 'pike'}
    | {'statistics', 'politics', 'mechanics', 'economics'}
    | {'government', 'data', 'police', 'team', 'jury', 'family'}
    | {'half', 'class', 'majority', 'part', 'percent', '%', 'cent', 'lot'}
)

# Words that errors() treats as requiring a singular predicate.
sing_only = (
    {'each', 'either', 'neither', 'one', 'other', 'this', 'which'}
    | {'nobody', 'nothing'}
    | {'anyone', 'anybody', 'anything'}
    | {'someone', 'somebody', 'something'}
    | {'everyone', 'everybody', 'everything'}
)

# Words that errors() treats as requiring a plural predicate.
plur_only = {'several', 'few', 'many', 'both', 'these', 'those'}

In [89]:
%%time
# Run the checker on the default 'test' directory, keeping only sentences with errors.
f = search()
# NOTE(review): `non_subjunctive` is not defined in any visible cell of this notebook;
# unless it is defined elsewhere this line raises NameError on a fresh kernel - confirm.
fa = non_subjunctive(f)

In [82]:
%%time
# Check everything under ../razmetka/; only_errors=False keeps error-free sentences too
# and adds the "Has Error" column.
f = search('../razmetka/', False)

CPU times: user 2min 53s, sys: 152 ms, total: 2min 53s
Wall time: 43.4 s


In [230]:
# Save the rows currently held in `f` (a search() result) to itog.xlsx.
writeln(f, "itog.xlsx")

## Test

In [43]:
def run_model(*args):
    """Run ``search`` on the directory obtained by joining ``args``.

    Error-free sentences are kept in the result (``only_errors`` is False).
    """
    target_directory = os.path.join(*args)
    return search(target_directory, False)

In [44]:
def test_model(*args, write_errs=None, print_wrong=False): # Function to test the model on a directory with a correct.txt file and a wrong.txt file
    outpus = run_model(*args)
    
    if write_errs is not None:
        writeln(output, write_errs)
    
    false_pos = 0
    true_pos = 0
    false_neg = 0
    true_neg = 0
    for x in output:
        if x[2] == os.path.join(*args, "correct.txt"):
            if x[3] == 0:
                true_neg += 1
            else:
                false_pos += 1
        else:
            if x[3] == 0:
                false_neg += 1
            else:
                true_pos += 1

    precision = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    recall = true_pos / (true_pos + false_neg)
    print("Precision = {}\nRecall = {}\n".format(round(precision, 2), round(recall, 2)))

def evaluate(level, **kwargs):
    """Run ``test_model`` on the test set stored in test/evaluate/<level>.

    ``level`` should be equal to "easy", "medium" or "insane"; any keyword
    arguments are handed on to ``test_model`` unchanged.
    """
    heading = level.capitalize()
    print(heading, "tests:")
    test_model("test", "evaluate", level, **kwargs)

In [45]:
%%time
# Evaluate on the test set in test/evaluate/easy (the path evaluate() builds from the level).
evaluate("easy", print_wrong=True)

8
Easy tests:
Precision = 0.95
Recall = 0.88

CPU times: user 3.56 s, sys: 0 ns, total: 3.56 s
Wall time: 896 ms


In [96]:
# Look up the meaning of the 'NNP' tag that errors() tests for.
spacy.explain("NNP")

'noun, proper singular'

## Example

In [None]:
# Sentences to inspect by hand.
s = ["Mother and father are important figures in a child's life."]

# Show each sentence together with the (predicate, subjects) pairs extracted from it.
for ss in s:
    doc = nlp(ss)
    print(doc)
    print(find_pred_subj(doc))
    
    
# Token table for the LAST sentence parsed above (`doc` is left over from the loop):
# index, text, POS, lemma, tag, dependency label, index of the head.
for token in doc:
    print("%s\t%s\t\t\t%s\t%s\t\t%s\t\t%s\t\t%s" % (token.i, token.text, token.pos_, token.lemma_, token.tag_, token.dep_, token.head.i))
    
# Exploration aids: the attributes of the first token, and the children of token 3.
print(dir(doc[0]))
print(' '.join(list(ch.text for ch in doc[3].children)))

Mother and father are important figures in a child's life.
[(are, [Mother, father])]
0	Mother			NOUN	mother		NN		nsubj		3
1	and			CCONJ	and		CC		cc		0
2	father			NOUN	father		NN		conj		0
3	are			AUX	be		VBP		ROOT		3
4	important			ADJ	important		JJ		amod		5
5	figures			NOUN	figure		NNS		attr		3
6	in			ADP	in		IN		prep		5
7	a			DET	a		DT		det		8
8	child			NOUN	child		NN		poss		10
9	's			PART	's		POS		case		8
10	life			NOUN	life		NN		pobj		6
11	.			PUNCT	.		.		punct		3
['_', '__bytes__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__unicode__', 'ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_ty



In [None]:
# Look up spaCy's description of the 'pobj' label seen in the token table of the example.
spacy.explain("pobj")

'object of preposition'

In [130]:
# Scratch check: sentence-split a short example with an agreement error.
st = sent_tokenize("Apples is good")