In [1]:
import en_core_web_md
import json

In [2]:
nlp = en_core_web_md.load()

In [3]:
with open('../../../../tasks/02-structural-linguistics/data/headlines-test-set.json', 'r') as f:
    data = json.load(f)
    
with open('../../../../tasks/02-structural-linguistics/data/examiner-headlines.txt', 'r') as f:
    corpus = f.readlines()

In [4]:
def headline(doc):
    res = ''

    valid_pos = ['NOUN', 'VERB', 'AUX', 'PRON', 'ADJ', 'ADV', 'SCONJ']
    invalid_pos = ['DET', 'CONJ', 'CCONJ', 'PART', 'INTJ', 'ADP']
    
    sent_len = len(doc) - 1

    for token in doc:
        prev_token = doc[token.i - 1]
        next_token = doc[token.i + 1] if sent_len > token.i else None
        
        is_part_of_hyphened = prev_token.text == '-' and not prev_token.whitespace_ \
            or next_token and next_token.text == '-' and not next_token.whitespace_
        is_start_of_quote = prev_token.is_quote and not prev_token.whitespace_
        is_last = token.i == sent_len or token.i == sent_len - 1 and doc[sent_len].is_punct
        is_adp = token.pos_ == 'SCONJ' and \
            any(c.pos_ == 'NOUN' or c.pos_ == 'ADP' or c.pos_ == 'PROPN' for c in token.children)
        is_pron = token.lemma_ == '-PRON-' or token.pos_ == 'DET' and token.head.pos_ == 'PRON'
        is_det_title = token.pos_ == 'DET' and prev_token.pos_ == 'PUNCT'
        is_propn_to_capitalize = len(token.text) <= 3 and token.pos_ == 'PROPN' and token.is_lower
        is_propn_to_skip = token.pos_ == 'PROPN' and not token.is_lower
        is_neg_adv = token.text.lower() == 'not' and \
            (token.head.pos_ == 'AUX' or token.head.pos_ == 'VERB' or token.head.pos_ == 'ADJ')

        should_capitalize = (
                len(token.text) > 3
                or ((token.is_sent_start or is_last) \
                    and not token.is_punct and not token.is_quote
                ) or is_part_of_hyphened \
                or is_start_of_quote \
                or token.pos_ in valid_pos \
                or is_pron or is_det_title \
                or is_propn_to_capitalize \
                or is_neg_adv
            ) and not token.is_upper and not (len(token.text) <=3 and is_adp) and not is_propn_to_skip

        if should_capitalize:
            res += token.text.capitalize()
        elif token.pos_ in invalid_pos:
            res += token.text.lower()
        else:
            res += token.text
        res += token.whitespace_

    return res

In [13]:
def accuracy(data):
    ok = 0

    for inp, exp in data:
        doc = nlp(inp)
        if (headline(doc) == exp):
            ok += 1

    return ok/len(data)

In [11]:
def test_corpus(corpus):
    ok = 0
    for line in corpus:
        formatted = headline(nlp(line))
        if formatted == line:
            ok += 1
    return ok, ok/len(corpus)

In [336]:
import random

def debug(corpus, n):
    random.shuffle(corpus)
    for sample in corpus[:n]:
        print(sample)
        print(headline(nlp(sample)))

In [14]:
accuracy(data)

0.97

In [None]:
debug(corpus, 10)

In [12]:
test_corpus(corpus)

(640, 0.128)