In [1]:
import json
import spacy

from functional_pipeline import pipeline

In [2]:
# Validation set: a JSON list of [input headline, expected headline] pairs.
with open('../../tasks/02-structural-linguistics/data/headlines-test-set.json') as f:
    headlines_validation = json.load(f)

# Debug set: a plain-text file with one headline per line; surrounding
# whitespace (including the newline) is stripped from every line.
with open('../../tasks/02-structural-linguistics/data/examiner-headlines.txt') as f:
    headlines_debug = list(map(str.strip, f))

In [3]:
# Preview the first three raw debug headlines.
headlines_debug[:3]

['Halep enters Rogers Cup final in straight sets win over Errani',
 "The phantoms of St. Mary's",
 "Talladega turmoil could spell trouble for NASCAR's Chase field"]

In [4]:
# Preview the first three validation pairs: [input headline, expected headline].
headlines_validation[:3]

[['How To Design A College Curriculum to Help You in Life',
  'How to Design a College Curriculum to Help You in Life'],
 ['This is why you should hate Battlefield 3',
  'This Is Why You Should Hate Battlefield 3'],
 ["How to photograph tonight's Lyrid Meteor Shower",
  "How to Photograph Tonight's Lyrid Meteor Shower"]]

In [7]:
# Load the spaCy "en_core_web_md" pipeline once; the apply_* rule functions
# call `nlp` to tokenize headlines (and, for the POS rule, to tag them).
nlp = spacy.load("en_core_web_md")

In [92]:
def capitalize(word):
    """Capitalize every hyphen-separated part of ``word``.

    Each part goes through ``str.capitalize`` (first character upper-cased,
    the rest lower-cased) and the parts are re-joined with ``'-'``, so
    ``'well-known'`` becomes ``'Well-Known'``. Trailing whitespace is kept.
    """
    parts = word.split('-')
    return '-'.join(map(str.capitalize, parts))

def is_upper(word):
    """Return ``True`` if ``word`` contains at least one uppercase letter.

    Despite the name, the word does not have to be entirely uppercase: a
    single uppercase alphabetic character is enough (``'iPhone'`` counts).
    Non-alphabetic characters are ignored.

    Always returns a ``bool``. Strings without any letters (``''``, ``'3'``,
    ``'-'``) give ``False`` rather than the empty string the previous
    implementation leaked out of its ``and`` expression.
    """
    # Generator instead of a list so `any` can stop at the first match.
    return any(c.isalpha() and c.isupper() for c in word)

def apply_lowercase(txt):
    """Lower-case the tokens of ``txt`` that are not protected.

    A token is kept verbatim when it is the first token, when the preceding
    token has no trailing whitespace (the two are glued together), or when
    ``is_upper`` reports an uppercase letter in it. Every other token is
    lower-cased together with its trailing whitespace.

    NOTE(review): because tokens containing any uppercase letter are kept,
    ``lower()`` only reaches tokens that ``is_upper`` already considers free
    of uppercase letters - confirm this is the intended behavior.
    """
    doc = nlp(txt)

    def keep_verbatim(idx, tok):
        # Same short-circuit order as the rule description above.
        return idx == 0 or doc[idx - 1].whitespace_ == '' or is_upper(tok.text)

    return ''.join(
        tok.text_with_ws if keep_verbatim(idx, tok) else tok.text_with_ws.lower()
        for idx, tok in enumerate(doc)
    )
    
def apply_4char_rule(txt):
    """Capitalize every token that is at least four characters long.

    Tokens shorter than four characters, and tokens in which ``is_upper``
    finds an uppercase letter, are passed through unchanged. The length is
    measured on the token text without its trailing whitespace.
    """
    def transform(tok):
        long_enough = len(tok.text) >= 4
        if long_enough and not is_upper(tok.text):
            return capitalize(tok.text_with_ws)
        return tok.text_with_ws

    return ''.join(transform(tok) for tok in nlp(txt))

def apply_first_last_rule(txt):
    """Capitalize the first and the last token of ``txt``.

    A token is only capitalized when ``is_upper`` finds no uppercase letter
    in it, so words that already carry capitals are left alone.

    Returns ``txt`` unchanged when tokenization yields no tokens (e.g. an
    empty headline); indexing the first/last token would otherwise raise
    ``IndexError``.
    """
    pieces = [token.text_with_ws for token in nlp(txt)]
    if not pieces:
        return txt
    # Index 0 first, then -1: for a single-token headline the second pass
    # sees the already-capitalized token and skips it.
    for idx in (0, -1):
        if not is_upper(pieces[idx]):
            pieces[idx] = capitalize(pieces[idx])
    return ''.join(pieces)

def apply_pos_rule(txt):
    """Capitalize tokens whose part-of-speech tag is in the headline list.

    A token is capitalized when its ``pos_`` is one of the listed tags and
    ``is_upper`` finds no uppercase letter in it. Tokens glued to the
    previous token (the previous token has no trailing whitespace) are never
    touched; the first token has no predecessor and is always eligible.
    """
    # nouns, pronouns, verbs, adjectives, adverbs and subordinating
    # conjunctions (proper nouns, PROPN, are listed as well)
    pos_list = ['NOUN', 'PRON', 'PROPN', 'VERB', 'ADJ', 'ADV', 'SCONJ']
    doc = nlp(txt)
    pieces = []
    for idx, tok in enumerate(doc):
        piece = tok.text_with_ws
        glued_to_previous = idx != 0 and doc[idx - 1].whitespace_ == ''
        if not glued_to_previous and tok.pos_ in pos_list and not is_upper(tok.text):
            piece = capitalize(piece)
        pieces.append(piece)
    return ''.join(pieces)

def apply_dash_rule(txt):
    """Capitalize the token that follows a capitalized ``word-`` prefix.

    The token at position ``i >= 2`` is capitalized when the token two
    positions back contains an uppercase letter (per ``is_upper``), the token
    directly before it is exactly ``'-'``, and neither of those two tokens
    has trailing whitespace - i.e. the three tokens form one hyphenated word.

    NOTE(review): unlike the other rules, the token is passed to
    ``capitalize`` even when it already contains uppercase letters, which
    lower-cases everything after its first character - confirm intended.
    """
    doc = nlp(txt)

    def follows_capitalized_prefix(idx):
        if idx < 2:
            return False
        head, dash = doc[idx - 2], doc[idx - 1]
        return (
            is_upper(head.text) and dash.text == '-'
            and dash.whitespace_ == ''
            and head.whitespace_ == ''
        )

    return ''.join(
        capitalize(tok.text_with_ws) if follows_capitalized_prefix(idx) else tok.text_with_ws
        for idx, tok in enumerate(doc)
    )

def apply_headline_formatting(txt):
    """Format ``txt`` as a headline by running it through all rule functions.

    The rules are handed to ``functional_pipeline.pipeline`` in the order
    listed below (presumably applied left to right, each on the previous
    rule's output - see the functional_pipeline documentation).
    """
    rules = [
        apply_lowercase,
        apply_4char_rule,
        apply_first_last_rule,
        apply_pos_rule,
        apply_dash_rule,
    ]
    return pipeline(txt, rules)

def validate(data):
    """Print how many headlines are formatted exactly as expected.

    Args:
        data: sized collection of pairs where item ``[0]`` is the input
            headline and item ``[1]`` is the expected formatted headline.

    Prints a summary line ``Validation results: <valid>/<total> (<pct>%)``
    and returns ``None``. An empty ``data`` reports ``0/0 (0.00%)`` instead
    of raising ``ZeroDivisionError``.
    """
    total = len(data)
    # Each exact match contributes True (== 1) to the sum.
    count_valid = sum(
        apply_headline_formatting(experiment[0]) == experiment[1]
        for experiment in data
    )
    # Guard the percentage against an empty data set.
    percent = 100 * count_valid / total if total else 0.0

    print('Validation results: {0}/{1} ({2:.2f}%)'.format(count_valid, total, percent))

In [93]:
# Score the formatter against the labelled validation pairs.
validate(headlines_validation)

Validation results: 84/100 (84.00%)


In [94]:
# Count the debug headlines that the formatter returns unchanged.
count = 0
for line in headlines_debug:
    formatted_text = apply_headline_formatting(line)
    # A match adds 1, a mismatch adds 0.
    count += int(formatted_text == line)


NameError: name 'count_valid' is not defined

In [95]:
# Report how many debug headlines the formatter left unchanged, as a count and a percentage.
print('Results: {0}/{1} ({2:.2f}%)'.format(count, len(headlines_debug), 100 * count/len(headlines_debug)))

Results: 746/5000 (14.92%)
