In [1]:
import json
import spacy

from functional_pipeline import pipeline, tap

In [2]:
# Validation pairs: each item is [raw headline, expected formatted headline].
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
with open('../../../tasks/02-structural-linguistics/data/headlines-test-set.json', encoding='utf-8') as f:
    headlines_validation = json.load(f)

# Raw headlines, one per line, with surrounding whitespace stripped.
with open('../../../tasks/02-structural-linguistics/data/examiner-headlines.txt', encoding='utf-8') as f:
    headlines_debug = [line.strip() for line in f]

In [3]:
# Peek at the first three raw headlines loaded from the text file.
headlines_debug[:3]

['Halep enters Rogers Cup final in straight sets win over Errani',
 "The phantoms of St. Mary's",
 "Talladega turmoil could spell trouble for NASCAR's Chase field"]

In [4]:
# Peek at the first three validation pairs loaded from the JSON file.
headlines_validation[:3]

[['How To Design A College Curriculum to Help You in Life',
  'How to Design a College Curriculum to Help You in Life'],
 ['This is why you should hate Battlefield 3',
  'This Is Why You Should Hate Battlefield 3'],
 ["How to photograph tonight's Lyrid Meteor Shower",
  "How to Photograph Tonight's Lyrid Meteor Shower"]]

In [5]:
# Load the spaCy pipeline "en_core_web_md" once; apply_headline_formatting
# calls it to tokenize each headline and read per-token attributes.
nlp = spacy.load("en_core_web_md")

In [9]:
def is_upper(word):
    """Return True if ``word`` contains at least one uppercase letter.

    Despite the name, the whole word does not have to be uppercase: a single
    uppercase letter anywhere (e.g. ``'iPhone'``) is enough. Non-alphabetic
    characters are ignored, so ``'123'``, ``'-'`` and ``''`` all give False.

    Args:
        word: The string to inspect.

    Returns:
        bool: Always a real boolean (never the empty string), so the result
        is safe to compare, store or serialize as well as to test for truth.
    """
    letters = filter(str.isalpha, word)
    return any(c.isupper() for c in letters)

def is_upper_not_first(word):
    """Tell whether ``word`` has an uppercase letter after its first character.

    Words of length 0 or 1 have no "rest", so the answer is False for them.
    Otherwise the result is whatever ``is_upper`` reports for ``word[1:]``.
    """
    if len(word) <= 1:
        return False
    rest = word[1:]
    return is_upper(rest)

def apply_lowercase(index, sentence, word):
    """Lowercase ``word`` unless one of the keep-as-is conditions applies.

    Args:
        index: Position of the token in ``sentence``.
        sentence: Indexable sequence of tokens; each must expose a
            ``whitespace_`` attribute (the whitespace that follows the token).
        word: Current text of the token at ``index``.

    Returns:
        ``word`` unchanged if it is the first token, if the previous token has
        no trailing whitespace (so this token is glued to it), or if it has an
        uppercase letter after its first character; otherwise ``word.lower()``.
    """
    # The very first token is never lowercased here.
    if index == 0:
        return word

    # Glued to the previous token (e.g. the part after an apostrophe).
    previous = sentence[index-1]
    if previous.whitespace_ == '':
        return word

    # Interior capitals are preserved verbatim.
    if is_upper_not_first(word):
        return word

    return word.lower()


def apply_4char_rule(index, sentence, word):
    """Capitalize words that are at least four characters long.

    ``index`` and ``sentence`` are not used; they are accepted so the function
    has the same call shape as the other pipeline steps.

    Returns:
        ``word`` unchanged if it already contains an uppercase letter or is
        shorter than four characters; otherwise ``word.capitalize()``.
    """
    if not is_upper(word) and len(word) >= 4:
        return word.capitalize()
    return word


def apply_first_last_rule(index, first_word_index, last_word_index, word):
    """Capitalize the first and the last word of the headline.

    Args:
        index: Position of the current token.
        first_word_index: Token index regarded as the first word.
        last_word_index: Token index regarded as the last word.
        word: Current text of the token at ``index``.

    Returns:
        ``word`` unchanged if it already contains an uppercase letter or sits
        at neither boundary; otherwise ``word.capitalize()``.
    """
    if is_upper(word):
        return word

    at_boundary = index == first_word_index or index == last_word_index
    return word.capitalize() if at_boundary else word


def apply_pos_rule(index, sentence, word):
    """Capitalize ``word`` depending on the part of speech of its token.

    Args:
        index: Position of the token in ``sentence``.
        sentence: Indexable sequence of tokens exposing ``text``,
            ``whitespace_``, ``pos_`` and ``dep_``.
        word: Current text of the token at ``index``.

    Returns:
        ``word`` unchanged if it already contains an uppercase letter, if it
        is glued to the previous token, or if no rule matches; otherwise
        ``word.capitalize()``.
    """
    if is_upper(word):
        return word

    token = sentence[index]
    # Nouns, pronouns, verbs, adjectives, adverbs and subordinating conjunctions
    pos_list = ('NOUN', 'PRON', 'PROPN', 'VERB', 'ADJ', 'ADV', 'SCONJ')

    # Special case for verb shortenings: a verb right after an apostrophe
    # token, where the apostrophe is either the first token of the headline or
    # is separated from the token before it by whitespace.
    # The lower bound is ``index >= 1`` (it used to be ``index > 1``, which
    # made the ``index == 1`` alternative below unreachable).
    if (
        index >= 1
        and token.pos_ == 'VERB'
        and sentence[index-1].text == "'"
        and (index == 1 or sentence[index-2].whitespace_ != '')
    ):
        return word.capitalize()

    # A token glued to its predecessor (no whitespace in between) is left as is.
    if index != 0 and sentence[index-1].whitespace_ == '':
        return word

    if token.pos_ in pos_list:
        return word.capitalize()

    # Adpositions are capitalized only when their dependency label is 'mark'.
    if token.pos_ == 'ADP' and token.dep_ == 'mark':
        return word.capitalize()

    return word


def apply_dash_rule(index, sentence, word):
    """Capitalize a part of a hyphenated compound whose neighbour is capitalized.

    A word is capitalized when it is joined by a ``-`` token, with no
    whitespace on either side of the hyphen, to a neighbouring token that
    contains an uppercase letter. Both directions are checked: the neighbour
    may come before the hyphen or after it.

    Args:
        index: Position of the token in ``sentence``.
        sentence: Sized, indexable sequence of tokens exposing ``text`` and
            ``whitespace_``.
        word: Current text of the token at ``index``.

    Returns:
        ``word`` unchanged if it already contains an uppercase letter or no
        hyphen pattern matches; otherwise ``word.capitalize()``.
    """
    if is_upper(word):
        return word

    # Backward look: <Capitalized>-<word>. Needs two preceding tokens; the
    # bound lives inside this condition (not as an early return) so that the
    # forward look below still runs for the first two tokens of the headline.
    if (
        index >= 2
        and is_upper(sentence[index-2].text) and sentence[index-1].text == '-'
        and sentence[index-1].whitespace_ == ''
        and sentence[index-2].whitespace_ == ''
    ):
        return word.capitalize()

    # Forward look: <word>-<Capitalized>. Needs two following tokens.
    if (
        index < len(sentence) - 2 and
        is_upper(sentence[index+2].text) and sentence[index+1].text == '-'
        and sentence[index+1].whitespace_ == ''
        and sentence[index].whitespace_ == ''
    ):
        return word.capitalize()

    return word


def apply_headline_formatting(txt):
    """Return ``txt`` re-capitalized according to the headline rules.

    The text is tokenized with the module-level ``nlp`` pipeline. Every token
    is passed through the lowercase, four-character, first/last-word and
    part-of-speech rules, in that order. If the result contains a ``-`` token,
    the intermediate text is parsed again and the dash rule is applied in a
    second pass.

    Args:
        txt: The raw headline.

    Returns:
        str: The formatted headline. Original whitespace between tokens is
        preserved. Headlines without any word token (empty or punctuation
        only) are processed without a first/last word instead of raising.
    """
    sentence = nlp(txt)

    # Find first and last words: tokens whose first character is alphanumeric.
    # When there are none, both indices are None, which never equals a token
    # index, so the first/last rule simply does not fire.
    word_indices = [i for i, t in enumerate(sentence) if t.text[:1].isalnum()]
    first_word_index = word_indices[0] if word_indices else None
    last_word_index = word_indices[-1] if word_indices else None

    result = []
    has_dashes = False
    for i, token in enumerate(sentence):
        word = pipeline(
            token.text,
            [
                (apply_lowercase, i, sentence),
                (apply_4char_rule, i, sentence),
                (apply_first_last_rule, i, first_word_index, last_word_index),
                (apply_pos_rule, i, sentence),
            ]
        )
        if word == '-':
            has_dashes = True
        result.append(word + token.whitespace_)

    if has_dashes:
        # The dash rule inspects neighbouring tokens' final capitalization, so
        # it runs on a fresh parse of the text produced by the first pass.
        sentence = nlp(''.join(result))
        result = [
            apply_dash_rule(i, sentence, token.text) + token.whitespace_
            for i, token in enumerate(sentence)
        ]

    return ''.join(result)

def validate(data):
    """Run the formatter over labelled pairs and print a report.

    Args:
        data: Sized iterable of pairs, where item ``[0]`` is the raw headline
            and item ``[1]`` is the expected formatted headline.

    Prints, for every mismatch, a separator line followed by the raw headline,
    the produced headline and the expected headline; then prints a summary
    line with the number and percentage of exact matches. An empty ``data``
    reports 0.00% instead of raising ``ZeroDivisionError``.
    """
    count_valid = 0
    for experiment in data:
        source, expected = experiment[0], experiment[1]
        formatted_text = apply_headline_formatting(source)
        if formatted_text == expected:
            count_valid += 1
        else:
            print('=' * 30)
            print(source)
            print(formatted_text)
            print(expected)

    total = len(data)
    # Guard the percentage against an empty data set.
    percent = 100 * count_valid / total if total else 0.0
    print('Validation results: {0}/{1} ({2:.2f}%)'.format(count_valid, total, percent))

In [10]:
# Score the formatter against the labelled pairs; mismatches are printed as
# raw / produced / expected triples, followed by a summary line.
validate(headlines_validation)

How To Design A College Curriculum to Help You in Life
How to Design A College Curriculum to Help You in Life
How to Design a College Curriculum to Help You in Life
Brazilian & bikini waxing how to choose an sf bay area hair removal salon or spa
Brazilian & Bikini Waxing How to Choose an Sf Bay Area Hair Removal Salon or Spa
Brazilian & Bikini Waxing How to Choose an SF Bay Area Hair Removal Salon or Spa
iPhone 5 release date ahead as Nokia Windows 8 smartphones interest grows
iPhone 5 Release Date Ahead as Nokia Windows 8 Smartphones Interest Grows
iPhone 5 Release Date Ahead As Nokia Windows 8 Smartphones Interest Grows
How it all plays out on Church Street
How It all Plays out on Church Street
How It All Plays out on Church Street
Facebook post helps police nab suspect who ripped off two Burbank salons
Facebook Post Helps Police Nab Suspect Who Ripped off two Burbank Salons
Facebook Post Helps Police Nab Suspect Who Ripped off Two Burbank Salons
Hank Williams and all his rowdy frien

In [11]:
# Count the debug headlines that the formatter leaves exactly as they are.
count = sum(
    apply_headline_formatting(line) == line
    for line in headlines_debug
)

In [12]:
# Report how many debug headlines were already in the formatter's output form.
print(
    'Results: {0}/{1} ({2:.2f}%)'.format(
        count,
        len(headlines_debug),
        100 * count/len(headlines_debug),
    )
)

Results: 637/5000 (12.74%)
