In [1]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once; format_headlines and the
# debugging cell further down reuse this object.
nlp = spacy.load("en_core_web_sm")

In [10]:
def is_tok_hyph(tok):
    """Return True when the token's fine-grained tag (``tok.tag_``) is 'HYPH'."""
    hyphen_tag = 'HYPH'
    return tok.tag_ == hyphen_tag

def is_part_of_hyphenated(tok):
    """Tell whether a token stands directly next to a hyphen token.

    Args:
        tok: a spaCy token; its position ``tok.i`` and the length of
            ``tok.doc`` are used to decide which neighbours exist.

    Returns:
        True if the following or the preceding token is tagged 'HYPH'
        (see ``is_tok_hyph``), False otherwise. A token that is alone in
        its document has no neighbours, so the result is False.
    """
    # nbor() raises IndexError when asked for a neighbour outside the
    # document, so each side is looked up only when it exists. This also
    # covers the one-token document, where the token is both first and last.
    has_next = tok.i < len(tok.doc) - 1
    has_prev = tok.i > 0

    if has_next and is_tok_hyph(tok.nbor()):
        return True
    return has_prev and is_tok_hyph(tok.nbor(-1))
  
def smart_capitalize(s):
    """Upper-case the first character of ``s`` and keep the rest unchanged.

    Unlike ``str.capitalize()``, the tail is not lower-cased, so mixed-case
    words keep their inner capitals ("iPhone" -> "IPhone").

    Args:
        s: the string to capitalize; may be empty.

    Returns:
        ``s`` with its first character upper-cased; an empty string is
        returned unchanged.
    """
    # Slice instead of indexing so an empty string does not raise IndexError.
    return s[:1].upper() + s[1:]

def format_headlines(text):
    """Apply headline-style capitalization to ``text``.

    The text is parsed with the module-level ``nlp`` pipeline. A token gets
    its first character upper-cased (via ``smart_capitalize``) when at least
    one of these holds:

      * it is the first or the last token that is neither punctuation nor
        whitespace;
      * its coarse part of speech is NOUN, PROPN, PRON, NUM, VERB, AUX, ADJ
        or ADV;
      * its fine-grained tag is 'PRP$';
      * its dependency label is 'mark';
      * its text is at least four characters long;
      * it is adjacent to a hyphen token (``is_part_of_hyphenated``).

    All other tokens are copied as they are, and the whitespace that
    followed each token in the input is preserved.

    Args:
        text: the headline to format.

    Returns:
        The formatted headline as a string.
    """
    capitalized_pos = ('NOUN', 'PROPN', 'PRON', 'NUM', 'VERB', 'AUX', 'ADJ', 'ADV')
    min_long_word_len = 4

    doc = nlp(text)

    def is_word(tok):
        # Whitespace tokens (for instance the "\n" that ends every line read
        # from the corpus file) must not count as the first or last word;
        # otherwise the real last word loses its forced capitalization.
        return not (tok.is_punct or tok.is_space)

    first_word_idx = next((tok.i for tok in doc if is_word(tok)), None)
    last_word_idx = next((tok.i for tok in reversed(doc) if is_word(tok)), None)

    pieces = []
    for tok in doc:
        res_text = tok.text

        # Short-circuiting chain: cheap attribute checks first, the
        # neighbour lookup last, so it only runs when nothing else matched.
        if (tok.i == first_word_idx
                or tok.i == last_word_idx
                or tok.pos_ in capitalized_pos
                or tok.tag_ == 'PRP$'
                or tok.dep_ == 'mark'
                or len(tok.text) >= min_long_word_len
                or is_part_of_hyphenated(tok)):
            res_text = smart_capitalize(res_text)

        pieces.append(res_text + tok.whitespace_)

    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

In [13]:
# Spot check on an all-lowercase sentence that contains the short words
# "as", "i" and "do".
format_headlines("all course members like cats as i do")

'All Course Members Like Cats As I Do'

In [12]:
# Debugging aid: print, for every token of a sample headline, its text,
# coarse POS, dependency label and the text of its syntactic head.
dbg = nlp("Fear the Cliff Run/Walk")

for tok in dbg:
    fields = (tok.text, tok.pos_, tok.dep_, tok.head.text)
    print("{} ({}) - {} - {}".format(*fields))

Fear (VERB) - ROOT - Fear
the (DET) - det - Walk
Cliff (PROPN) - compound - Walk
Run (PROPN) - nmod - Walk
/ (SYM) - punct - Walk
Walk (NOUN) - dobj - Fear


In [5]:
# Read the raw headlines into a list, one entry per line of the file;
# lines keep their trailing newline characters.
with open("../../../tasks/02-structural-linguistics/data/examiner-headlines.txt") as f:
    headlines_corpora = list(f)

# Preview the first five entries.
headlines_corpora[:5]

['Halep enters Rogers Cup final in straight sets win over Errani\n',
 "The phantoms of St. Mary's\n",
 "Talladega turmoil could spell trouble for NASCAR's Chase field\n",
 'Burn those calories! Try the Very Steep Trail.\n',
 "It's the end of the world... and I feel fine\n"]

In [6]:
import random

def show_formatted_sample(n, seed=None):
    """Print ``n`` randomly chosen headlines, each followed by its formatted form.

    Args:
        n: number of headlines to draw from ``headlines_corpora`` without
            replacement. ``random.sample`` raises ValueError when ``n`` is
            larger than the corpus.
        seed: optional seed for a private random generator, so the same
            sample can be shown again on a later run. With the default
            ``None`` the module-level ``random`` generator is used.

    Returns:
        None; the headlines are printed.
    """
    rng = random if seed is None else random.Random(seed)

    for headline in rng.sample(headlines_corpora, n):
        # Assumes each headline still ends with the newline read from the
        # file: it separates the raw line from the formatted one.
        print(headline + format_headlines(headline))

# Eyeball ten randomly sampled headlines next to their formatted versions.
show_formatted_sample(10)

Police: Vampire attacks disabled St. Petersburg man
Police: Vampire Attacks Disabled St. Petersburg Man

New rule set takes center stage as Rose City and Rat City's home teams battle
New Rule Set Takes Center Stage As Rose City and Rat City's Home Teams Battle

Hidamari Sketch X SP licensed by Sentai Filmworks
Hidamari Sketch X SP Licensed by Sentai Filmworks

Oklahoma 2015 lawmaker revisting hoodie problem from 2010?
Oklahoma 2015 Lawmaker Revisting Hoodie Problem From 2010?

NEW YORK BOOK FAIR COMPETITION: All Are Welcome!
NEW YORK BOOK FAIR COMPETITION: All Are Welcome!

Randy Williams, Erick Threets, and a potential spring bullpen battle
Randy Williams, Erick Threets, and a Potential Spring Bullpen Battle

BREAKING: BP CEO sold shares of his company's stock weeks before Gulf disaster
BREAKING: BP CEO Sold Shares of His Company's Stock Weeks Before Gulf Disaster

Fishing for cancer in Baltimore
Fishing for Cancer in Baltimore

Lexus HS going the way of the dodo?
Lexus HS Going the W

In [7]:
import json

# Load the evaluation data; quality() iterates it as (input, expected) pairs.
with open('../../../tasks/02-structural-linguistics/data/headlines-test-set.json') as f:
    test_set = json.loads(f.read())

In [8]:
def quality():
    """Return the fraction of ``test_set`` pairs that ``format_headlines`` reproduces exactly.

    Each element of ``test_set`` is unpacked as (input headline, expected
    output); a pair counts only when the formatted input equals the
    expected string.
    """
    hits = sum(
        1
        for raw, target in test_set
        if format_headlines(raw) == target
    )
    return hits / len(test_set)

# Exact-match accuracy of format_headlines on the loaded test set.
quality()

0.91