In [None]:
import spacy
# Shared spaCy pipeline used by formatting_headlines(). The named-entity
# recognizer is disabled; the code visible in this file only reads POS tags,
# fine-grained tags, dependency labels and token shapes.
nlp = spacy.load("en_core_web_md",  disable=["ner"]) 

def last_word(doc):
    """Find the last non-punctuation token of ``doc`` and its headline form.

    Tokens are scanned from the end of ``doc`` back to index 0 (inclusive);
    the first token whose ``pos_`` is not ``"PUNCT"`` is selected.

    Args:
        doc: A sized, integer-indexable sequence of tokens (a spaCy ``Doc``
            in this file) whose items expose ``pos_``, ``is_lower`` and
            ``text``.

    Returns:
        A ``(word, token)`` tuple. ``word`` is the token text, title-cased
        when the token is entirely lowercase and returned unchanged
        otherwise; ``token`` is the selected token object.

    Raises:
        ValueError: If ``doc`` is empty or holds only punctuation tokens.
    """
    # The stop value is -1 so that index 0 is inspected too; a headline whose
    # only word is its first token must still yield a result.
    for index in range(len(doc) - 1, -1, -1):
        token = doc[index]
        if token.pos_ != "PUNCT":
            word = token.text.title() if token.is_lower else token.text
            return word, token
    raise ValueError("doc contains no non-punctuation token")

def first_word(doc):
    """Find the first non-punctuation token of ``doc`` and its headline form.

    Args:
        doc: An iterable of tokens (a spaCy ``Doc`` in this file) whose items
            expose ``pos_``, ``is_lower``, ``shape_`` and ``text``.

    Returns:
        A ``(word, token)`` tuple. ``word`` is the token text, title-cased
        when the token is entirely lowercase and its shape is not ``"dxx"``,
        and returned unchanged otherwise; ``token`` is the selected token
        object.

    Raises:
        ValueError: If ``doc`` is empty or holds only punctuation tokens.
    """
    for token in doc:
        if token.pos_ != "PUNCT":
            # Shape "dxx" is a digit followed by two lowercase letters
            # (e.g. "1st"); str.title() would turn that into "1St", so such
            # tokens are left as written.
            if token.is_lower and token.shape_ != "dxx":
                return token.text.title(), token
            return token.text, token
    raise ValueError("doc contains no non-punctuation token")


def formatting_headlines(headline):
    """Return ``headline`` re-cased according to this file's headline rules.

    The text is parsed with the module-level ``nlp`` pipeline and every token
    is emitted followed by its original trailing whitespace. Casing rules, in
    priority order:

    1. The first and last non-punctuation tokens use the forms computed by
       ``first_word`` / ``last_word``.
    2. Tokens longer than three characters are title-cased when entirely
       lowercase, otherwise kept as written.
    3. Short nouns, adjectives, proper nouns, possessive pronouns, modals and
       appositions are title-cased when entirely lowercase, otherwise kept.
    4. Short adverbs, verbs and pronouns (except clitic shapes ``'x`` and
       ``x'x``), tokens directly after a ``:``, adpositions whose dependency
       label is not one of prep/agent/aux/ROOT/pcomp, and tokens adjacent to
       a hyphen are title-cased.
    5. Everything else is lowercased.

    Args:
        headline: The raw headline string.

    Returns:
        The re-cased headline. A headline with no non-punctuation token
        (including the empty string) is returned as parsed, without changes.
    """
    doc = nlp(headline)

    # With no word-like token there is no first/last word to look up and
    # nothing to re-case; bail out before the helpers are asked for one.
    if all(token.pos_ == "PUNCT" for token in doc):
        return doc.text

    last_w, last_token = last_word(doc)
    first_w, first_token = first_word(doc)
    final_index = len(doc) - 1

    pieces = []
    for i, token in enumerate(doc):
        text = token.text
        if i == first_token.i:
            word = first_w
        elif i == last_token.i:
            word = last_w
        elif len(token) > 3:
            word = text.title() if token.is_lower else text
        elif (
            token.pos_ in ("NOUN", "ADJ", "PROPN")
            or token.tag_ in ("PRP$", "MD")
            or token.dep_ == "appos"
        ):
            word = text.title() if token.is_lower else text
        elif token.pos_ in ("ADV", "VERB", "PRON") and token.shape_ not in ("'x", "x'x"):
            word = text.title()
        # The look-behind checks require i > 0: doc[-1] would otherwise wrap
        # around and inspect the *last* token of the headline.
        elif i > 0 and doc[i - 1].shape_ == ":":
            word = text.title()
        elif token.pos_ == "ADP" and token.dep_ not in ("prep", "agent", "aux", "ROOT", "pcomp"):
            word = text.title()
        elif i > 0 and doc[i - 1].tag_ == "HYPH":
            word = text.title()
        elif i < final_index and doc[i + 1].tag_ == "HYPH":
            word = text.title()
        else:
            word = text.lower()
        # whitespace_ restores the exact spacing of the input.
        pieces.append(word + token.whitespace_)
    return "".join(pieces)

In [None]:
import json
import os

# Build the data directory from the current working directory.
# '__file__' is a quoted string here, not the module attribute, so abspath()
# resolves it against the working directory and appends a '__file__' component.
script_path = os.path.abspath('__file__')
path_list = script_path.split(os.sep)
# Everything except that trailing '__file__' component.
script_directory = path_list[:-1]
rel_path = "tasks/02-structural-linguistics/data/"
# Only the first four components of the working directory are kept as the
# root; the data directory is then addressed relative to that root.
PATH = "{}/{}".format("/".join(script_directory[:4]), rel_path)

def validation():
    """Score ``formatting_headlines`` against the labelled test set.

    Reads ``headlines-test-set.json`` from ``PATH``. Each row is indexed as
    ``row[0]`` (input headline) and ``row[1]`` (expected output). Every
    mismatch is printed, followed by pass/fail totals and a percentage score.
    An empty test set reports a score of 0.0% instead of dividing by zero.

    Returns:
        None. All results are written to standard output.
    """
    with open(PATH + "headlines-test-set.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)

    passed = 0
    failed = 0

    for row in test_data:
        given = row[0]
        expected = row[1]
        result = formatting_headlines(given)
        if result == expected:
            passed += 1
            continue
        print("FORMATED OUTPUT: ", result)
        print("EXPECTED OUTPUT: ", expected)
        print("\n")
        failed += 1

    total = passed + failed
    # Guard the percentage: an empty test set has no meaningful ratio.
    score = passed / total * 100 if total else 0.0
    print('Passed: {}'.format(passed))
    print('Failed: {}'.format(failed))
    print('Total: {}'.format(total))
    print("Validation score {}/{} = {}%".format(passed, total, score))

#validation() 

# Passed: 96
# Failed: 4
# Total: 100
# Validation score 96/100 = 96.0%

def examiner():
    """Count how many corpus headlines are already formatted as expected.

    Reads ``examiner-headlines.txt`` from ``PATH`` one headline per line,
    runs each through ``formatting_headlines`` and counts a headline as
    "formatted" when the formatter returns it unchanged. Totals and a
    percentage are printed; an empty file reports 0.0% instead of dividing
    by zero.

    Returns:
        None. All results are written to standard output.
    """
    not_formatted = 0
    formatted = 0

    # Explicit UTF-8, matching how validation() opens its data file.
    with open(PATH + 'examiner-headlines.txt', encoding="utf-8") as f:
        for line in f:
            # Drop the line terminator before parsing. Left in place, the
            # newline is tokenized as a separate non-punctuation token, which
            # last_word() would then select as the final "word" of the
            # headline instead of the real last word.
            given = line.rstrip("\n")
            result = formatting_headlines(given)
            if result != given:
                not_formatted += 1
            else:
                formatted += 1

    total = not_formatted + formatted
    # Guard the percentage: an empty corpus has no meaningful ratio.
    score = formatted / total * 100 if total else 0.0

    print('Formatted headlines: {}'.format(formatted))
    print('Not formatted headlines: {}'.format(not_formatted))
    print('Total: {}'.format(total))
    print("Formatted score {}/{} = {}%".format(formatted, total, score))

# Runs at cell/module execution time: scores the examiner corpus immediately.
examiner() 

# Formatted headlines: 601
# Not formatted headlines: 4399
# Total: 5000
# Formatted score 601/5000 = 12.02%