# Headlines

In [431]:
import spacy
import json
import string
import numpy as np

In [4]:
# Load the `en_core_web_md` spaCy pipeline once; `nlp` is the shared parser
# that capitalize_title() and get_viral_score() call further down.
nlp = spacy.load("en_core_web_md")

In [442]:
def smart_capitalize(word):
    """Capitalize ``word`` unless it already carries its own casing.

    Rules, applied in order:

    * an empty string is returned unchanged (it has no first character);
    * a word containing any uppercase letter is returned as is -- this covers
      both all-caps words ("NASA") and mixed-case ones ("iPhone");
    * a word starting with a punctuation character keeps that character and
      has the remainder capitalized ("(video)" -> "(Video)");
    * anything else goes through ``str.capitalize``.

    Args:
        word: the text fragment to capitalize.

    Returns:
        The capitalized fragment as a new string (or ``word`` itself when no
        change is needed).
    """
    if not word:
        # Hyphen splitting can produce empty fragments ("self-" -> ["self", ""]);
        # indexing word[0] below would raise IndexError for them.
        return word
    # An all-uppercase word necessarily contains an uppercase letter, so one
    # check covers both the "all caps" and the "mixed case" situations.
    if any(letter.isupper() for letter in word):
        return word
    if word[0] in string.punctuation:
        return word[0] + word[1:].capitalize()
    return word.capitalize()

def capitalize_title(s):
    """Return the headline ``s`` with title-style capitalization applied.

    Tokens that are not followed by whitespace are glued to the following
    tokens, so that one "word" (e.g. "St.", "Mary's", "cat-and-dog") is
    emitted at a time. The accumulated word is capitalized with
    ``smart_capitalize`` when the current token

    * is the first or the last token of the headline,
    * has one of the POS tags listed in ``pos_tags``,
    * is an ADP/SCONJ whose dependency label is ``mark``,
    * is a stand-alone negation particle (PART with dependency ``neg``),
    * or has the dependency label ``poss``.

    On top of that, every part of a hyphenated word is capitalized and any
    word longer than three characters is capitalized.

    Args:
        s: the headline text. An already parsed document (anything that is
            not a ``str``) is used as is instead of being passed to ``nlp``
            again.

    Returns:
        The re-capitalized headline as a string.
    """
    # Only raw text needs parsing; re-running the pipeline on a parsed
    # document is wasteful at best and an error on some spaCy versions.
    doc = nlp(s) if isinstance(s, str) else s
    pos_tags = ['NOUN', 'PRON', 'VERB', 'ADJ', 'ADV', 'AUX', 'PROPN', 'NUM']
    pieces = []
    word = ""
    last_index = len(doc) - 1

    for i, token in enumerate(doc):
        word += token.text

        if (
            i == 0 or
            i == last_index or
            token.pos_ in pos_tags or
            ((token.pos_ == 'ADP' or token.pos_ == 'SCONJ') and token.dep_ == 'mark') or
            # word == token.text means the particle is a separate word ("Not"),
            # not a clitic glued to the previous token ("Don't").
            (token.pos_ == 'PART' and token.dep_ == 'neg' and word == token.text) or
            token.dep_ == 'poss'
        ):
            word = smart_capitalize(word)

        # No trailing whitespace: the word continues in the next token.
        if not token.whitespace_ and i != last_index:
            continue

        if '-' in word and len(word) > 1:
            # Skip empty fragments produced by leading, trailing or doubled
            # hyphens ("self-", "--"); they have nothing to capitalize.
            word = '-'.join(
                smart_capitalize(part) if part else part
                for part in word.split('-')
            )
        elif len(word) > 3:
            word = smart_capitalize(word)

        pieces.append(word + token.whitespace_)
        word = ""

    return "".join(pieces)

# 2. Working with data

In [139]:
def read_examiner(path='/home/yevhen/prj/prj-nlp-2020/tasks/02-structural-linguistics/data/examiner-headlines.txt'):
    """Read the Examiner headlines corpus, one headline per line.

    Args:
        path: location of the headlines text file. The default is the
            author's local checkout; pass another path to run the notebook
            on a different machine.

    Returns:
        A list of strings, one per line of the file, without line endings.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()

In [140]:
# Load the Examiner headlines once; the cells below reuse this list.
examiner = read_examiner()

In [144]:
# Pick one headline by index and display it as a spot-check example.
j = 1
examiner[j]

"The phantoms of St. Mary's"

In [149]:
# capitalize_title() runs the spaCy pipeline on its argument itself,
# so it is given the raw headline text rather than a parsed document.
capitalize_title(examiner[j])

"The Phantoms of St. Mary's"

In [445]:
def calc_accuracy_on_corpus(corpus, validate=False):
    """Return the share of headlines that ``capitalize_title`` gets right.

    Args:
        corpus: the headlines to score.
            With ``validate=False`` each item is a headline string and counts
            as correct when ``capitalize_title`` returns it unchanged.
            With ``validate=True`` each item is an indexable pair whose first
            element is the input headline and whose second element is the
            expected capitalization.
        validate: selects the pair-based mode described above; every mismatch
            is printed as ``input --- ours --- expected``.

    Returns:
        ``correct / len(corpus)`` as a float, or ``0.0`` for an empty corpus.
        Headlines on which ``capitalize_title`` raises (``validate=False``
        only) are skipped but still count in the denominator.
    """
    corpus_len = len(corpus)
    if corpus_len == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0.0

    correct = 0
    for item in corpus:
        if validate:
            capitalized_title = capitalize_title(item[0])
            if capitalized_title == item[1]:
                correct += 1
            else:
                # Print the input, our version and the expected one to see where we failed.
                print(item[0], '---', capitalized_title, '---', item[1], '\n')
        else:
            try:
                # Best effort: a headline that trips an unhandled edge case is
                # counted as a miss instead of aborting the whole run.
                capitalized_title = capitalize_title(item)
            except Exception:
                # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
                continue
            if capitalized_title == item:
                correct += 1
    return correct / corpus_len

In [446]:
# Share of Examiner headlines that capitalize_title() returns unchanged.
calc_accuracy_on_corpus(examiner)

0.1472

In [156]:
def read_test_set(path='/home/yevhen/prj/prj-nlp-2020/tasks/02-structural-linguistics/data/headlines-test-set.json'):
    """Load the headline test set from a JSON file.

    Args:
        path: location of the JSON file. The default is the author's local
            checkout; pass another path to run the notebook elsewhere.

    Returns:
        The decoded JSON content, exactly as ``json.load`` returns it.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [157]:
# Load the labelled test set once; it is scored in the next cell.
test_set = read_test_set()

In [340]:
# validate=True compares against the expected capitalization of each item
# and prints every mismatch as: input --- ours --- expected.
calc_accuracy_on_corpus(test_set, validate=True)

How To Design A College Curriculum to Help You in Life --- How To Design A College Curriculum to Help You in Life --- How to Design a College Curriculum to Help You in Life 

Brazilian & bikini waxing how to choose an sf bay area hair removal salon or spa --- Brazilian & Bikini Waxing How to Choose an Sf Bay Area Hair Removal Salon or Spa --- Brazilian & Bikini Waxing How to Choose an SF Bay Area Hair Removal Salon or Spa 

How it all plays out on Church Street --- How It all Plays out on Church Street --- How It All Plays out on Church Street 

How to Rock at Marketing (and Still Like Yourself in the Morning) --- How to Rock at Marketing (And Still Like Yourself in the Morning) --- How to Rock at Marketing (and Still Like Yourself in the Morning) 

'Jackass 3D' trailer explodes right in your face so don't try this at home (video) --- 'Jackass 3D' Trailer Explodes Right in Your Face So Don't Try This at Home (Video) --- 'Jackass 3D' Trailer Explodes Right in Your Face so Don't Try This

0.95

In [386]:
import nltk

In [390]:
from nltk.corpus import sentiwordnet as swn

In [472]:
def are_there_named_entities(doc):
    """Tell whether ``doc`` contains an entity of one of the tracked types.

    Only the labels PERCENT, PRODUCT, WORK_OF_ART, MONEY and QUANTITY are
    taken into account; every other entity label is ignored.

    Args:
        doc: a parsed document exposing ``ents``, each entity having a
            ``label_`` attribute.

    Returns:
        True as soon as one matching entity is found, False otherwise.
    """
    tracked_labels = ('PERCENT', 'PRODUCT', 'WORK_OF_ART', 'MONEY', 'QUANTITY')
    return any(entity.label_ in tracked_labels for entity in doc.ents)

def is_there_non_neutral_score(doc):
    """Tell whether any content word of ``doc`` passes the sentiment check.

    For every NOUN, VERB, ADJ or ADV token the first five SentiWordNet
    synsets (looked up by token text and the matching WordNet POS letter)
    are taken, and their positive and negative scores are summed. The token
    passes when the negative sum is non-zero and positive / negative is
    greater than 0.5. Tokens whose negative sum is zero are skipped.

    Args:
        doc: an iterable of tokens exposing ``pos_`` and ``text``.

    Returns:
        True as soon as one token passes, False otherwise.
    """
    wordnet_pos_by_tag = {'NOUN': 'n', 'VERB': 'v', 'ADJ': 'a', 'ADV': 'r'}
    for token in doc:
        wordnet_pos = wordnet_pos_by_tag.get(token.pos_)
        if wordnet_pos is None:
            # Not a content word: SentiWordNet has no POS letter for it.
            continue
        top_senses = list(swn.senti_synsets(token.text, wordnet_pos))[:5]
        positive_total = sum(sense.pos_score() for sense in top_senses)
        negative_total = sum(sense.neg_score() for sense in top_senses)
        if not negative_total:
            # A zero denominator would make the ratio undefined.
            continue
        if np.divide(positive_total, negative_total) > 0.5:
            return True
    return False

def are_there_degrees(doc):
    """Tell whether any token of ``doc`` has a non-empty degree feature.

    Args:
        doc: an iterable of tokens exposing ``morph.degree_``.

    Returns:
        True when at least one token's ``morph.degree_`` differs from the
        empty string, False otherwise.
    """
    return any(token.morph.degree_ != '' for token in doc)

In [473]:
def get_viral_score(corpus):
    """Print the share of headlines in ``corpus`` showing each tracked signal.

    Every headline is parsed with ``nlp`` and checked with
    ``are_there_named_entities``, ``is_there_non_neutral_score`` and
    ``are_there_degrees``. The three shares are printed as fractions of the
    corpus size; nothing is returned.

    Args:
        corpus: a sized iterable of headline strings. For an empty corpus
            all three shares are reported as 0.0.
    """
    # `or 1` keeps the divisions below defined for an empty corpus
    # (all counters are then 0, so every share prints as 0.0).
    corpus_len = len(corpus) or 1
    named_ents = 0
    sentiments = 0
    degrees = 0
    for item in corpus:
        doc = nlp(item)
        # Each check's result is added directly, so a True counts as 1.
        named_ents += are_there_named_entities(doc)
        sentiments += is_there_non_neutral_score(doc)
        degrees += are_there_degrees(doc)

    print(f'Percent of sentences with named entities is {named_ents / corpus_len}')
    print(f'Percent of sentences with sentiments is {sentiments / corpus_len}')
    print(f'Percent of sentences with degrees is {degrees / corpus_len}')

In [474]:
# Report, for the Examiner corpus, the share of headlines with each signal.
get_viral_score(examiner)

Percent of sentences with named entities is 0.0728
Percent of sentences with sentiments is 0.3704
Percent of sentences with degrees is 0.4984
