In [1]:
import nltk
import spacy
from nltk.corpus import sentiwordnet as swn
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
def convert_tag (pos_tag):
    if pos_tag.startswith('J'):
        return "a"
    elif pos_tag.startswith('N'):
        return "n"
    elif pos_tag.startswith('R'):
        return "r"
    return "v"

def emotion_score (word, pos_tag):
    pos_score = 0
    neg_score = 0
    for element in list(swn.senti_synsets(word, pos = convert_tag(pos_tag)))[:5] :
        pos_score += element.pos_score()
        neg_score += element.neg_score()
    if pos_score >= 0.5 or neg_score >= 0.5:
        return True

def find_named_entity (headline):
    ner_labes = ['PERSON', 'ORG', 'EVENT']
    for ent in headline.ents:
        if ent.label_ in ner_labes:
            return True
    return False

In [4]:
def if_catchy (headline):
    pos_tags = ['JJS', 'JJR' 'RBR' 'RBS']
    parsed_headline = nlp(headline)
    has_named_entity = find_named_entity(parsed_headline)
    result = [False, False, has_named_entity] 
    for token in parsed_headline:
        if token.tag_ in pos_tags:
            result[0] = True
        if emotion_score(token.text, token.tag_):
            result[1] = True
    result.append(all(result))
    return pd.Series(result)

In [5]:
data = pd.read_csv('examiner-headlines.txt', sep='\n', header=None, names=['headline'])
data[['has_tags', 'is_emotional', 'has_named_entity', 'is_viral']] = data['headline'].apply(if_catchy)

In [6]:
data.sample(10, weights = 'is_viral')

Unnamed: 0,headline,has_tags,is_emotional,has_named_entity,is_viral
1396,Wine Spectator's 200 best value wines -- which...,True,True,True,True
563,"Robert Pattinson, Taylor Lautner, Kellan Lutz ...",True,True,True,True
4808,Rudolph the red-nosed reindeer should worry ab...,True,True,True,True
135,Best 2014 Black Friday TV deals online: Amazon...,True,True,True,True
4856,CDC releases latest Sexually Transmitted Infec...,True,True,True,True
4510,"Salvadorans Largest Hispanic Group on LI, Mexi...",True,True,True,True
4765,Fowler: 2-year-old Bassett mix with the cutest...,True,True,True,True
1829,Find your new best friend at the PetSmart Nati...,True,True,True,True
3467,UCLA's kidney transplant survival rate is best...,True,True,True,True
224,Live video and latest updates on the BP oil sp...,True,True,True,True


In [7]:
data['has_tags'].value_counts(normalize = True) * 100

False    98.1
True      1.9
Name: has_tags, dtype: float64

In [8]:
data['is_emotional'].value_counts(normalize = True) * 100

True     65.68
False    34.32
Name: is_emotional, dtype: float64

In [9]:
data['has_named_entity'].value_counts(normalize = True) * 100

True     53.88
False    46.12
Name: has_named_entity, dtype: float64

In [10]:
data['is_viral'].value_counts(normalize = True )* 100

False    99.14
True      0.86
Name: is_viral, dtype: float64