In [1]:
import nltk
import spacy
from nltk.corpus import sentiwordnet as swn
import pandas as pd

In [2]:
# Load the spaCy pipeline named "en_core_web_sm" once at module level;
# if_catchy calls `nlp(headline)` to parse every headline.
nlp = spacy.load("en_core_web_sm")

In [3]:
def convert_tag(pos_tag):
    """Translate a POS tag into the single-letter `pos` code used for lookups.

    The decision is made on the tag's prefix only: tags starting with 'J'
    give "a", 'N' gives "n", 'R' gives "r". Any other tag, including the
    empty string, falls back to "v".
    """
    prefix_to_code = (('J', "a"), ('N', "n"), ('R', "r"))
    for prefix, code in prefix_to_code:
        if pos_tag.startswith(prefix):
            return code
    # Everything that is not matched above is treated as "v".
    return "v"

def emotion_score(word, pos_tag, max_senses=5, threshold=0.5):
    """Tell whether `word` is emotionally loaded according to SentiWordNet.

    The positive and negative scores of the first `max_senses` entries
    returned by `swn.senti_synsets` for `word` (with the POS code produced by
    `convert_tag(pos_tag)`) are summed separately.

    Parameters
    ----------
    word : str
        The token text to look up.
    pos_tag : str
        POS tag of the token; it is converted with `convert_tag`.
    max_senses : int, default 5
        How many of the returned synsets contribute to the totals.
    threshold : float, default 0.5
        Minimum summed positive *or* negative score for the word to count
        as emotional.

    Returns
    -------
    bool
        True when either total reaches `threshold`, otherwise False. The
        previous version fell off the end and returned None in that case;
        an explicit False keeps truthiness for existing callers while
        giving a consistent return type.
    """
    pos_total = 0
    neg_total = 0
    senses = list(swn.senti_synsets(word, pos=convert_tag(pos_tag)))[:max_senses]
    for sense in senses:
        pos_total += sense.pos_score()
        neg_total += sense.neg_score()
    # A word with no synsets leaves both totals at 0 and is reported as False.
    return pos_total >= threshold or neg_total >= threshold

In [5]:
def if_catchy(headline):
    """Score a headline on three criteria and their conjunction.

    The headline is parsed with `nlp`, then a four-element pandas Series is
    returned, in this order:

    0. whether any token's `tag_` is 'JJS' or 'JJR';
    1. whether `emotion_score` is truthy for any token;
    2. the value returned by `find_named_entity` for the parsed headline;
    3. whether all three values above are truthy.
    """
    doc = nlp(headline)
    named_entity_flag = find_named_entity(doc)

    wanted_tags = ('JJS', 'JJR')
    # Evaluate every token (no short-circuit), pairing the tag check with the
    # emotion check so each token is scored exactly once.
    per_token = [
        (tok.tag_ in wanted_tags, bool(emotion_score(tok.text, tok.tag_)))
        for tok in doc
    ]
    tag_found = any(tag_hit for tag_hit, _ in per_token)
    emotion_found = any(emotion_hit for _, emotion_hit in per_token)

    flags = [tag_found, emotion_found, named_entity_flag]
    flags.append(all(flags))
    return pd.Series(flags)

In [14]:
# Load one headline per line. `pd.read_csv(..., sep='\n')` is rejected by newer
# pandas versions (the line terminator may not be used as the separator), so
# the file is read directly; blank lines are skipped and the text of each
# remaining line is kept verbatim, without any CSV quote handling.
with open('examiner-headlines.txt', encoding='utf-8') as headlines_file:
    headlines = [line.rstrip('\r\n') for line in headlines_file if line.strip()]
data = pd.DataFrame({'headline': headlines})

# if_catchy returns a 4-element Series per headline, so `.apply` produces a
# 4-column frame whose columns are assigned, in order, to the names below.
data[['has_tags', 'is_emotional', 'has_named_entity', 'is_viral']] = data['headline'].apply(if_catchy)

In [64]:
# Show 10 headlines drawn with `is_viral` as the sampling weight: rows where
# the flag is False get zero weight, so only viral headlines can be picked.
# A fixed random_state makes the displayed sample reproducible across
# Restart & Run All.
data.sample(10, weights = 'is_viral', random_state = 42)

Unnamed: 0,headline,has_tags,is_emotional,has_named_entity,is_viral
743,Robert Pattinson on E! News -- kissing scenes ...,True,True,True,True
4808,Rudolph the red-nosed reindeer should worry ab...,True,True,True,True
4288,Bobby Deen concocts 'Everybody's Favorite Comf...,True,True,True,True
4977,"Best Buy last-minute deals: Surface tablet, Ma...",True,True,True,True
9,Jersey Shore Season 6 cast's salaries revealed...,True,True,True,True
4276,More MAC launches and new releases for April,True,True,True,True
2082,"The Week Ahead: Indie films, burgers and brews...",True,True,True,True
2929,HBO's 'Game of Thrones' is medieval fantasy at...,True,True,True,True
3413,"Navy's newest aircraft carrier, the USS Gerald...",True,True,True,True
2395,'Suga' Rashad Evans might be the best light he...,True,True,True,True


In [11]:
# Percentage of headlines for each value of `has_tags`.
100 * data['has_tags'].value_counts(normalize = True)

False    96.12
True      3.88
Name: has_tags, dtype: float64

In [12]:
# Percentage of headlines for each value of `is_emotional`.
100 * data['is_emotional'].value_counts(normalize = True)

True     64.7
False    35.3
Name: is_emotional, dtype: float64

In [13]:
# Percentage of headlines for each value of `has_named_entity`.
100 * data['has_named_entity'].value_counts(normalize = True)

True     57.02
False    42.98
Name: has_named_entity, dtype: float64