In [None]:
import spacy
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
#nltk.download('wordnet')
#nltk.download('sentiwordnet')
#nltk.download('stopwords')

In [None]:
# Load spaCy's "en_core_web_md" pipeline once at module level; the finder
# functions in this file call `nlp(...)` on every headline.
nlp = spacy.load("en_core_web_md")

In [None]:
def convert_swn_pos(pos):
    """Translate a coarse part-of-speech label into a one-letter POS code.

    "VERB" -> "v", "NOUN" -> "n", "ADJ" -> "a", "ADV" -> "r".
    Any other value falls back to "a".
    """
    mapping = (
        ("VERB", "v"),
        ("NOUN", "n"),
        ("ADJ", "a"),
        ("ADV", "r"),
    )
    for label, code in mapping:
        if pos == label:
            return code
    # Unknown labels are treated like adjectives.
    return "a"

def adj_adv_finder(headline):
    """Count tokens in `headline` tagged JJR, JJS, RBR or RBS.

    The headline is parsed with the module-level `nlp` pipeline and each
    token's fine-grained `tag_` is checked against those four tags.
    Returns the number of matching tokens (0 when there are none).
    """
    wanted_tags = ("JJR", "JJS", "RBR", "RBS")
    return sum(1 for tok in nlp(headline) if tok.tag_ in wanted_tags)

def ner_finder(headline):
    """Count the PERSON, ORG and GPE entities found in `headline`.

    The headline is parsed with the module-level `nlp` pipeline; every
    entity in `doc.ents` whose label is one of the three above adds one to
    the result. Entities with any other label are ignored.
    """
    counted_labels = ("PERSON", "ORG", "GPE")
    matches = [ent for ent in nlp(headline).ents if ent.label_ in counted_labels]
    return len(matches)


def sentiment_finder(headline):
    """Classify a headline as "POSITIVE", "NEGATIVE" or "NEUTRAL".

    Every verb, noun, adjective and adverb token is looked up in
    SentiWordNet. For each word the positive and negative scores of its
    first five synsets (fewer if the word has fewer) are averaged, and the
    per-word averages are summed over the headline.

    Returns:
        "POSITIVE" if the summed positive score exceeds 0.5; otherwise
        "NEGATIVE" if the summed negative score exceeds 0.5; otherwise
        "NEUTRAL". The positive check comes first, so a headline exceeding
        both thresholds is reported as "POSITIVE".

    Note:
        The average is taken over the synsets that were actually summed.
        Dividing by the total number of synsets instead would understate
        the scores of words that have more than five synsets.
    """
    max_synsets = 5
    threshold = 0.5

    doc = nlp(headline)
    headline_positive_score = 0
    headline_negative_score = 0

    for token in doc:
        if token.pos_ not in ("VERB", "NOUN", "ADJ", "ADV"):
            continue

        pos = convert_swn_pos(token.pos_)
        # Keep only the leading synsets so that the numerator and the
        # denominator of the average cover the same items.
        synsets = list(swn.senti_synsets(token.text, pos))[:max_synsets]
        if not synsets:
            # Word unknown to SentiWordNet for this POS: contributes nothing.
            continue

        considered = len(synsets)
        headline_positive_score += sum(s.pos_score() for s in synsets) / considered
        headline_negative_score += sum(s.neg_score() for s in synsets) / considered

    if headline_positive_score > threshold:
        return "POSITIVE"
    elif headline_negative_score > threshold:
        return "NEGATIVE"
    else:
        return "NEUTRAL"

In [None]:
import json
import os

# Build PATH, the directory prefix used to open the headline data files.
#
# NOTE: '__file__' is a string literal here, not the __file__ variable, so
# abspath() resolves it against the current working directory. The result
# therefore depends on where the notebook/script is started from.
script_path = os.path.abspath('__file__') 
# Split the absolute path into its components.
path_list = script_path.split(os.sep)
# Drop the last component (the literal name '__file__'), leaving the
# components of the directory itself.
script_directory = path_list[0:len(path_list)-1]
# Data directory, appended below to the truncated directory prefix.
rel_path = "tasks/02-structural-linguistics/data/"
# Only the first four components are kept before appending rel_path.
# NOTE(review): this assumes a fixed directory depth for the project root
# (and "/" as separator) - confirm for other checkouts/platforms.
PATH = "/".join(script_directory[:4]) + "/" + rel_path


def examiner():
    """Print ADJ/ADV, named-entity and sentiment statistics for the headlines.

    Opens ``examiner-headlines.txt`` under ``PATH`` and treats every line as
    one headline. Each line is passed to ``adj_adv_finder``, ``ner_finder``
    and ``sentiment_finder``; the aggregated counts and percentages are
    printed to stdout. Returns ``None``.

    If the file contains no lines, every percentage is reported as 0.0
    instead of raising ``ZeroDivisionError``.
    """
    headlines = 0
    total_a = 0
    total_ner = 0
    n_ner_headlines = 0
    a_headlines = 0
    ner_headlines = 0
    positive_headlines = 0
    negative_headlines = 0

    with open(PATH + 'examiner-headlines.txt') as f:
        # Iterate the file directly; there is no need to hold all lines in memory.
        for line in f:
            headlines += 1
            result = adj_adv_finder(line)
            ner_feature = ner_finder(line)
            senti_feature = sentiment_finder(line)

            if result != 0:
                a_headlines += 1
                total_a += result

            if ner_feature != 0:
                ner_headlines += 1
                total_ner += ner_feature
                # Headlines mentioning more than one counted entity.
                if ner_feature > 1:
                    n_ner_headlines += 1

            if senti_feature == "POSITIVE":
                positive_headlines += 1
            elif senti_feature == "NEGATIVE":
                negative_headlines += 1

    def percent(part):
        # Share of all headlines, in percent; 0.0 when there were no headlines.
        return part / headlines * 100 if headlines else 0.0

    print('Headlines with ADJ_ADV Feature: {}'.format(a_headlines))
    print('Total (adjectives & adverbs): {}'.format(total_a))
    print("ADJ_ADV_FEATURE SCORE {}/{} = {}%".format(a_headlines, headlines, percent(a_headlines)))

    print('Headlines with NER: {}'.format(ner_headlines))
    print('Headlines with several NERs: {}'.format(n_ner_headlines))
    print('Total number of NERs: {}'.format(total_ner))
    print("NER_FEATURE SCORE {}/{} = {}%".format(ner_headlines, headlines, percent(ner_headlines)))

    print("Positive headline score {}/{} = {}%".format(positive_headlines, headlines, percent(positive_headlines)))
    print("Negative headline score {}/{} = {}%".format(negative_headlines, headlines, percent(negative_headlines)))
    print("SENTIMENT_FEATURE SCORE {}+{} = {:.2f}%".format(positive_headlines, negative_headlines, percent(positive_headlines + negative_headlines)))

# Run the analysis on the examiner headlines.
# The commented lines that follow appear to be the output recorded from an
# earlier run over 5000 headlines (TODO confirm they are still current).
examiner() 
# Headlines with ADJ_ADV Feature: 231
# Total (adjectives & adverbs): 241
# ADJ_ADV_FEATURE SCORE 231/5000 = 4.62%
# Headlines with NER: 3240
# Headlines with several NERs: 1336
# Total number of NERs: 5012
# NER_FEATURE SCORE 3240/5000 = 64.8%
# Positive headline score 390/5000 = 7.8%
# Negative headline score 346/5000 = 6.92%
# SENTIMENT_FEATURE SCORE 390+346 = 14.72%