In [2]:
import spacy
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag
import en_core_web_md
nlp = en_core_web_md.load()

In [3]:
def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None

lemmatizer = WordNetLemmatizer()

def get_sentiment(word,tag):
    """ returns list of pos neg and objective score. But returns empty list if not present in senti wordnet. """

    wn_tag = penn_to_wn(tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
        return []

    lemma = lemmatizer.lemmatize(word, pos=wn_tag)
    if not lemma:
        return []

    synsets = wn.synsets(word, pos=wn_tag)
    if not synsets:
        return []

    # Take the first 5 senses and calculate arithmetical mean for both positive and negative values
    synset = synsets[:5]
    pos, neg = 0, 0
    for syn in synset:
        swn_synset = swn.senti_synset(syn.name())
        pos += swn_synset.pos_score()
        neg += swn_synset.neg_score()
    pos_score = pos/len(synset)
    neg_score = neg/len(synset)

    return [pos_score, neg_score]

In [4]:
def virus(text):
    """ returns list of flags for viral title"""
    entity, emotion, comparative = 0, 0, 0
    tokens = nlp(text)
    for ent in tokens.ents:
        if ent.label_ in (['PERSON','ORG']):
            entity = 1
    words_data = text.split()
    pos_val = nltk.pos_tag(words_data)
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    for s in senti_val:
        for i in s:
            if i>=0.5:
                emotion = 1
    for (x,y) in pos_val:
        if y in (['JJR', 'JJS', 'RBR', 'RBS']): 
            comparative = 1
    return [entity, emotion, comparative]

In [5]:
filepath = "../../../tasks/02-structural-linguistics/data/examiner-headlines.txt"
data = []
with open(filepath) as fp:
    for line in fp:
        data.append(virus(line))

In [6]:
for k in range(3):
    print(f'Кількість статей які мають {k+1} ознаку дорівнює {sum(i[k] for i in data)/len(data)*100} %')

Кількість статей які мають 1 ознаку дорівнює 53.04 %
Кількість статей які мають 2 ознаку дорівнює 9.8 %
Кількість статей які мають 3 ознаку дорівнює 4.26 %
