In [22]:
import nltk
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
import pandas as pd
import json
import copy
import string
import re
import math
from collections import Counter
from typing import Any
PATH_TO_UNTOKENIZED_JSON = "../data/shuffled_untokenized_article_swift_data.json"
PATH_TO_CSV_OUTPUT = "../sentiment_analysis/pos_tag_sentiment_analysis.csv"
PATH_TO_JSON_OUTPUT = "../sentiment_analysis/pos_tag_sentiment_analysis.json"

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     /Users/studybuggy/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/studybuggy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/studybuggy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/studybuggy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/studybuggy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS.add("n't")
STOP_WORDS.add("'s")
PUNCTUATION = set(list(string.punctuation))
with open(PATH_TO_UNTOKENIZED_JSON, 'r') as f:
    json_data = json.load(f)


In [24]:
json_copy = copy.deepcopy(json_data)

In [25]:
def penn_to_wn(tag):
    """
    Convert between the PennTreebank tags to simple Wordnet tags
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None


In [26]:
def idf(N:int, nt:int):
    return math.log(N/nt)

def idf_smooth(N: int, nt: int):
    return math.log(N/(1+nt)) + 1


def compute_doc_tf(doc_words:list)-> dict[str, float]:
    doc_word_counts = Counter(doc_words)
    total_count = doc_word_counts.total()
    doc_tf = dict(doc_word_counts)
    
    for w,c in doc_tf.items():
        doc_word_counts[w] = c/total_count
    
    return doc_tf


def compute_all_idf(all_tf: list[dict[Any, float]]) -> dict[str, float]:
    N = len(all_tf)
    
    # Get all the unique terms through all the articles
    all_terms = [list(doc_tf.keys()) for doc_tf in all_tf]
    all_terms = set(sum(all_terms, start=[]))
    
    # create idf to be returned
    idf = dict[tuple[str], float]()
    
    # loop through all unique terms
    for term in all_terms:
        # start counting how many documents contain this word
        nt = 0
        for doc in all_tf:
            if term in doc:
                nt += 1
        idf[term] = idf_smooth(N, nt)
    
    return idf

In [27]:

def compute_articles_tf_idf(articles_info: list[dict]):
    for article in articles_info:
        content: str = article['content']
        
        # replace all punctuation and random characters with a space
        # we do this to clean stop words and random characters since we care more
        # about the content of the articles
        content = re.sub(r'/n', '\n', content)
        content = re.sub(r'[^a-zA-Z0-9]', ' ', content)
        
        # tokenize content from articles using nltk tokenize
        # We use tweet tokenizer since it provides better tokenization
        terms = word_tokenize(content)
        pos_terms = pos_tag(terms)
        # remove stop words from terms
        pos_terms = [pos_t for pos_t in pos_terms if pos_t[0].lower() not in STOP_WORDS]
        
        # store tokenized content
        article['tokenized-pos-content'] = pos_terms
        
        # store the term frequencies
        article['term-frequencies'] = compute_doc_tf(pos_terms)
    
    # compute the idf for all terms existing in every documents
    all_tf = [article['term-frequencies'] for article in articles_info]
    all_idf = compute_all_idf(all_tf)
    
    # compute tf-idf
    for article in articles_info:
        # get the term frequencies 
        doc_tf = article['term-frequencies']
        
        # multiply every term frequency with their idf score
        doc_tf_idf = {pos_term: tf * all_idf[pos_term] for pos_term, tf in doc_tf.items()}
        article['tf-idf'] = doc_tf_idf
    


compute_articles_tf_idf(json_copy)

In [28]:
def compute_sentiment(tf_idf : dict[tuple[str], float], top_words:int = None) -> (float, dict[str, float]):

    # use tweet tokenizer to remove contraction words as well
    tweet_tokenizer = TweetTokenizer()
    tfidf_items = list(tf_idf.items())
    
    tfidf_items.sort(key=lambda item: item[1], reverse=True)
    
    # create word lemmatizer to be used for getting meaning out of words
    lemmatizer = WordNetLemmatizer()
    
    tfidf_sentiment = 0.0
    words = []
    for word_postag, tfidf_score in tfidf_items:
        word, tag = word_postag
        # stop if we have computed the right amount of words
        if (top_words is not None and len(words) >= top_words):
            break
        # # convert tag to wornet tag
        # # make sure we keep nouns, adjectives, adverbs and verbs only
        wn_tag = penn_to_wn(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            continue
        
        # create a lemma to represent the word's meaning
        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            continue
        
        # find synonym words representing the meaning the best from lemma
        # and grammatical position of the word
        synsets = wn.synsets(lemma, pos=wn_tag)
        if not synsets:
            continue

        # Take the first sense, the most commonly used sense
        synset = synsets[0]
        # Get sentiment analysis of first sense word
        swn_synset = swn.senti_synset(synset.name())

        # calculate the positive vs negative score
        word_sentiment = swn_synset.pos_score() - swn_synset.neg_score()
        tfidf_sentiment += (tfidf_score * word_sentiment)
        
        # calculate the token counts, this can be used to get an avg of sentiment
        # for each non stopword token in the article
        words.append((word_postag, word_sentiment))
    
    top_lemmatized_words = {word_postag[0]:{'pos-tag':word_postag[1], 'tfidf-senti-score':tf_idf[word_postag] * sentiment,} for word_postag, sentiment in words}
    
    
    return (tfidf_sentiment, top_lemmatized_words)

In [29]:
def compute_sentiment_scores(json_data):
    # compute the sentiment scores and store in json data
    for article in json_data:
        sentiment, top_lemmatized_words = compute_sentiment(article['tf-idf'], top_words=10)
        article['total-sentiment-score'] = sentiment
        article['top-10-lemmatized-words'] = top_lemmatized_words
        
        # reformat json for term frequency and tf-idf
        term_frequencies:list[dict[str, Any]]  = list()
        for term_pos_tag, freq in article['term-frequencies'].items():
            term, pos_tag = term_pos_tag
            term_frequencies.append(dict(word=term, tag=pos_tag, freq=freq))
        article['term-frequencies'] = term_frequencies
        
        tf_idf : list[dict[str, Any]] = list()
        for term_pos_tag, tfidf_score in article['tf-idf'].items():
            term, pos_tag = term_pos_tag
            tf_idf.append(dict(word=term, tag=pos_tag, score=tfidf_score))
        article['tf-idf'] = tf_idf
    
        
compute_sentiment_scores(json_copy)



In [30]:
# create csv file
def json_to_df(json_data):
    
    columns = list(json_data[0].keys())
    columns.remove('top-10-lemmatized-words')
    columns.remove('tf-idf')
    columns.remove('term-frequencies')
    columns.remove('tokenized-pos-content')
    dict_df = {c:[] for c in columns}
    
    for article in json_data:
        for col in columns:
            dict_df[col].append(article[col])
    
    return pd.DataFrame.from_dict(dict_df)

df = json_to_df(json_copy)
df

Unnamed: 0,author,title,description,url,source,category,country,published_at,isPriority,hasMain,id,content,total-sentiment-score
0,"Victor Barbosa, Yardbarker",Mark Cuban wants to introduce Taylor Swift to ...,If Taylor Swift's much-rumored romance with Ka...,https://www.yardbarker.com/nba/articles/mark_c...,Yardbarker,sports,us,2023-09-28T19:44:18+00:00,False,False,0,If Taylor Swift's much-rumored romance with K...,2.244153
1,"Hugh McIntyre, Contributor",Taylor Swift Becomes The First Woman To Hit A ...,August 2023 has turned out to be perhaps Swift...,https://www.forbes.com/sites/hughmcintyre/2023...,Forbes,general,us,2023-08-29T17:42:14+00:00,True,True,1,US singer-songwriter Taylor Swift poses on the...,7.401537
2,Jonah Valdez,Travis Kelce's ex-girlfriend says she's gettin...,Travis Kelce's ex-girlfriend Maya Benberry sai...,https://www.latimes.com/entertainment-arts/sto...,latimes,general,us,2023-09-30T01:42:09+00:00,True,True,2,"Travis Kelce’s former girlfriend, Maya Benberr...",17.387883
3,TMZ Staff,Travis Kelce Arrives In Argentina Ahead of Tay...,Travis Kelce has touched down in Argentina -- ...,https://www.tmz.com/2023/11/10/travis-kelce-ar...,TMZ,entertainment,us,2023-11-10T18:15:59+00:00,True,True,3,update/n11:54 AM PT -- Huge bummer for Tayvis ...,-5.706278
4,Areeba Basharat,‘Need Taylor Swift’: Moments After the Devasta...,The first day out on the Italian greens turned...,https://www.essentiallysports.com/golf-news-pg...,Essentially Sports,sports,us,2023-09-29T19:06:25+00:00,False,True,4,Follow Us/nvia Reuters/nGolf – The 2023 Ryder ...,9.628437
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,TMZ Staff,Travis Kelce Opens Up About Taylor Swift Relat...,Travis Kelce is opening up about the one thing...,https://www.tmz.com/2023/09/27/travis-kelce-ta...,TMZ,entertainment,us,2023-09-27T13:32:19+00:00,True,True,495,Travis Kelce is opening up about the one thing...,7.650970
496,"Mary Whitfill Roeloffs, Forbes Staff",Taylor Swift’s ‘Eras Tour’ Film Rakes In Recor...,A film documenting Taylor Swift’s megatour ear...,https://www.forbes.com/sites/maryroeloffs/2023...,Forbes,general,us,2023-10-16T18:29:08+00:00,True,True,496,Taylor Swift’s concert film earned $92.8 milli...,-1.939834
497,Audrey Rock,David Beckham Addresses ‘Noise’ Surrounding Ta...,As speculation about Taylor's romance with the...,https://hollywoodlife.com/2023/10/03/david-bec...,hollywoodlife,general,us,2023-10-03T20:45:38+00:00,False,True,497,\n\t\t\t\tAs speculation about Taylor's romanc...,18.999955
498,,Taylor Swift enters her football era and break...,Taylor Swift enters her football era and break...,https://www.npr.org/2023/09/26/1201849542/tayl...,wnyc,general,us,2023-09-26T20:28:02+00:00,True,True,498,"By /n\n\n Mia Venkat\n \n/n, /n\n\n ...",2.886523


In [31]:
with open(PATH_TO_JSON_OUTPUT, 'w') as f:
    json.dump(json_copy, f, indent=2)

with open(PATH_TO_CSV_OUTPUT, 'w') as f:
    df.to_csv(f, index=True)


