In [1]:
import requests
import re
import json
import os

import numpy as np
import pandas as pd

from collections import Counter

In [2]:
# Download hedonometer word happiness scores
fname = "hedonometer_words.json"

if not os.path.exists(fname):
    r = requests.get("https://hedonometer.org/api/v1/words/?format=json&wordlist__title=labMT-en-v2")
    hedonometer = r.json()

    with open(fname, 'w') as f:
        json.dump(hedonometer, f)
else:
    with open(fname) as f:
        hedonometer = json.load(f)

In [3]:
hedonometer['objects'][0]

{'happs': 5.1,
 'rank': 6648,
 'stdDev': 0.99,
 'text': 'according',
 'word': 'according',
 'word_english': 'according',
 'wordlist': {'date': '2020-03-28',
  'language': 'en',
  'reference': 'https://arxiv.org/abs/2003.12614',
  'referencetitle': "How the world's collective attention is being paid to a pandemic: COVID-19 related 1-gram time series for 24 languages on Twitter",
  'title': 'labMT-en-v2'}}

In [4]:
scores = {object['word']: object['happs'] for object in hedonometer['objects']}

In [5]:
list(scores.items())[:10]

[('according', 5.1),
 ('administrative', 5.1),
 ('albert', 5.1),
 ('alleen', 5.1),
 ('allen', 5.1),
 ('ave', 5.1),
 ('average', 5.1),
 ('bases', 5.1),
 ('before', 5.1),
 ('bellwether', 5.1)]

In [6]:
# The hedonometer filters all words out that have 4<happiness<6 to increase the sensitivity
filtered_scores = {object['word']: object['happs'] for object in hedonometer['objects']
                   if not (4 < object['happs'] < 6)}

In [7]:
for word in ["can't", "cannot", "book", "books"]: 
    print(f"{word}: {filtered_scores[word]}")

can't: 3.42
cannot: 3.32
book: 7.24
books: 7.28


In [8]:
articles = pd.read_csv('cc-nz-articles.csv')

# parse dates
articles['Datetime'] = pd.to_datetime(articles['Datetime'], format="%Y-%m-%dT%H:%M:%SZ")

In [9]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1709 entries, 0 to 1708
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Datetime  1709 non-null   datetime64[ns]
 1   URL       1709 non-null   object        
 2   Text      1709 non-null   object        
dtypes: datetime64[ns](1), object(2)
memory usage: 40.2+ KB


In [10]:
def get_happiness_score(text, word_scores):
    # Finds all words (including those with contractions)
    words = re.findall(r"[\w']+", text)
    word_counts = Counter(words)
    num_words = sum([count for word, count in word_counts.items() 
                     if word in word_scores.keys()])
    
    happiness_score = 0
    for word in word_counts:
        if word in word_scores:
            norm_freq = word_counts[word] / num_words
            happiness_score += word_scores[word] * norm_freq
    
    if happiness_score == 0:
        happiness_score = 5  # the text is completely neutral
        
    return happiness_score

In [11]:
# articles['Happiness'] = articles['Text'].map(get_happiness_score)
articles['Happiness'] = [get_happiness_score(text_i, filtered_scores) for text_i in articles['Text']]

In [12]:
print(
    [word for word, count in Counter(re.findall(r"[\w']+", articles.loc[3, 'Text'])).items() 
     if word in filtered_scores.keys()]
)

['windows', 'problem', 'raised', 'smashed', 'up', 'broken', 'active', 'not', 'good', 'well', 'damage', 'done', 'think', 'rest', 'community', 'ownership', 'failed', 'huge', 'costs', 'all', 'no', 'proposal', 'save', 'preserved', "couldn't", 'ahead', 'issues', 'service', 'parade', 'will', 'band', 'war', 'traffic', 'plans', 'concert', 'future', 'information', 'trading', 'enable', 'share', 'help', 'towns', 'invited', 'contact', 'tour', 'meet', 'opportunities', 'thanks', 'food', 'tree', 'donated', 'toward', 'new', 'homes', 'honour', 'lost', 'lives', 'tribute', 'country']


In [13]:
get_happiness_score("book", filtered_scores)

7.24

In [14]:
# Get daily averages
articles.resample('D', on='Datetime')['Happiness'].mean().dropna()

Datetime
2021-03-04    5.840926
2021-03-05    5.855561
2021-03-06    5.770108
2021-03-07    5.979881
Freq: D, Name: Happiness, dtype: float64