In [1]:
import requests
import re
import json
import os

import numpy as np
import pandas as pd

from collections import Counter

In [2]:
# Download hedonometer word happiness scores
fname = "hedonometer_words.json"

if not os.path.exists(fname):
    r = requests.get("https://hedonometer.org/api/v1/words/?format=json&wordlist__title=labMT-en-v2")
    hedonometer = r.json()

    with open(fname, 'w') as f:
        json.dump(hedonometer, f)
else:
    with open(fname) as f:
        hedonometer = json.load(f)

In [3]:
hedonometer['objects'][0]

{'happs': 5.1,
 'rank': 6648,
 'stdDev': 0.99,
 'text': 'according',
 'word': 'according',
 'word_english': 'according',
 'wordlist': {'date': '2020-03-28',
  'language': 'en',
  'reference': 'https://arxiv.org/abs/2003.12614',
  'referencetitle': "How the world's collective attention is being paid to a pandemic: COVID-19 related 1-gram time series for 24 languages on Twitter",
  'title': 'labMT-en-v2'}}

In [4]:
scores = {object['word']: object['happs'] for object in hedonometer['objects']}

In [5]:
list(scores.items())[:10]

[('according', 5.1),
 ('administrative', 5.1),
 ('albert', 5.1),
 ('alleen', 5.1),
 ('allen', 5.1),
 ('ave', 5.1),
 ('average', 5.1),
 ('bases', 5.1),
 ('before', 5.1),
 ('bellwether', 5.1)]

In [6]:
# The hedonometer filters all words out that have 4<happiness<6 to increase the sensitivity
filtered_scores = {object['word']: object['happs'] for object in hedonometer['objects']
                   if not (4 < object['happs'] < 6)}

In [7]:
for word in ["can't", "cannot", "book", "books"]: 
    print(f"{word}: {filtered_scores[word]}")

can't: 3.42
cannot: 3.32
book: 7.24
books: 7.28


In [8]:
articles = pd.read_csv('nz-articles.csv')

# parse dates
articles['Datetime'] = pd.to_datetime(articles['Datetime'], format="%Y-%m-%dT%H:%M:%SZ")

In [9]:
articles.head()

Unnamed: 0,Datetime,URL,Text
0,2021-03-06 22:41:36,https://www.nzherald.co.nz/nz/covid-19-coronav...,National leader Judith Collins says there has ...
1,2021-03-06 22:45:37,https://www.nzherald.co.nz/world/taiwan-in-imm...,"""Taiwan is the next big prize"" now Beijing had..."
2,2021-03-06 22:45:42,https://www.nzherald.co.nz/world/covid-19-coro...,An exhausted US Senate has narrowly approved a...
3,2021-03-06 22:49:38,https://www.nzherald.co.nz/entertainment/wanda...,Imagine being trapped in the confines of your ...
4,2021-03-06 22:49:43,https://www.nzherald.co.nz/hawkes-bay-today/ne...,"One lucky Napier player has won $500,000 as pa..."


In [10]:
for url in articles[articles['Text'].isna()]['URL']: print(url)

https://pressf1.pcworld.co.nz/showthread.php?149628-Earthquake&s=0991d0b9a317f0331876d94f79ffcce1&p=1304718
https://pressf1.pcworld.co.nz/showthread.php?149629-More-OMG-Tech-Support-Pics&s=0991d0b9a317f0331876d94f79ffcce1&p=1304723
https://pressf1.pcworld.co.nz/showthread.php?149626-Prime-Time&s=c403163e6cd075bd42b0f09f6159daf9&p=1304709
https://www.philips.co.nz/c-m-ho/cooking-accessories/all
https://pressf1.pcworld.co.nz/showthread.php?149630-CCLEANER-issues&s=9aeda749d08133cb7c4d5892806291d6&p=1304730
https://www.philips.co.nz/p-p/DIS602_01/zoom-nitewhite-take-home-whitening-treatment/support
http://times-age.co.nz/1265-2/
https://pressf1.pcworld.co.nz/showthread.php?149633-LG-plasma-42PG10R-Disappeared-from-Input-Option&s=cff20dc01fd40835685b5ca8042f34a3&p=1304757
https://pressf1.pcworld.co.nz/showthread.php?149632-Darn-missed-out-again&s=9aeda749d08133cb7c4d5892806291d6&p=1304736


In [14]:
def get_happiness_score(text, word_scores):
    # Finds all words (including those with contractions)
    words = re.findall(r"[\w']+", text)
    word_counts = Counter(words)
    num_words = sum([count for word, count in word_counts.items() 
                     if word in word_scores.keys()])
    
    happiness_score = 0
    for word in word_counts:
        if word in word_scores:
            norm_freq = word_counts[word] / num_words
            happiness_score += word_scores[word] * norm_freq
    
    if happiness_score == 0:
        happiness_score = 5  # the text is completely neutral
        
    return happiness_score

In [16]:
# articles['Happiness'] = articles['Text'].map(get_happiness_score)
articles['Happiness'] = [get_happiness_score(text_i, filtered_scores) for text_i in articles['Text']
                         if not text_i.isna()
                         else np.nan]

SyntaxError: invalid syntax (<ipython-input-16-1195265cdfd8>, line 4)

In [25]:
g = pd.read_csv('nz-articles_old.csv')
g[g['URL'] == "http://times-age.co.nz/1265-2/"]['Text']

1615     Add a comment10:46 AM Friday Aug 26, 2016  Yo...
Name: Text, dtype: object

In [None]:
print(
    [word for word, count in Counter(re.findall(r"[\w']+", articles.loc[3, 'Text'])).items() 
     if word in filtered_scores.keys()]
)

In [None]:
get_happiness_score("book", filtered_scores)

In [None]:
# Get daily averages
articles.resample('D', on='Datetime')['Happiness'].mean().dropna()

In [None]:
articles.shape

In [None]:
len(os.listdir("CC-NEWS warc gz"))