In [14]:
import csv, json
import io
import os
import re
from collections import Counter

import boto3
import gensim
import numpy as np
import pandas as pd
import requests
from pandarallel import pandarallel

In [2]:
# Download the hedonometer (labMT-en-v2) word happiness scores once and cache
# them on disk, so re-running the notebook reads the local copy instead of
# calling the API again.
fname = "hedonometer_words.json"

if not os.path.exists(fname):
    r = requests.get(
        "https://hedonometer.org/api/v1/words/?format=json&wordlist__title=labMT-en-v2",
        timeout=60,  # do not hang the kernel forever on a stalled connection
    )
    # Fail loudly on an HTTP error rather than caching an error payload that
    # every later run would silently load from disk.
    r.raise_for_status()
    hedonometer = r.json()

    with open(fname, 'w') as f:
        json.dump(hedonometer, f)
else:
    with open(fname) as f:
        hedonometer = json.load(f)

In [3]:
# Happiness score ('happs') for every word in the downloaded word list.
scores = {entry['word']: entry['happs'] for entry in hedonometer['objects']}

# The hedonometer filters out all words with 4 < happiness < 6 to increase
# sensitivity; words scoring exactly 4 or exactly 6 are kept.
filtered_scores = {
    entry['word']: entry['happs']
    for entry in hedonometer['objects']
    if not 4 < entry['happs'] < 6
}

In [4]:
# Spot-check the filtered scores for a few words (contractions and plurals).
for word in ("can't", "cannot", "book", "books"):
    print(f"{word}: {filtered_scores[word]}")

can't: 3.42
cannot: 3.32
book: 7.24
books: 7.28


In [11]:
%%time
def read_csv_from_s3(bucket, key, s3_client=None):
    """Download one CSV object from S3 and return it as a DataFrame.

    The first CSV row supplies the column names and the remaining rows the
    data. No type inference is done: every value is kept as a string.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    key : str
        Key of the CSV object inside the bucket.
    s3_client : optional
        Object exposing ``get_object(Bucket=..., Key=...)`` whose response
        ``'Body'`` supports ``iter_chunks()``. When omitted, a new client is
        created from the notebook-level ``sess`` session.

    Returns
    -------
    pandas.DataFrame
        One row per CSV record; an empty frame if the object has no rows.
    """
    if s3_client is None:
        s3_client = sess.client('s3')
    resp = s3_client.get_object(Bucket=bucket, Key=key)
    # Python 3.8/3.9 can't download files over 2GB via HTTP in one read, so
    # the body is fetched chunk by chunk and joined afterwards.
    csv_str = b''.join(resp['Body'].iter_chunks()).decode('utf-8')

    # Let the csv module see the text through a newline='' stream: lines are
    # then split only on \n, \r and \r\n, and line breaks inside quoted Text
    # fields are preserved. str.splitlines() would also split on characters
    # such as \x0c, \x85 or \u2028, breaking records whose unquoted fields
    # contain them.
    rows = list(csv.reader(io.StringIO(csv_str, newline=''), quotechar='"'))
    if not rows:
        # Empty object: there is no header row to take column names from.
        return pd.DataFrame()

    return pd.DataFrame(rows[1:], columns=rows[0])

# Load every processed Common Crawl bunch for CC-MAIN-2021-10 into one frame.
sess = boto3.Session(profile_name="xmiles")
s3 = sess.client('s3')

# A single list_objects_v2 call returns at most 1000 keys, so page through the
# listing. Pages for a prefix with no objects carry no 'Contents' entry.
output_keys = [
    obj['Key']
    for page in s3.get_paginator('list_objects_v2').paginate(
        Bucket="statsnz-covid-xmiles",
        Prefix="commoncrawl/processed_ccmain_bunches/CC-MAIN-2021-10/",
    )
    for obj in page.get('Contents', [])
]
if not output_keys:
    raise FileNotFoundError(
        "No objects found under "
        "s3://statsnz-covid-xmiles/commoncrawl/processed_ccmain_bunches/CC-MAIN-2021-10/"
    )

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
ccmain = pd.concat(
    (read_csv_from_s3("statsnz-covid-xmiles", key) for key in output_keys),
    ignore_index=True,
)
ccmain['Datetime'] = pd.to_datetime(ccmain['Datetime'], format="%Y-%m-%dT%H:%M:%S.000Z")

CPU times: user 1min 4s, sys: 8.25 s, total: 1min 12s
Wall time: 1min 53s


In [12]:
# Summarise the combined frame: row count, column dtypes and memory usage.
ccmain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1676767 entries, 0 to 5580
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype         
---  ------    --------------    -----         
 0   Datetime  1676767 non-null  datetime64[ns]
 1   URL       1676767 non-null  object        
 2   Text      1676767 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 51.2+ MB


In [13]:
# Preview the first rows of the combined frame.
ccmain.head()

Unnamed: 0,Datetime,URL,Text
0,2021-03-05 13:19:47,http://0800treetrim.nz/,Trees And First Impressions When it comes to R...
1,2021-03-05 12:38:51,http://0800treetrim.nz/author/treeguys/,Tree Care And Tree Selection\n\nTree Care star...
2,2021-03-05 12:52:16,http://0800treetrim.nz/category/treework/,Tree Care And Tree Selection\n\nTree Care star...
3,2021-03-05 13:08:37,http://0800treetrim.nz/coming-soon/,Site under construction.\n\nNew tree service c...
4,2021-03-05 14:15:11,http://0800treetrim.nz/faqs/,Why 0800 Tree Trim Ltd is topping the competit...


In [34]:
%%time
# Benchmark: tokenise a sample of the 'Text' column with gensim's
# simple_preprocess. .loc slices by label and includes the end label, so with
# a 0-based integer index this covers rows 0..10000.
x = ccmain.loc[:10000, 'Text'].map(gensim.utils.simple_preprocess)

CPU times: user 3.11 s, sys: 124 ms, total: 3.24 s
Wall time: 3.24 s


In [35]:
%%time
# Benchmark: tokenise the same sample with a plain regex (word characters and
# apostrophes), lower-casing each token, for comparison with gensim.
y = [
    [token.lower() for token in re.findall(r"[\w']+", document)]
    for document in ccmain.loc[:10000, 'Text']
]

CPU times: user 1.11 s, sys: 28.7 ms, total: 1.14 s
Wall time: 1.14 s


In [None]:
%%time
# Tokenise every page's text with gensim's simple_preprocess and keep the
# result in a new 'words' column.
ccmain['words'] = ccmain['Text'].map(gensim.utils.simple_preprocess)

In [None]:
def get_happiness_score(text, word_scores, neutral_score=5):
    """Return the frequency-weighted mean happiness of the scored words in a text.

    The text is split into tokens made of word characters and apostrophes (so
    contractions such as "can't" stay whole). Each token is lower-cased before
    it is looked up, so capitalised words ("Book") match their lower-case
    entry in ``word_scores``. Tokens without a score are ignored.

    Parameters
    ----------
    text : str
        Text to score.
    word_scores : Mapping[str, float]
        Happiness score per (lower-case) word.
    neutral_score : float, optional
        Value returned when no token in ``text`` has a score. Defaults to 5.

    Returns
    -------
    float
        Sum of ``score * count`` over the scored words divided by the number
        of scored tokens, or ``neutral_score`` if there are none.
    """
    # Count only the tokens that actually have a score.
    scored_counts = Counter(
        token
        for token in (raw.lower() for raw in re.findall(r"[\w']+", text))
        if token in word_scores
    )
    num_words = sum(scored_counts.values())

    if num_words == 0:
        # Nothing to average: treat the text as completely neutral. Checking
        # the count (rather than a zero score) also avoids dividing by zero.
        return neutral_score

    weighted_total = sum(word_scores[word] * count
                         for word, count in scored_counts.items())
    return weighted_total / num_words

In [None]:
# Score every page's text against the filtered hedonometer word list.
ccmain['Happiness'] = [get_happiness_score(text_i, filtered_scores) for text_i in ccmain['Text']]

In [None]:
# Show which distinct words of the page at index label 3 have a filtered
# score, in order of first appearance.
print([
    token
    for token in Counter(re.findall(r"[\w']+", ccmain.loc[3, 'Text']))
    if token in filtered_scores
])

In [None]:
# Sanity check: a text made of one scored word should return that word's own
# score (7.24 for "book" in the filtered list).
get_happiness_score("book", filtered_scores)

In [None]:
# Mean happiness per calendar day of 'Datetime'; days whose mean is NaN
# (for example days with no pages) are dropped.
(
    ccmain
    .resample('D', on='Datetime')['Happiness']
    .mean()
    .dropna()
)