In [1]:
import os
import sys

sys.path.insert(0, "../../")

from src.text.epu import EPU
from src.text.utils import generate_continous_df
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from tqdm import tqdm
from dateutil.parser import parse

from bokeh.layouts import Row, column, gridplot
from bokeh.models import (Title, Legend, ColumnDataSource, Select, HoverTool,
                          BoxZoomTool, ResetTool, DataTable, DateFormatter,
                          TableColumn)
from bokeh.models.layouts import TabPanel, Tabs
from bokeh.plotting import figure, output_file, show, output_notebook
# Send bokeh output to a standalone HTML file. sys.path[0] is the "../../"
# entry inserted above, so the target is resolved relative to the notebook's
# working directory.
output_file(
    filename=f"{sys.path[0]}docs/images/interactive/text/ep_sentiment.html"
)

In [2]:
# Input/output locations, built from sys.path[0] (the "../../" entry inserted
# in the import cell), so they are relative to the notebook's working directory.
parent_dirs = sys.path[0] + "data/text/"

# One entry per country folder under data/text/.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing order (and the resulting tab order) reproducible.
# - os.path.isdir(): skips stray files such as ".DS_Store" that would make the
#   later os.listdir(country) call fail.
# NOTE(review): marshall_islands is excluded on purpose; the reason is not
# visible in this notebook — confirm it is still wanted.
country_dirs = [
    parent_dirs + country
    for country in sorted(os.listdir(parent_dirs))
    if os.path.isdir(parent_dirs + country)
    and "marshall_islands" not in country
]
output_dir = sys.path[0] + "outputs/text/"

In [3]:
def sentiment_analysis(df):
    """Score every entry of ``df.news`` with NLTK's VADER analyzer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing a ``news`` column; each value is converted with
        ``str()`` before scoring, so non-string entries are scored too.

    Returns
    -------
    list
        One ``polarity_scores`` result per row, in row order. A progress bar
        sized to ``len(df)`` is shown while scoring.
    """
    analyzer = SentimentIntensityAnalyzer()
    # Wrapping the column in tqdm advances the bar once per scored item.
    return [
        analyzer.polarity_scores(str(text))
        for text in tqdm(df.news, total=len(df))
    ]

def calculate_sentiment(df):
    """Aggregate the sentiment of economy-flagged news into a monthly mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``news``, ``date`` and ``econ`` columns. Only rows whose
        ``econ`` value equals ``True`` are scored.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (month-start bins) and ``score`` (mean of the
        ``compound`` sentiment value of the rows falling in that month).
    """
    econ_news = df[df["econ"].eq(True)].reset_index(drop=True)

    # Keep only the "compound" value of each per-row score dict.
    compound = [scores["compound"] for scores in sentiment_analysis(econ_news)]

    # Parse whatever the raw value is, drop the time-of-day part, then turn
    # the plain dates back into pandas datetimes so they can be binned.
    day_only = econ_news["date"].map(lambda raw: parse(str(raw)).date())
    scored = econ_news.assign(score=compound, date=pd.to_datetime(day_only))

    monthly_bins = scored.set_index("date").groupby(pd.Grouper(freq="MS"))
    month_sent = monthly_bins["score"].mean().reset_index()

    return month_sent

In [4]:
# Build one monthly sentiment series per country and write each to
# <output_dir>/<country>/sentiment/<country>_sentiment.csv.
# sent_dfs collects (country_name, frame) pairs for the plotting cell.
sent_dfs = []
for country in country_dirs:
    country_name = country.split("/")[-1]
    news_dirs = [
        f"{country}/{file}" for file in os.listdir(country) if "news" in file
    ]

    e = EPU(news_dirs, cutoff=None)
    # presumably this is what adds the "econ"/"policy" flags read below —
    # verify against src.text.epu.EPU
    e.get_epu_category(
        subset_condition="date >= '2016-01-01' and date < '2024-01-01'")

    # Concatenate all source files in one call: growing a frame inside the
    # loop copies it on every iteration and starts from an empty DataFrame,
    # which recent pandas versions warn about when inferring result dtypes.
    dfs = pd.concat(
        [df[["news", "date", "econ", "policy"]] for _, df in e.raw_files],
        axis=0,
        ignore_index=True,
    )

    sent_df = calculate_sentiment(dfs)

    min_date = str(sent_df.date.min().date())
    max_date = str(sent_df.date.max().date())

    sent_df = generate_continous_df(sent_df, min_date, max_date, freq="MS")

    # Standardise the monthly score against this country's own mean/std.
    sent_mean, sent_std = sent_df.score.mean(), sent_df.score.std()
    sent_df["z_score"] = (sent_df["score"] - sent_mean) / sent_std
    sent_dfs.append((country_name, sent_df))

    # makedirs also creates a missing <output_dir>/<country>/ parent and is a
    # no-op when the folder already exists (os.mkdir fails in the former case).
    saved_folder = output_dir + f"{country_name}/sentiment/"
    os.makedirs(saved_folder, exist_ok=True)

    sent_df.to_csv(saved_folder + f"{country_name}_sentiment.csv", encoding="utf-8")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5065/5065 [00:15<00:00, 317.71it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 83/83 [00:00<00:00, 191.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1220/1220 [00:03<00:00, 398.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8180/8180 [00:16<00:00, 487.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8252/8252 [00:20<00:00, 409.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4430/4430 [00:09<00:00, 460.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████

In [6]:
# Build one dashboard tab per country: a line chart of the monthly sentiment
# score and its z-score, stacked above a table of the same data.
tabs = []
for name, df in sent_dfs:
    # NOTE(review): tonga is skipped on purpose; the reason is not visible in
    # this notebook — confirm it is still wanted.
    if name == "tonga":
        continue

    # Chart and table share one data source.
    source = ColumnDataSource(df)

    hover = HoverTool(tooltips=[('Date', '@date{%Y-%m}'),
                                ('Sentiment', '@score'),
                                ('Z-Score', '@z_score')],
                      formatters={'@date': 'datetime'})

    p = figure(height=400,
               width=750,
               x_axis_type="datetime",
               tools=[hover, BoxZoomTool(), ResetTool()])

    p.line("date",
           "score",
           source=source,
           name="sentiment",
           line_width=2,
           legend_label="Sentiment")

    # Renderer name now matches the plotted column; it previously carried a
    # leftover "epu_unweighted" label.
    p.line("date",
           "z_score",
           source=source,
           name="z_score",
           color='darkorange',
           line_width=1.5,
           legend_label="Z score")

    p.legend.location = "top_left"
    p.legend.click_policy = "mute"

    columns = [
        TableColumn(field="date", title="Date", formatter=DateFormatter()),
        TableColumn(field="score", title="Sentiment (Avg.)"),
        TableColumn(field="z_score", title="Z Score"),
    ]
    dt = DataTable(source=source, columns=columns, width=700, height=300)
    combined = column(p, dt)

    # Uppercase the first letter of each underscore-separated word. Slicing
    # with w[:1] (instead of w[0]) tolerates empty segments, e.g. from a
    # doubled or trailing underscore in the folder name.
    title = " ".join(w[:1].upper() + w[1:] for w in name.split("_"))
    tab = TabPanel(child=combined, title=title)
    tabs.append(tab)

show(Tabs(tabs=tabs))