In [1]:
import os
import sys

sys.path.insert(0, "../../")

from src.text.epu import EPU
from src.text.utils import generate_continous_df
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from tqdm import tqdm
from dateutil.parser import parse

from bokeh.layouts import Row, column, gridplot
from bokeh.models import (Title, Legend, ColumnDataSource, Select, HoverTool,
                          BoxZoomTool, ResetTool, DataTable, DateFormatter,
                          TableColumn, LinearAxis, LegendItem, CustomJS, Toggle)
from bokeh.models.layouts import TabPanel, Tabs
from bokeh.plotting import figure, output_file, show, output_notebook
from bokeh.palettes import Category20

# Send every Bokeh document shown later in this notebook to a standalone HTML
# file. sys.path[0] is the "../../" entry inserted at the top of this cell, so
# the target path is resolved relative to that directory.
output_file(
    filename=f"{sys.path[0]}docs/images/interactive/text/ep_sentiment.html")

In [2]:
# Folder that contains one sub-directory of text data per country.
parent_dirs = sys.path[0] + "data/text/"

# One path per country directory.
# * Sorted, because os.listdir returns entries in arbitrary order; without it
#   the processing order (and therefore the colour, line style and
#   default-visible series picked downstream) can differ between machines.
# * Restricted to directories, so a stray file (".DS_Store", a README, ...)
#   cannot reach the later os.listdir(country) call and make it fail.
# * "marshall_islands" folders are deliberately left out, as before.
country_dirs = [
    parent_dirs + country for country in sorted(os.listdir(parent_dirs))
    if os.path.isdir(parent_dirs + country) and country != ".DS_Store"
    and "marshall_islands" not in country
]

# Folder under which the per-country result files are written.
output_dir = sys.path[0] + "outputs/text/"

In [3]:
def sentiment_analysis(df):
    """Score every entry of ``df.news`` with NLTK's VADER analyzer.

    Each value is converted with ``str()`` before it is scored, and a tqdm
    progress bar sized to ``len(df)`` advances once per scored entry.

    Args:
        df: Object exposing a ``news`` iterable and supporting ``len()``
            (a pandas DataFrame at the call site in this notebook).

    Returns:
        list: One ``polarity_scores`` result per entry, in input order.
    """
    analyzer = SentimentIntensityAnalyzer()
    with tqdm(total=len(df)) as progress:

        def score_one(text):
            # Score first, then tick the bar, so the bar counts finished items.
            polarity = analyzer.polarity_scores(str(text))
            progress.update(1)
            return polarity

        scored = [score_one(text) for text in df.news]
    return scored


def calculate_sentiment(df):
    """Compute monthly and overall sentiment for econ-and-policy rows.

    Only rows where both ``econ`` and ``policy`` are truthy are kept. Each
    kept row is scored via ``sentiment_analysis`` and its ``"compound"`` value
    becomes the row's ``score``. Dates are parsed, truncated to calendar days
    and grouped into month-start (``"MS"``) buckets.

    Args:
        df: DataFrame with ``news``, ``date``, ``econ`` and ``policy`` columns.
            It is not modified; the work happens on a filtered copy.

    Returns:
        tuple: ``(month_sent, mean, std)`` where ``month_sent`` is a DataFrame
        with ``date`` and ``score`` (monthly mean) columns, and ``mean`` /
        ``std`` are taken over the scores of all kept rows.
    """
    is_econ_policy = (df.econ) & (df.policy)
    articles = df[is_econ_policy].reset_index(drop=True)

    polarity = sentiment_analysis(articles)
    articles["score"] = [entry["compound"] for entry in polarity]

    # Drop any time-of-day component before converting back to datetimes.
    calendar_days = articles["date"].apply(lambda raw: parse(str(raw)).date())
    articles["date"] = pd.to_datetime(calendar_days)

    by_date = articles.set_index("date")
    monthly_mean = by_date.groupby(pd.Grouper(freq="MS"))[["score"]].mean()
    month_sent = monthly_mean.reset_index()

    overall_mean = articles.score.mean()
    overall_std = articles.score.std()
    return month_sent, overall_mean, overall_std

In [4]:
# For every country: load its news files, score the econ/policy articles,
# build a gap-free monthly sentiment series with z-scores, and save it to CSV.
# Each entry of sent_dfs is (country_name, monthly frame, overall mean score).
sent_dfs = []
for country in country_dirs:
    country_name = country.split("/")[-1]
    news_dirs = [
        f"{country}/{file}" for file in os.listdir(country)
        if "news" in file and "ner" not in file
    ]

    e = EPU(news_dirs, cutoff=None)
    e.get_epu_category(
        subset_condition="date >= '2016-01-01' and date < '2024-01-01'")

    # Collect the needed columns from every raw file and concatenate once;
    # growing a frame with concat inside the loop re-copies all earlier rows
    # on each iteration.
    frames = [df[["news", "date", "econ", "policy"]] for _, df in e.raw_files]
    if not frames:
        # Fail here with the country name rather than with an unrelated
        # attribute error further down.
        raise ValueError(f"no news files loaded for {country_name}")
    dfs = pd.concat(frames, axis=0, ignore_index=True)

    sent_df, sent_mean, sent_std = calculate_sentiment(dfs)

    min_date = str(sent_df.date.min().date())
    max_date = str(sent_df.date.max().date())

    sent_df = generate_continous_df(sent_df, min_date, max_date, freq="MS")
    # Standardise against the mean/std of the individual article scores
    # (not of the monthly averages).
    sent_df["z_score"] = (sent_df["score"] - sent_mean) / sent_std
    sent_dfs.append((country_name, sent_df, sent_mean))

    saved_folder = output_dir + f"{country_name}/sentiment/"
    # makedirs also creates the missing "<country>/" parent and is a no-op
    # when the folder already exists.
    os.makedirs(saved_folder, exist_ok=True)

    sent_df.to_csv(saved_folder + f"{country_name}_sentiment.csv",
                   encoding="utf-8")

100%|███████████████████████████████████████████████████████████████████████████████████| 4474/4474 [00:15<00:00, 294.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 68/68 [00:00<00:00, 184.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 963/963 [00:02<00:00, 350.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7093/7093 [00:15<00:00, 462.35it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 6646/6646 [00:17<00:00, 381.18it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3469/3469 [00:08<00:00, 419.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 6967/6967 [00:23<00:00, 298.41it/s]


In [5]:
# Build the "Comparison" tab: one toggleable z-score line per country plus a
# table of the same values. "tonga" is excluded from the comparison.
tabs = []

# Outer-join the per-country z-score series on date, one column per country.
z_score_df = pd.DataFrame()
for name, df, sentiment in sent_dfs:
    if name != "tonga":
        temp = df[["date", "z_score"]].rename({"z_score": name}, axis=1)
        if z_score_df.empty:
            z_score_df = temp
        else:
            z_score_df = z_score_df.merge(temp, how="outer", on="date")

z_score_df = z_score_df.round(3)
source = ColumnDataSource(z_score_df)
# All lines share one data source instead of each p.line call building its own
# copy of the frame. It is kept separate from the table's `source` so that
# selecting rows in the table cannot change the lines' selection state.
line_source = ColumnDataSource(z_score_df)
# "@$name" resolves to the column named after the hovered renderer, i.e. the
# country whose line is under the cursor.
hover = HoverTool(tooltips=[('Date', '@date{%Y-%m}'), ('Z-Score', '@$name')],
                  formatters={'@date': 'datetime'})
p = figure(height=400,
           width=750,
           x_axis_type="datetime",
           tools=[hover, BoxZoomTool(), ResetTool()])

columns = [
    TableColumn(field="date", title="Date", formatter=DateFormatter()),
]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
line_styles = ['solid', 'dashed', 'dotted', 'dotdash', 'dashdot', 'solid']
toggles = []

for (idx, name) in enumerate(z_score_df.columns[1:]):
    if name != "tonga":
        # Title-case each underscore-separated word; w[:1] (rather than w[0])
        # tolerates an empty segment such as a doubled underscore.
        pretty_name = " ".join(w[:1].upper() + w[1:]
                               for w in name.split("_"))
        # Cycle through the palettes so more countries than palette entries
        # reuse styles instead of raising IndexError.
        line = p.line("date",
                      name,
                      source=line_source,
                      name=name,
                      line_width=2,
                      color=colors[idx % len(colors)],
                      line_dash=line_styles[idx % len(line_styles)])
        # Only the first series is shown initially; the rest are switched on
        # through their toggle buttons.
        line.visible = idx < 1
        toggle = Toggle(label=f"{pretty_name}",
                        button_type="light",
                        active=(idx < 1))
        toggle.js_link('active', line, 'visible')

        toggles.append(toggle)
        columns.append(TableColumn(field=name, title=pretty_name))

dt = DataTable(source=source, columns=columns, width=720, height=300)
combined = column(Row(*toggles), p, dt)
tabs.append(TabPanel(child=combined, title="Comparison"))

In [6]:
# Add one tab per country (except "tonga"): the monthly sentiment line, a
# dashed reference line at the country's overall mean score, and a data table.
for name, df, sentiment in sent_dfs:
    if name != "tonga":
        df = df.round(3)
        source = ColumnDataSource(df)

        hover = HoverTool(tooltips=[('Date', '@date{%Y-%m}'),
                                    ('Sentiment', '@score')],
                          formatters={'@date': 'datetime'})

        p = figure(height=400,
                   width=750,
                   y_range=(df.score.min() - 0.1, df.score.max() + 0.1),
                   x_axis_type="datetime",
                   tools=[hover, BoxZoomTool(),
                          ResetTool()])

        # Monthly average sentiment series.
        sentiment_line = p.line("date",
                                "score",
                                source=source,
                                name="sentiment",
                                line_width=2,
                                legend_label="Sentiment")
        # Limit the tooltip to the series: the mean line below carries no
        # "date"/"score" columns, so hovering it would have nothing to show.
        hover.renderers = [sentiment_line]

        # Horizontal reference line at the overall mean score stored for this
        # country in sent_dfs.
        p.hspan(y=sentiment, line_width=2, line_dash=["dashed"])
        p.legend.location = "top_left"
        p.legend.click_policy = "mute"

        columns = [
            TableColumn(field="date", title="Date", formatter=DateFormatter()),
            TableColumn(field="score", title="Sentiment (Avg.)")
        ]
        dt = DataTable(source=source, columns=columns, width=720, height=300)
        combined = column(p, dt)

        # Title-case each underscore-separated word of the country name;
        # w[:1] (rather than w[0]) tolerates an empty segment.
        title = " ".join(w[:1].upper() + w[1:] for w in name.split("_"))
        tab = TabPanel(child=combined, title=title)
        tabs.append(tab)

show(Tabs(tabs=tabs))