In [1]:
import os
os.chdir("../../../")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy

from statsmodels.tsa.vector_ar.vecm import coint_johansen
from scripts.python.tsa.ts_utils import *
from statsmodels.tsa.api import VAR

In [2]:
# Directory prefix for the news CSVs, built from the current working directory
# (which the first cell changes with os.chdir).
folderpath = f"{os.getcwd()}/data/text/solomon_islands/"
# spaCy English pipeline; extract_entities calls it to run NER on article text.
nlp = spacy.load("en_core_web_sm")

def process_data(filename, folderpath=folderpath):
    """Load one outlet's news CSV and add a year-month grouping key.

    Parameters
    ----------
    filename : str
        CSV file name; it is appended directly to ``folderpath``.
    folderpath : str
        Directory prefix (expected to end with a path separator). Defaults to
        the module-level ``folderpath``.

    Returns
    -------
    pd.DataFrame
        The CSV contents without the ``"Unnamed: 0"`` column, with line breaks
        in ``news`` replaced by spaces, ``date`` parsed to datetime and a
        ``ym`` column formatted as ``"<year>-<month>"`` (month not
        zero-padded).

    Raises
    ------
    KeyError
        If the CSV has no ``"Unnamed: 0"``, ``"news"`` or ``"date"`` column.
    """
    df = pd.read_csv(folderpath + filename).drop("Unnamed: 0", axis=1)
    # Series.replace("\n", ...) only swaps cells whose whole value is "\n";
    # the .str accessor is required to act on line breaks inside the text.
    # A space is used so words on either side of a break are not fused, which
    # would break multi-word term matching and tokenisation.
    df["news"] = df["news"].str.replace("\n", " ", regex=False)
    df["date"] = pd.to_datetime(df["date"])
    df["ym"] = [f"{d.year}-{d.month}" for d in df["date"]]
    return df


def get_news_count(data: pd.DataFrame, column: str):
    """Count non-null entries of ``column`` for each ``ym`` group.

    ``data`` must contain ``date`` and ``ym`` columns. The result has one row
    per ``ym`` value with the columns ``ym`` and ``<column>_count``.
    """
    col_name = str(column)
    by_month = data.set_index("date").groupby("ym")
    counts = by_month[[col_name]].count()
    flat = counts.reset_index()
    return flat.rename(columns={col_name: col_name + "_count"})


def check_epu_category(row, terms):
    """Return True when any of ``terms`` occurs as a substring of ``str(row)``."""
    text = str(row)
    for term in terms:
        if term in text:
            return True
    return False


def get_epu_zscore(data: pd.DataFrame, 
                   cutoff_index: int,
                   epu_col: str = "epu_count",
                   news_col: str = "news_count") -> pd.Series:
    """Standardise the EPU share of articles against a baseline window.

    The share is ``data[epu_col] / data[news_col]``. Its mean and standard
    deviation are taken over the slice before ``cutoff_index``, and the
    absolute deviation of every row from that mean is divided by that
    standard deviation.
    """
    share = data[epu_col] / data[news_col]
    baseline = share[:cutoff_index]
    centre = baseline.mean()
    spread = baseline.std()
    deviation = (share - centre).abs()
    return deviation.div(spread)

In [3]:
# Load the four outlet CSVs in the same order as before: ss, st, tis, sibc.
ss, st, tis, sibc = (
    process_data(csv_name)
    for csv_name in (
        "solomon_stars_news.csv",
        "solomon_times_news.csv",
        "island_sun_news.csv",
        "sibc_news.csv",
    )
)

In [4]:
def extract_entities(corpus: str):
    """Collect the distinct named entities of selected types from ``corpus``.

    The text is run through the module-level spaCy pipeline ``nlp``. Entities
    labelled ``LOC``, ``ORG``, ``GPE`` or ``WORK_OF_ART`` are kept, each
    distinct entity text once per label, in order of first appearance.

    Returns
    -------
    dict
        Maps each of the four labels to a ``", "``-joined string of entity
        texts (an empty string when none were found).
    """
    found = {"LOC": [], "ORG": [], "GPE": [], "WORK_OF_ART": []}
    for ent in nlp(corpus).ents:
        texts = found.get(ent.label_)
        # Membership is tested against the list of whole entity texts. Testing
        # against the joined string would treat "Solomon" as a duplicate of
        # "Solomon Islands" and silently drop it.
        if texts is not None and ent.text not in texts:
            texts.append(ent.text)
    return {label: ", ".join(texts) for label, texts in found.items()}

In [None]:
from tqdm import tqdm

# Placeholder used for rows whose "news" value is not a string (e.g. NaN).
missing_entities = {
    "LOC": "Missing",
    "ORG": "Missing",
    "GPE": "Missing",
    "WORK_OF_ART": "Missing"
}

for data, name in zip([st], ["solomon_times_ner"]):
    output = []
    for news in tqdm(data["news"], total=len(data)):
        if isinstance(news, str):
            output.append(extract_entities(news))
        else:
            # Copy so every row owns its own dict.
            output.append(dict(missing_entities))
    # Reuse the articles' index so the column-wise concat lines up row by row.
    ner_df = pd.DataFrame(output, index=data.index)
    # Concatenate the entity frame; passing the raw list of dicts is rejected
    # by pd.concat.
    data = pd.concat([data, ner_df], axis=1)
#     data.to_csv(folderpath+name+".csv", encoding="utf-8")

In [5]:
# Keyword lists used with check_epu_category, which does plain substring
# matching on lower-cased article text (so e.g. "tax" also matches "taxi").

# Economy-related terms.
econ_lst = [
    "economy", "economic", "economics", "business", "commerce", "finance",
    "financial", "industry", "food"
]

# Uncertainty-related terms.
uncertain_lst = [
    "uncertain", "uncertainty", "uncertainties", "unknown", "unstable",
    "unsure", "undetermined", "risky", "not certain", "non-reliable"
]

# Policy / institution terms.
policy_lst = [
    "government", "governmental", "authorities", "minister", "ministry",
    "parliament", "parliamentary", "tax", "regulation", "legislation",
    "central bank", "cbsi", "imf", "world bank", "international monetary fund",
    "debt"
]

# Place names used to flag articles as concerning the Solomon Islands.
solomon_lst = [
    "solomon", "solomon islands", "honiara", "central province", "tulagi",
    "choiseul", "taro", "guadalcanal", "isabel province", "buala",
    "malaita", "auki", "kirakira", "makira-ulawa", "makira ulawa",
    "rennell and bellona", "tigoa", "temotu", "lata", "western province",
    "gizo"
]

In [6]:
# Earliest and latest article dates across the st, ss and tis frames.
# NOTE(review): sibc is not part of either bound — confirm that is intended.
mindate = min(st.date.min(), ss.date.min(), tis.date.min())
# Each of the three frames must appear exactly once, mirroring mindate.
maxdate = max(st.date.max(), ss.date.max(), tis.date.max())
# "<year>-<month>" strings, later passed to pd.date_range.
start = f"{mindate.year}-{mindate.month}"
end = f"{maxdate.year}-{maxdate.month}"

In [7]:
# Last month included in the baseline used to standardise each outlet's share.
baseline_end = pd.Timestamp("2019-12-01")

# One row per month; per-outlet statistics are merged in as prefixed columns.
stats = pd.DataFrame(pd.date_range(start, end, freq="MS"), columns=["date"])
for nps, np_name in zip([st, ss, tis, sibc], ["st", "ss", "tis", "sibc"]):
    nps["news"] = nps["news"].str.lower()
    np_count = get_news_count(nps, "news")

    # The place-name flag does not depend on the category, so it is computed
    # once per outlet. Every "ss" article is flagged unconditionally.
    if np_name != "ss":
        nps["solomon"] = nps["news"].apply(check_epu_category,
                                           terms=solomon_lst)
    else:
        nps["solomon"] = True

    for (category, term_lst) in zip(["econ", "uncertain", "policy"],
                                    [econ_lst, uncertain_lst, policy_lst]):
        nps[str(category)] = nps["news"].apply(check_epu_category,
                                               terms=term_lst)

    # An article counts towards EPU only when all four flags are set.
    nps["epu"] = ((nps.econ == True) & (nps.uncertain == True) &
                  (nps.policy == True) & (nps.solomon == True))
    np_epu_count = get_news_count(nps[nps.epu == True], column="epu")
    np_stat = np_count.merge(np_epu_count, how="left").fillna(0)
    np_stat["date"] = pd.to_datetime(np_stat["ym"])
    # groupby orders the unpadded "ym" strings lexicographically
    # ("2019-1", "2019-10", ..., "2019-2"). Sort chronologically so the
    # positional baseline slice in get_epu_zscore covers every month up to
    # the cutoff.
    np_stat = np_stat.sort_values("date").reset_index(drop=True)
    # Number of rows up to and including the baseline end; this also works
    # when an outlet has no row for that exact month.
    cutoff_index = int((np_stat["date"] <= baseline_end).sum())
    np_stat["z_score"] = get_epu_zscore(np_stat, cutoff_index)
    np_stat.columns = [
        str(np_name) + "_" + str(col) if col != "date" and col != "ym" else col
        for col in np_stat.columns
    ]
    stats = (stats.merge(np_stat, how="left",
                         on="date")).drop("ym", axis=1).fillna(0)

In [19]:
# Write the current `ss` frame to ss_sample.csv in the working directory.
ss.to_csv("ss_sample.csv", encoding="utf-8")

In [None]:
# Per-outlet z-score columns, named explicitly. Matching on the substring
# "z_score" would also pick up derived columns such as "z_score" or
# "adj_z_score" once they exist, changing the result when the cell is re-run.
select_cols = [f"{outlet}_z_score" for outlet in ("st", "ss", "tis", "sibc")]
# Unweighted mean of the outlets' z-scores for each month.
stats["z_score"] = stats[select_cols].mean(axis=1)
# Rescale so the index averages 100 over the months from 2020-01 onwards.
scaling_factor = 100 / stats.loc[stats["date"] >= "2020-01-01", "z_score"].mean()
stats["epu_index"] = stats["z_score"] * scaling_factor

## EPU -> Macroeconomic

In [17]:
# CPI readings, 75 values; paired in order with the month-start dates
# 2017-01 .. 2023-03 when the inflation frame is built.
# NOTE(review): the data source is not recorded in this notebook.
si_cpi = [
    98.6, 99.0, 100.6, 100.7, 101.0, 100.6, 100.5, 100.1, 99.5, 99.6, 99.5,
    100.3, 102.0, 104.2, 103.9, 103.9, 103.0, 102.8, 103.0, 103.2, 104.0,
    103.8, 104.1, 104.2, 104.5, 103.8, 105.1, 104.2, 104.2, 104.3, 105.0,
    108.0, 105.5, 106.6, 106.9, 106.9, 112.4, 112.7, 112.7, 112.2, 111.1,
    107.2, 105.8, 105.1, 105.4, 105.2, 105.3, 104.1, 106.7, 110.6, 112.4,
    109.8, 109.2, 106.6, 106.1, 106.6, 107.3, 107.5, 107.4, 107.8, 108.8,
    109.4, 109.4, 111.5, 113.9, 114.1, 115.4, 117.1, 118.0, 117.8, 117.2,
    117.0, 118.9, 119.2, 119.8
]

# Month-start calendar 2017-01 .. 2023-03 with the CPI readings attached.
si_infl_df = pd.DataFrame(
    {"date": pd.date_range("2017-01", "2023-03", freq="MS")})
si_infl_df["cpi"] = si_cpi
# Left join keeps every CPI month; it requires stats["epu_index"] to exist.
si_infl = si_infl_df.merge(stats[["date", "epu_index"]], how="left", on="date")
# Month-on-month percentage changes of CPI and of the EPU index.
si_infl["inflation"] = si_infl["cpi"].diff().div(si_infl["cpi"].shift(1)).mul(100)
si_infl["epu_change"] = (
    si_infl["epu_index"].diff().div(si_infl["epu_index"].shift(1)).mul(100))

KeyError: "['epu_index'] not in index"

In [None]:
# Display the merged CPI / EPU frame.
si_infl

In [None]:
from scipy import signal
def ccf_values(series1, series2):
    """Full cross-correlation of two standardised series.

    ``series1`` is centred and divided by ``std * len``; ``series2`` is
    centred and divided by its ``std``. The two are then passed to
    ``np.correlate`` in ``'full'`` mode and the resulting array is returned.
    """
    first = (series1 - series1.mean()) / (series1.std() * len(series1))
    second = (series2 - series2.mean()) / series2.std()
    return np.correlate(first, second, mode='full')


# Skip the first row: "inflation" is a first difference and is NaN there.
epu_series = si_infl["epu_index"][1:]
infl_series = si_infl["inflation"][1:]
corr = ccf_values(epu_series, infl_series)
lags = signal.correlation_lags(len(infl_series), len(epu_series))
plt.plot(lags, corr);

In [None]:
# Tabulate lag / correlation pairs and show lags -3 .. 6 (both inclusive).
ccf_df = pd.DataFrame([lags, corr], index=["lags", "ccf"]).T
ccf_df[ccf_df["lags"].between(-3, 6)]

In [None]:
# Two-variable frame for the VAR; the first row is dropped and the index reset.
var_df = (si_infl.loc[:, ["inflation", "epu_index"]]
          .iloc[1:]
          .reset_index(drop=True))
# adf_test comes from scripts.python.tsa.ts_utils; run it on both series.
epu_adf = adf_test(var_df["epu_index"])
infl_adf = adf_test(var_df["inflation"])
print(epu_adf, "\n", infl_adf)

In [None]:
def cointegration_test(df, alpha=0.05, det_order=-1, k_ar_diff=5):
    """Print a Johansen trace-test summary for the columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Series to test, one per column.
    alpha : float
        Significance level; after rounding ``1 - alpha`` to two decimals only
        0.10, 0.05 and 0.01 are supported.
    det_order, k_ar_diff : int
        Passed through to ``coint_johansen``.

    Raises
    ------
    KeyError
        If ``alpha`` does not correspond to a supported confidence level.
    """
    out = coint_johansen(df, det_order, k_ar_diff)
    d = {'0.90': 0, '0.95': 1, '0.99': 2}
    traces = out.lr1
    # Format to two decimals: str(1 - 0.1) is '0.9', which would miss the
    # '0.90' key.
    cvts = out.cvt[:, d[f"{1 - alpha:.2f}"]]
    # Confidence level shown in the header, e.g. "95" for alpha=0.05.
    confidence = f"{(1 - alpha) * 100:.0f}"

    def adjust(val, length=6):
        return str(val).ljust(length)

    print(f'Name   ::  Test Stat > C({confidence}%)    =>   Signif  \n',
          '--' * 20)
    for col, trace, cvt in zip(df.columns, traces, cvts):
        print(adjust(col), ':: ', adjust(round(trace, 2), 9), ">",
              adjust(cvt, 8), ' =>  ', trace > cvt)


# Run the test on inflation and the EPU index, skipping the first row.
cointegration_test(si_infl[["inflation", "epu_index"]].iloc[1:])

In [None]:
# Chronological split sizes: 90% of the usable rows for training, rest held out.
length = si_infl.iloc[1:].shape[0]
train = int(length * 0.9)
test = length - train

# The VAR is fitted on the first train+1 rows of var_df.
model = VAR(endog=var_df[:train+1])

# Fit lag orders 0..12 and record the information criteria of each fit.
eval_lst = []
for i in range(13):
    result = model.fit(i)
    eval_lst.append(
        dict(lag=i,
             AIC=result.aic,
             BIC=result.bic,
             FPE=result.fpe,
             HQIC=result.hqic))

eval_df = pd.DataFrame(eval_lst)
eval_df

In [None]:
# Refit the VAR with a lag order of 1 and show its summary.
best_mod = model.fit(1)
best_mod.summary()

In [None]:
# Add the EPU change lagged by one and by two rows as regressors.
for lag in (1, 2):
    si_infl[f"epu_change_{lag}"] = si_infl["epu_change"].shift(lag)
si_infl

In [None]:
import statsmodels.formula.api as smf
# OLS of inflation on the one-row-lagged EPU change, fitted on the first
# train+1 rows of si_infl.
ols_mod = smf.ols("inflation~epu_change_1", data=si_infl[:train+1])
ols_res = ols_mod.fit()
ols_res.summary()

In [None]:
# In-sample fitted values from the OLS model.
ols_predict = ols_res.fittedvalues
# Predictions for the last `test` rows of the lagged EPU change.
# NOTE(review): a bare Series is passed to a formula-based predict — confirm
# that the formula can resolve "epu_change_1" from it.
ols_forecast = ols_res.predict(si_infl["epu_change_1"][-test:])
# Stack fitted and predicted values into one series.
ols_all = pd.concat([ols_predict, ols_forecast], axis=0)

In [None]:
from statsmodels.stats.stattools import durbin_watson
# Durbin-Watson statistic computed on the VAR residuals.
out = durbin_watson(best_mod.resid)
out

In [None]:
# Forecast the hold-out rows from the end of the training sample.
# The model was fitted on var_df[:train+1], so the hold-out is var_df[train+1:].
# forecast() starts from the last k_ar rows of `y`; seeding it with the final
# rows of the full sample would project beyond the data instead of over the
# hold-out period.
endog_cols = ["inflation", "epu_index"]  # skip any non-model column in var_df
train_end = train + 1
history = var_df[endog_cols].to_numpy()[:train_end]
holdout_index = var_df.index[train_end:]
var_forecast = best_mod.forecast(y=history[-best_mod.k_ar:],
                                 steps=len(holdout_index))
var_forecast_df = pd.DataFrame(var_forecast,
                               index=holdout_index,
                               columns=["var_inflation", "var_epu"])

In [None]:
# Display the VAR's in-sample fitted values.
best_mod.fittedvalues

In [None]:
# var_df is si_infl without its first row, so its dates must start at the
# second si_infl date rather than at 2017-01. to_numpy() avoids index
# alignment between the two frames.
var_df["date"] = si_infl["date"].iloc[1:].to_numpy()
fit_df = best_mod.fittedvalues.rename(columns={"epu_index": "var_epu_index",
                                               "inflation": "var_inflation"})
# Place the fitted values next to the observed rows var_df[1:train+1], then
# reset the index.
fit_df = pd.concat([fit_df, var_df[1:train+1]], axis=1).reset_index(drop=True)

In [None]:
# Display fitted and observed values side by side.
fit_df

In [None]:
# Observed inflation (green) against the VAR's fitted inflation (blue).
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(var_df["date"].values, var_df["inflation"].values, color="green")
fit_df.plot(x="date", y="var_inflation", ax=ax, color="blue")

# Disabled overlays: VAR hold-out forecast and the combined OLS series.
# ax.plot(var_forecast_df["var_inflation"].index.values,
#         var_forecast_df["var_inflation"].values,
#         label="var_predict_test", color="blue")
# ax.plot(ols_all, color="orange")
plt.show();

## EPU

In [None]:
# EPU index over time, with a dashed red vertical line at 2020-01-01.
plt.plot(stats["date"].values, stats["epu_index"].values, label="EPU Index")
plt.xlabel("Date")
# The line spans y = 0..800 (hard-coded).
plt.vlines(x=pd.to_datetime("2020-01-01"), ymin=0, ymax=800, color="red", linestyle="dashed")
# plt.hlines(y=100, xmin=pd.to_datetime("2007-01"), xmax=pd.to_datetime("2024-01"), color="green", linestyle="dashed")
plt.show()

In [None]:
# URLs of the `ss` articles dated within December 2015 that are flagged as EPU.
in_dec_2015 = (ss.date >= "2015-12-01") & (ss.date <= "2015-12-31")
ss.loc[in_dec_2015 & (ss.epu == True), "url"].tolist()

In [None]:
# Each outlet's share of the month's total article count.
np_names = ["st", "ss", "tis", "sibc"]
news_col = [f"{outlet}_news_count" for outlet in np_names]
stats["news_count"] = stats[news_col].sum(axis=1)
for name, nc in zip(np_names, news_col):
    stats[f"{name}_ratio"] = stats[nc].div(stats["news_count"])

In [None]:
# Months whose adjusted EPU index is at or above its 95th percentile.
# NOTE(review): "adj_epu_index" is only created in a later cell, so this
# cell fails when the notebook is run top to bottom on a fresh kernel.
stats[stats.adj_epu_index >= np.percentile(stats.adj_epu_index, 95)]

In [None]:
# Article-share-weighted sum of the outlets' z-scores for each month,
# accumulated outlet by outlet in np_names order.
weighted_sum = pd.Series(0, index=stats.index)
for news in np_names:
    weighted_sum = weighted_sum + stats[news + "_z_score"] * stats[news + "_ratio"]
adj_ratio_lst = weighted_sum.tolist()

stats["adj_z_score"] = adj_ratio_lst

In [None]:
# Rescale the weighted z-score so it averages 100 from 2020-01 onwards.
from_2020 = stats["date"] >= "2020-01-01"
scaling_factor = 100 / stats.loc[from_2020, "adj_z_score"].mean()
stats["adj_epu_index"] = stats["adj_z_score"].mul(scaling_factor)
stats.plot(x="date", y="adj_epu_index");

In [None]:
# Alternative index: average the outlets' counts first, then standardise the
# pooled EPU share against the rows up to and including 2019-12.
stats_alt = stats.copy()
stats_alt["news_count"] = stats_alt[
    [f"{outlet}_news_count" for outlet in ("st", "ss", "tis", "sibc")]
].mean(axis=1)
stats_alt["epu_count"] = stats_alt[
    [f"{outlet}_epu_count" for outlet in ("st", "ss", "tis", "sibc")]
].mean(axis=1)
baseline_rows = stats_alt.index[stats_alt["date"] == pd.to_datetime("2019-12-01")]
cutoff_index = baseline_rows[0] + 1
stats_alt["z_score_alt"] = get_epu_zscore(stats_alt, cutoff_index=cutoff_index)
# Rescale so the rows after the cutoff average 100.
sf_alt = 100 / stats_alt["z_score_alt"].iloc[cutoff_index:].mean()
stats_alt["epu_alt"] = sf_alt * stats_alt["z_score_alt"]
stats_alt.plot(x="date", y="epu_alt");