In [1]:
import os 
import pandas as pd
import numpy as np

from src.utils.dict_loader import TopicDictionary
from src.utils.output_loader import load_all_topvecs, load_all_bstr_arrs
from src.utils.downstream_aggregate import normalize

import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("retina")

import yaml
# Read the project settings from the YAML file (path is relative to the notebook's working directory).
with open("../../src/configs.yml", "r") as config_file:
    configs = yaml.safe_load(config_file)

# Root folder; later cells prepend it to output paths.
ROOTPATH = configs["ROOTPATH"]

# Configured period boundaries for both election years, parsed with pandas.
START2016, END2016, START2020, END2020 = [
    pd.to_datetime(configs[key])
    for key in ("START2016", "END2016", "START2020", "END2020")
]

# Location handed to the topic-vector loaders further down.
INPUT_FPATH = configs["DATE_TOPVEC_PATH"]
print(f"INPUT_FPATH=={INPUT_FPATH}")

# OUTPUT_FPATH = ROOTPATH + "output/figs/"
# print(f"OUTPUT_FPATH=={OUTPUT_FPATH}")

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yijingch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


INPUT_FPATH==/Users/yijingch/Documents/GITHUB/intermedia-agenda-setting/output/date-topvec-min2-gtm1/


In [2]:
from src.utils.dict_configuration import dictionary2016, dictionary2020

# Topics excluded by default from the Granger-causality tests.
TOPICS2DROP = [
    "election_campaign",
    "general_controversies",
    "no_topic",
    "forestry",
    "land_water_management",
    "agriculture",
    "housing",
    "transportation",
    "culture",
]

# Index of every excluded topic in each year's dictionary.
TOPICS2DROP_IDX2016, TOPICS2DROP_IDX2020 = [
    [topic_dict.topic2index[name] for name in TOPICS2DROP]
    for topic_dict in (dictionary2016, dictionary2020)
]

Successfully loaded dictionary!
	# of unique topics: 27
	# of unique words: 1426
Successfully loaded dictionary!
	# of unique topics: 27
	# of unique words: 1453


In [3]:
# Topic-vector frames for both election years, loaded with identical options.
topvec_dfs2016, topvec_dfs2020 = [
    load_all_topvecs(year=election_year, topvec_fpath=INPUT_FPATH, normalize_by_unit=True)
    for election_year in (2016, 2020)
]

# Matching "bstr" arrays (presumably bootstrap replicates -- confirm in output_loader).
bstr_arrs2016, bstr_arrs2020 = [
    load_all_bstr_arrs(year=election_year, vec_fpath=INPUT_FPATH, vec_type="topvecs", normalize_by_unit=True)
    for election_year in (2016, 2020)
]

In [4]:
import statsmodels.api as sm
from statsmodels.tsa.stattools import grangercausalitytests
from collections import Counter

# https://rishi-a.github.io/2020/05/25/granger-causality.html

def granger_causality_all_topics(arr1, arr2, dictionary, maxlag=5, topics2skip=TOPICS2DROP, verbose=False, p_thres=.01, postfix=""):
    """Run a Granger-causality test per topic on two first-differenced series.

    For every topic in ``dictionary.topics`` that is not in ``topics2skip``,
    the topic's column of ``arr2`` (``ts_X``) is tested as a Granger cause of
    the same column of ``arr1`` (``ts_Y``).

    Parameters
    ----------
    arr1, arr2 : array-like, indexed as ``[time, topic_index]``
        Both are first-differenced along axis 0 before testing.
    dictionary : object exposing ``topics`` and ``topic2index``
    maxlag : int
        Lags 1..maxlag are tested.
    topics2skip : iterable of str or None
        Topics to leave out; ``None`` means no topic is skipped.
    verbose : bool
        Print the p-values per topic, and the reason whenever a test fails.
    p_thres : float
        Significance threshold used for the ``opt_lag_sig`` column.
    postfix : str
        Suffix appended to every result column name.

    Returns
    -------
    pandas.DataFrame
        One row per tested topic, in ``dictionary.topics`` order, with columns
        ``topic``, ``p_value{postfix}`` (list of ``ssr_chi2test`` p-values
        rounded to 4 decimals, all NaN when the test was not run or failed),
        ``opt_lag{postfix}`` (lags sharing the minimum p-value) and
        ``opt_lag_sig{p_thres}{postfix}`` (those lags whose p-value is below
        ``p_thres``).
    """
    # First-difference along the time axis before testing.
    arr1 = np.diff(arr1, axis=0)
    arr2 = np.diff(arr2, axis=0)

    # Honour the caller's skip list and keep the dictionary's own topic order,
    # so the row order is reproducible across sessions.
    skipped = set(topics2skip) if topics2skip is not None else set()
    topics2test = [topic for topic in dict.fromkeys(dictionary.topics) if topic not in skipped]

    p_col = f"p_value{postfix}"
    opt_col = f"opt_lag{postfix}"
    sig_col = f"opt_lag_sig{p_thres}{postfix}"

    p_values = []
    for topic in topics2test:
        topic_idx = dictionary.topic2index[topic]
        test_df = pd.DataFrame()
        test_df["ts_Y"] = arr1[:, topic_idx]
        test_df["ts_X"] = arr2[:, topic_idx]

        # NaN placeholder, kept when a series is constant or the test fails.
        p = [np.nan] * maxlag
        # whether the time series in the second column Granger causes the time series in the first column
        if test_df["ts_X"].nunique() > 1 and test_df["ts_Y"].nunique() > 1:
            try:
                test_result = grangercausalitytests(test_df, maxlag=maxlag, verbose=False)
                p = [round(test_result[lag][0]["ssr_chi2test"][1], 4) for lag in range(1, maxlag + 1)]
                if verbose:
                    print(f"topic = {topic}, p = {p}")
            except Exception as e:
                # Best effort: a failing topic keeps its NaN p-values, but the
                # reason is reported when the caller asked for verbosity.
                if verbose:
                    print(f"topic = {topic}, Granger test failed: {e!r}")
        p_values.append(p)

    out_df = pd.DataFrame()
    out_df["topic"] = topics2test
    out_df[p_col] = p_values

    def _min_p_lags(ps):
        # 1-based lags whose p-value equals the minimum; empty when all are NaN.
        ps = np.array(ps)
        return np.where(ps == ps.min())[0] + 1

    out_df[opt_col] = out_df[p_col].map(_min_p_lags)
    out_df[sig_col] = pd.Series(
        [
            [lag for lag in lags if ps[lag - 1] < p_thres]
            for ps, lags in zip(out_df[p_col], out_df[opt_col])
        ],
        index=out_df.index,
        dtype=object,
    )
    return out_df

def granger_causality_for_df(df1, df2, dictionary, maxlag=5, topics2skip=TOPICS2DROP, verbose=False, p_thres=.01):
    """Run the per-topic Granger tests on the ``majority_topvec`` columns of two frames.

    The ``majority_topvec`` column of each frame is stacked into an array and
    both arrays are forwarded, in the same order, to
    ``granger_causality_all_topics`` (``df1`` as ``arr1``, ``df2`` as ``arr2``).
    All remaining arguments are passed through unchanged and the resulting
    frame is returned as is.
    """
    series_first, series_second = (
        np.array(frame["majority_topvec"].tolist()) for frame in (df1, df2)
    )
    return granger_causality_all_topics(
        series_first,
        series_second,
        dictionary=dictionary,
        maxlag=maxlag,
        topics2skip=topics2skip,
        verbose=verbose,
        p_thres=p_thres,
    )

def granger_causality_for_bstr_arrs(bstr_arrs1, bstr_arrs2, dictionary, maxlag=5, topics2skip=TOPICS2DROP, verbose=False, p_thres=.01):
    """Run the per-topic Granger tests once per run of two stacked arrays.

    Parameters
    ----------
    bstr_arrs1, bstr_arrs2 : arrays whose first axis indexes runs
        ``bstr_arrs1[r]`` and ``bstr_arrs2[r]`` are passed to
        ``granger_causality_all_topics`` for every ``r`` in
        ``range(bstr_arrs1.shape[0])``.
    dictionary, maxlag, topics2skip, verbose, p_thres
        Forwarded unchanged to ``granger_causality_all_topics``.

    Returns
    -------
    pandas.DataFrame
        One row per tested topic. Holds the per-run columns (suffixed with the
        run number), then ``topic``, ``ls_opt_lag_sig{p_thres}`` (significant
        optimal lags pooled over all runs), ``count_opt_lag_sig{p_thres}``
        (a ``Counter`` of that list) and one ``lag{k}_count{p_thres}`` column
        per lag ``k`` in 1..maxlag.

    Raises
    ------
    ValueError
        If ``bstr_arrs1`` contains no runs.
    """
    n_runs = bstr_arrs1.shape[0]
    if n_runs == 0:
        raise ValueError("bstr_arrs1 contains no runs to test")

    # Collect the per-run frames and concatenate once instead of growing a
    # frame inside the loop.
    run_frames = []
    topics = None
    for r in range(n_runs):
        out_df = granger_causality_all_topics(
            bstr_arrs1[r], bstr_arrs2[r],
            dictionary=dictionary, maxlag=maxlag,
            topics2skip=topics2skip, verbose=verbose, p_thres=p_thres, postfix=str(r))
        if topics is None:
            topics = out_df["topic"].tolist()
        run_frames.append(out_df.drop(columns="topic"))

    big_out_df = pd.concat(run_frames, axis=1)
    big_out_df["topic"] = topics

    # Pool the significant lags over every run that was actually computed
    # (the run count comes from the input, not from a fixed constant).
    sig_cols = [f"opt_lag_sig{p_thres}{r}" for r in range(n_runs)]
    pooled_lags = [
        [lag for lags in row for lag in lags]
        for row in big_out_df[sig_cols].itertuples(index=False, name=None)
    ]
    big_out_df[f"ls_opt_lag_sig{p_thres}"] = pd.Series(pooled_lags, index=big_out_df.index, dtype=object)
    big_out_df[f"count_opt_lag_sig{p_thres}"] = big_out_df[f"ls_opt_lag_sig{p_thres}"].map(Counter)

    for lag in range(1, maxlag + 1):
        # A Counter returns 0 for lags that never occurred.
        big_out_df[f"lag{lag}_count{p_thres}"] = big_out_df[f"count_opt_lag_sig{p_thres}"].map(
            lambda counts, k=lag: counts[k])
    return big_out_df

## Test Granger causality for sliding windows

In [5]:
THRES = 0.05

def generate_output_df(dictionary, labela, labelb, a2b_cand, b2a_cand, thres=THRES):
    """Classify every topic into one of four mutually exclusive lead modes.

    Parameters
    ----------
    dictionary : object exposing ``topics``
    labela, labelb : str
        Labels used to build the output column names.
    a2b_cand, b2a_cand : pandas.DataFrame
        Frames with a ``topic`` column and an ``opt_lag_sig{thres}`` column of
        lag lists. A non-empty list in ``a2b_cand`` sets ``led_by_{labela}``;
        a non-empty list in ``b2a_cand`` sets ``led_by_{labelb}``.
    thres : float
        Threshold that is part of the ``opt_lag_sig{thres}`` column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``led_by_{labela}_only``, ``led_by_{labelb}_only``,
        ``mutual`` and ``no_relation`` holding 0/1 flags. Only topics present
        in the dictionary and in both candidate frames are kept (inner merge),
        in ``dictionary.topics`` order.
    """
    sig_col = f"opt_lag_sig{thres}"
    led_a = f"led_by_{labela}"
    led_b = f"led_by_{labelb}"

    output_df = pd.DataFrame({"topic": list(dictionary.topics)})
    for led_col, cand in ((led_a, a2b_cand), (led_b, b2a_cand)):
        output_df = output_df.merge(cand[["topic", sig_col]], on="topic")
        # 1 when at least one significant lag was found, else 0.
        output_df[led_col] = output_df[sig_col].map(len).gt(0).astype("int64")
        output_df = output_df.drop(columns=sig_col)

    output_df["mutual"] = output_df[led_a] * output_df[led_b]
    output_df["no_relation"] = (1 - output_df[led_a]) * (1 - output_df[led_b])
    output_df[f"{led_a}_only"] = output_df[led_a] - output_df["mutual"]
    output_df[f"{led_b}_only"] = output_df[led_b] - output_df["mutual"]

    return output_df[["topic", f"{led_a}_only", f"{led_b}_only", "mutual", "no_relation"]]

def _count_bstr_runs(bstr_a2b_cand, bstr_b2a_cand, thres):
    """Count the consecutive run columns ``opt_lag_sig{thres}{r}`` (r = 0, 1, ...) present in both frames."""
    n = 0
    while (f"opt_lag_sig{thres}{n}" in bstr_a2b_cand.columns
           and f"opt_lag_sig{thres}{n}" in bstr_b2a_cand.columns):
        n += 1
    return n

def generate_output_df_bstr(dictionary, labela, labelb, bstr_a2b_cand, bstr_b2a_cand, thres=THRES, nruns=None):
    """Count, per topic, how many runs fall into each lead mode.

    Parameters
    ----------
    dictionary, labela, labelb, thres
        Same meaning as in ``generate_output_df``.
    bstr_a2b_cand, bstr_b2a_cand : pandas.DataFrame
        Frames with a ``topic`` column and one ``opt_lag_sig{thres}{r}`` column
        per run ``r``.
    nruns : int or None
        Number of runs to aggregate. ``None`` (default) uses every consecutive
        run column, starting at 0, that exists in both frames.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``led_by_{labela}_only``, ``led_by_{labelb}_only``,
        ``mutual`` and ``no_relation``; each mode column is the integer number
        of runs classified into that mode, so every row sums to ``nruns``.

    Raises
    ------
    KeyError
        If no run column is found, or if an explicitly requested run is missing.
    """
    if nruns is None:
        nruns = _count_bstr_runs(bstr_a2b_cand, bstr_b2a_cand, thres)
    if nruns < 1:
        raise KeyError(f"no 'opt_lag_sig{thres}<run>' columns found in both candidate frames")

    sig_col = f"opt_lag_sig{thres}"
    cols = ["topic", f"led_by_{labela}_only", f"led_by_{labelb}_only", "mutual", "no_relation"]
    mode_cols = cols[1:]

    topics = None
    totals = None
    for r in range(nruns):
        run_col = f"{sig_col}{r}"
        a2b_cand = bstr_a2b_cand[["topic", run_col]].rename(columns={run_col: sig_col})
        b2a_cand = bstr_b2a_cand[["topic", run_col]].rename(columns={run_col: sig_col})
        output_df_r = generate_output_df(dictionary, labela, labelb, a2b_cand, b2a_cand, thres=thres)
        run_flags = output_df_r[mode_cols].to_numpy()
        if totals is None:
            # Every run is built from the same topic columns, so the rows of
            # all runs line up with those of the first one.
            topics = output_df_r["topic"].tolist()
            totals = run_flags.copy()
        else:
            totals = totals + run_flags

    output_df_all = pd.DataFrame(totals, columns=mode_cols)
    output_df_all.insert(0, "topic", topics)
    return output_df_all[cols]

In [6]:
import warnings
# Silences every warning raised from here on, whatever its category or origin.
# NOTE(review): presumably meant to hide library warnings emitted during the long loop below -- confirm nothing important is masked.
warnings.filterwarnings("ignore")

In [7]:
# Select the candidate labels, dictionary and inputs for the year under analysis.
year = 2020
if year == 2016:
    cand1 = "trump"
    cand2 = "clinton"
    dictionary = dictionary2016
    topvec_dfs = topvec_dfs2016
    bstr_arrs = bstr_arrs2016
elif year == 2020:
    cand1 = "biden"
    cand2 = "trump"
    dictionary = dictionary2020
    topvec_dfs = topvec_dfs2020
    bstr_arrs = bstr_arrs2020
else:
    # Fail here rather than with a NameError further down.
    raise ValueError(f"unsupported year: {year}")

THRES = 0.05
WINDOW_LEN = 90
SLIDING_WINDOW_OUTPUT = []

labela = "trad"
labelb = "lowc"

# NOTE(review): the first index (0/1) is used for cand1/cand2 and the second
# index (1/2) for the two outlet groups being compared -- confirm that this
# matches the layout returned by the loaders.
headline_dfs = topvec_dfs["headline"]
headline_bstr = bstr_arrs["headline"]

# NOTE(review): granger_causality_for_df(df1, df2) tests whether df2's series
# Granger-cause df1's (second column causes first column) -- confirm that the
# argument order below matches the *2* variable names.
for i in range(len(headline_dfs[0][1])-WINDOW_LEN):

    print("window:", i)
    # Positional, half-open window of exactly WINDOW_LEN rows, applied the same
    # way to the data frames and to the bootstrap arrays.
    win = slice(i, i+WINDOW_LEN)

    trad2lowc_cand1 = granger_causality_for_df(headline_dfs[0][1].iloc[win], headline_dfs[0][2].iloc[win], dictionary, p_thres=THRES)
    lowc2trad_cand1 = granger_causality_for_df(headline_dfs[0][2].iloc[win], headline_dfs[0][1].iloc[win], dictionary, p_thres=THRES)

    trad2lowc_cand2 = granger_causality_for_df(headline_dfs[1][1].iloc[win], headline_dfs[1][2].iloc[win], dictionary, p_thres=THRES)
    lowc2trad_cand2 = granger_causality_for_df(headline_dfs[1][2].iloc[win], headline_dfs[1][1].iloc[win], dictionary, p_thres=THRES)

    bstr_trad2lowc_cand1 = granger_causality_for_bstr_arrs(headline_bstr[0][1][:, win, :], headline_bstr[0][2][:, win, :], dictionary, p_thres=THRES)
    bstr_lowc2trad_cand1 = granger_causality_for_bstr_arrs(headline_bstr[0][2][:, win, :], headline_bstr[0][1][:, win, :], dictionary, p_thres=THRES)

    bstr_trad2lowc_cand2 = granger_causality_for_bstr_arrs(headline_bstr[1][1][:, win, :], headline_bstr[1][2][:, win, :], dictionary, p_thres=THRES)
    bstr_lowc2trad_cand2 = granger_causality_for_bstr_arrs(headline_bstr[1][2][:, win, :], headline_bstr[1][1][:, win, :], dictionary, p_thres=THRES)

    # Point-estimate classifications; they are not stored in
    # SLIDING_WINDOW_OUTPUT, only the bootstrap aggregates below are.
    cred1_output = generate_output_df(dictionary, labela=labela, labelb=labelb,
        a2b_cand=trad2lowc_cand1, b2a_cand=lowc2trad_cand1, thres=THRES)

    cred2_output = generate_output_df(dictionary, labela=labela, labelb=labelb,
        a2b_cand=trad2lowc_cand2, b2a_cand=lowc2trad_cand2, thres=THRES)

    bstr_cred1_output = generate_output_df_bstr(
        dictionary, labela=labela, labelb=labelb,
        bstr_a2b_cand=bstr_trad2lowc_cand1, bstr_b2a_cand=bstr_lowc2trad_cand1, thres=THRES)

    bstr_cred2_output = generate_output_df_bstr(
        dictionary, labela=labela, labelb=labelb,
        bstr_a2b_cand=bstr_trad2lowc_cand2, bstr_b2a_cand=bstr_lowc2trad_cand2, thres=THRES)

    # Tag the mode columns with the candidate they belong to.
    bstr_cred1_output = bstr_cred1_output.rename(columns={
        f"led_by_{labela}_only": f"led_by_{labela}_only_{cand1}",
        f"led_by_{labelb}_only": f"led_by_{labelb}_only_{cand1}",
        "mutual": f"mutual_{cand1}",
        "no_relation": f"no_relation_{cand1}"})
    bstr_cred2_output = bstr_cred2_output.rename(columns={
        f"led_by_{labela}_only": f"led_by_{labela}_only_{cand2}",
        f"led_by_{labelb}_only": f"led_by_{labelb}_only_{cand2}",
        "mutual": f"mutual_{cand2}",
        "no_relation": f"no_relation_{cand2}"})

    SLIDING_WINDOW_OUTPUT.append([bstr_cred1_output, bstr_cred2_output])

window: 0
window: 1
window: 2
window: 3
window: 4
window: 5
window: 6
window: 7
window: 8
window: 9
window: 10
window: 11
window: 12
window: 13
window: 14
window: 15
window: 16
window: 17
window: 18
window: 19
window: 20
window: 21
window: 22
window: 23
window: 24
window: 25
window: 26
window: 27
window: 28
window: 29
window: 30
window: 31
window: 32
window: 33
window: 34
window: 35
window: 36
window: 37
window: 38
window: 39
window: 40
window: 41
window: 42
window: 43
window: 44
window: 45
window: 46
window: 47
window: 48
window: 49
window: 50
window: 51
window: 52
window: 53
window: 54
window: 55
window: 56
window: 57
window: 58
window: 59
window: 60
window: 61


In [1]:
# labela = "left"
# labelb = "right"

# for i in range(len(topvec_dfs["headline"][0][1])-WINDOW_LEN):

#     print("window:", i)
#     left2right_cand1 = granger_causality_for_df(topvec_dfs["headline"][0][5].loc[i:i+WINDOW_LEN], topvec_dfs["headline"][0][3].loc[i:i+WINDOW_LEN], dictionary, p_thres=THRES)
#     right2left_cand1 = granger_causality_for_df(topvec_dfs["headline"][0][3].loc[i:i+WINDOW_LEN], topvec_dfs["headline"][0][5].loc[i:i+WINDOW_LEN], dictionary, p_thres=THRES)

#     left2right_cand2 = granger_causality_for_df(topvec_dfs["headline"][1][5].loc[i:i+WINDOW_LEN], topvec_dfs["headline"][1][3].loc[i:i+WINDOW_LEN], dictionary, p_thres=THRES)
#     right2left_cand2 = granger_causality_for_df(topvec_dfs["headline"][1][3].loc[i:i+WINDOW_LEN], topvec_dfs["headline"][1][5].loc[i:i+WINDOW_LEN], dictionary, p_thres=THRES)

#     bstr_left2right_cand1 = granger_causality_for_bstr_arrs(bstr_arrs["headline"][0][5][:,i:i+WINDOW_LEN,:], bstr_arrs["headline"][0][3][:,i:i+WINDOW_LEN,], dictionary, p_thres=THRES)
#     bstr_right2left_cand1 = granger_causality_for_bstr_arrs(bstr_arrs["headline"][0][3][:,i:i+WINDOW_LEN,:], bstr_arrs["headline"][0][5][:,i:i+WINDOW_LEN,], dictionary, p_thres=THRES)

#     bstr_left2right_cand2 = granger_causality_for_bstr_arrs(bstr_arrs["headline"][1][5][:,i:i+WINDOW_LEN,:], bstr_arrs["headline"][1][3][:,i:i+WINDOW_LEN,], dictionary, p_thres=THRES)
#     bstr_right2left_cand2 = granger_causality_for_bstr_arrs(bstr_arrs["headline"][1][3][:,i:i+WINDOW_LEN,:], bstr_arrs["headline"][1][5][:,i:i+WINDOW_LEN,], dictionary, p_thres=THRES)

#     ideo1_output = generate_output_df(dictionary, labela=labela, labelb=labelb,
#         a2b_cand=left2right_cand1, b2a_cand=right2left_cand1, thres=THRES)

#     ideo2_output = generate_output_df(dictionary, labela=labela, labelb=labelb,
#         a2b_cand=left2right_cand2, b2a_cand=right2left_cand2, thres=THRES)

#     bstr_ideo1_output = generate_output_df_bstr(
#         dictionary, labela=labela, labelb=labelb,
#         bstr_a2b_cand=bstr_left2right_cand1, bstr_b2a_cand=bstr_right2left_cand1, thres=THRES)

#     bstr_ideo2_output = generate_output_df_bstr(
#         dictionary, labela=labela, labelb=labelb,
#         bstr_a2b_cand=bstr_left2right_cand2, bstr_b2a_cand=bstr_right2left_cand2, thres=THRES)

#     bstr_ideo1_output.rename(columns={
#         f"led_by_{labela}_only": f"led_by_{labela}_only_{cand1}",
#         f"led_by_{labelb}_only": f"led_by_{labelb}_only_{cand1}",
#         "mutual": f"mutual_{cand1}",
#         "no_relation": f"no_relation_{cand1}"}, inplace=True)
#     bstr_ideo2_output.rename(columns={
#         f"led_by_{labela}_only": f"led_by_{labela}_only_{cand2}",
#         f"led_by_{labelb}_only": f"led_by_{labelb}_only_{cand2}",
#         "mutual": f"mutual_{cand2}",
#         "no_relation": f"no_relation_{cand2}"}, inplace=True)

#     SLIDING_WINDOW_OUTPUT.append([bstr_ideo1_output, bstr_ideo2_output])

In [9]:
ROBUST_THRES = 0.95*200
def get_significant_result(bstr_group_output, thres=ROBUST_THRES):
    """Pick, for every topic, the column holding the row maximum if it exceeds ``thres``.

    Every column except ``topic`` is a candidate. A topic maps to the name of
    the column whose value equals the row maximum and is strictly greater than
    ``thres``; if several columns qualify, the one listed last wins. Topics
    without such a column map to ``"NO_SIG_RESULT"``.

    Returns a dict keyed by topic.
    """
    candidate_cols = list(bstr_group_output.columns)
    del candidate_cols[candidate_cols.index("topic")]

    sig_result = {}
    for _, record in bstr_group_output.iterrows():
        topic = record["topic"]
        values = [record[col] for col in candidate_cols]
        peak = np.max(values)
        winner = "NO_SIG_RESULT"
        for col, value in zip(candidate_cols, values):
            if value == peak and value > thres:
                winner = col
        sig_result[topic] = winner
    return sig_result

In [10]:
MODES = [f"led_by_{labela}_only", f"led_by_{labelb}_only", "mutual", "no_relation"]

# One row per computed window, dated with the first len(SLIDING_WINDOW_OUTPUT) dates of the series.
tmp_plot_ias = pd.DataFrame()
tmp_plot_ias["date"] = topvec_dfs["headline"][0][1][:len(SLIDING_WINDOW_OUTPUT)]["date"]

# Robust result per topic for each window, one list per candidate.
# (The ideology variant stores bstr_ideo*_output pairs in SLIDING_WINDOW_OUTPUT and is handled identically.)
res_cand1 = [get_significant_result(window_pair[0]) for window_pair in SLIDING_WINDOW_OUTPUT]
res_cand2 = [get_significant_result(window_pair[1]) for window_pair in SLIDING_WINDOW_OUTPUT]
tmp_plot_ias[f"res_{cand1}"] = res_cand1
tmp_plot_ias[f"res_{cand2}"] = res_cand2

# Spread every topic's result into its own column, per candidate.
for t in tmp_plot_ias[f"res_{cand1}"].iloc[0]:
    tmp_plot_ias[f"{t}_{cand1}"] = tmp_plot_ias[f"res_{cand1}"].map(lambda res, topic=t: res[topic])
    tmp_plot_ias[f"{t}_{cand2}"] = tmp_plot_ias[f"res_{cand2}"].map(lambda res, topic=t: res[topic])

# NOTE(review): the file name hard-codes "window90" rather than using WINDOW_LEN.
tmp_plot_ias.to_csv(ROOTPATH + f"output/ias-sliding-window/cred_{year}_window90.csv", index=False)
# tmp_plot_ias.to_csv(ROOTPATH + f"output/ias-sliding-window/ideo_{year}_window90.csv", index=False)