## For the aggregated analysis at the topic level, we sum the raw topic counts directly, without first aggregating by a time unit.

In [1]:
import pandas as pd 
import numpy as np
import os, re
from datetime import datetime

from src.utils.downstream_aggregate import load_model_output 
from src.utils.downstream_sum import sum_headline_topvec, sum_survey_topvec, sum_tweet_topvec
from src.utils.downstream_sum import bootstrap_sum_topvec
from src.utils.data_loader import Headlines, Surveys, Tweets
from src.utils.downstream_process import clean_domain_url, trim_period
from src.utils.dict_loader import TopicDictionary

import yaml
# Read the project-wide settings from the YAML config file; the parsed
# result is what the rest of the notebook indexes as `configs[...]`.
with open("../../src/configs.yml", "r") as config_file:
    configs = yaml.safe_load(config_file)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yijingch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# input
ROOTPATH = configs["ROOTPATH"]
DATAPATH = configs["DATAPATH"]
CACHE_FPATH = configs["TOPVEC_CACHE_PATH"]
# Tag embedded in the cached topic-vector file names loaded below.
date_string = "032424"

# output
OUTPUT_FPATH = configs["SUM_TOPVEC_PATH"]

if not os.path.exists(OUTPUT_FPATH):
    # makedirs also creates any missing parent folders, whereas os.mkdir
    # raises FileNotFoundError when the parent does not exist yet.
    os.makedirs(OUTPUT_FPATH, exist_ok=True)
    print("Created output folder!")

## set up and load data

In [3]:
from src.utils.dict_configuration import dictionary2016, dictionary2020
year = 2020

# Pick the election-specific settings first, then load everything once below.
# Any year other than 2016 falls through to the 2020 settings.
if year == 2016:
    cand1, cand2 = "trump", "clinton"
    start_key, end_key = "START2016", "END2016"
    dictionary = dictionary2016
    domains_csv = "../../index/domains/domains_to_keep2016_coverage0.5.csv"
else:
    cand1, cand2 = "biden", "trump"
    start_key, end_key = "START2020", "END2020"
    dictionary = dictionary2020
    domains_csv = "../../index/domains/domains_to_keep2020_coverage0.5.csv"

# Study period boundaries come from the config file.
start = pd.to_datetime(configs[start_key])
end = pd.to_datetime(configs[end_key])

# Table of domains to keep (one row per domain).
domains_to_include_df = pd.read_csv(domains_csv)

print(start)
print(end)
print("# of domains to include:", len(domains_to_include_df))

Successfully loaded dictionary!
	# of unique topics: 27
	# of unique words: 1426
Successfully loaded dictionary!
	# of unique topics: 27
	# of unique words: 1453
2020-07-01 00:00:00
2020-11-30 00:00:00
# of domains to include: 804


## aggregate headlines

In [4]:
# Cached topic-vector files, one per candidate, in candidate order.
headline_cachepath1, headline_cachepath2 = (
    CACHE_FPATH + f"headline/{date_string}_{cand}{year}_topvec_cache.pkl"
    for cand in (cand1, cand2)
)

# Model outputs restricted to the [start, end] window.
headline_topics1 = load_model_output(headline_cachepath1, start=start, end=end)
headline_topics2 = load_model_output(headline_cachepath2, start=start, end=end)

# Raw headlines for the same year, trimmed to the same window.
headlines = Headlines(ROOTPATH + "data/", year=year, drop_duplicates=False)
headlines.trim(start=start, end=end)

In [5]:
# Preview the first rows of the domain inclusion table as a sanity check.
domains_to_include_df.head()

Unnamed: 0,domain,date,n_coverage,pct_coverage
0,12news.com,"{Timestamp('2020-08-12 00:00:00'), Timestamp('...",145,0.953947
1,21stcenturywire.com,"{Timestamp('2020-08-12 00:00:00'), Timestamp('...",128,0.842105
2,4threvolutionarywar.wordpress.com,"{Timestamp('2020-08-12 00:00:00'), Timestamp('...",131,0.861842
3,680news.com,"{Timestamp('2020-08-12 00:00:00'), Timestamp('...",108,0.710526
4,abc12.com,"{Timestamp('2020-08-17 00:00:00'), Timestamp('...",89,0.585526


### load domain labels (new version, using the refreshed `081123` label files)

In [6]:
# Domain label tables (tab-separated): fake-news flags, mixed flags,
# and MBFC ideology scores, in that order.
fake_df, mixed_df, ideo_df = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "../../index/domains/MASTER_fake_refreshed081123.tsv",
        "../../index/domains/MASTER_mixed_refreshed081123.tsv",
        "../../index/domains/ideo_domain_mbfc081123.tsv",
    )
)

In [7]:
# Split the included domains into credibility and ideology subsets.
domains_to_include = domains_to_include_df["domain"].tolist()
included = set(domains_to_include)

# Build each label set once instead of re-filtering the frames per subset.
fake_domains = set(fake_df[fake_df["fake_sum"] >= 1]["domain"].tolist())
mixed_domains = set(mixed_df[mixed_df["mixed_sum"] >= 1]["domain"].tolist())

# Low-credibility: included domains flagged as fake at least once.
# foxnews.com is manually moved from this group to the traditional group.
# discard() (unlike remove()) does not raise KeyError when foxnews.com is
# absent from the intersection, e.g. under a different year/coverage filter.
lowcs = included & fake_domains
lowcs.discard("foxnews.com")

# Traditional: included domains with neither a fake nor a mixed flag.
# NOTE: foxnews.com is added unconditionally, as in the original logic.
trads = included - fake_domains - mixed_domains
trads.add("foxnews.com")

# Ideology subsets from the MBFC score (-1 = left, 0 = center, 1 = right).
lefts = included & set(ideo_df[ideo_df["mbfc_ideo"] == -1]["domain"].tolist())
rights = included & set(ideo_df[ideo_df["mbfc_ideo"] == 1]["domain"].tolist())
centers = included & set(ideo_df[ideo_df["mbfc_ideo"] == 0]["domain"].tolist())

print("# of domains:", len(domains_to_include))
print("# of right-leaning domains:", len(rights))
print("# of center domains:", len(centers))
print("# of left-leaning domains:", len(lefts))
print("# of low-credibility domains:", len(lowcs))
print("# of traditional domains:", len(trads))

# of domains: 804
# of right-leaning domains: 232
# of center domains: 134
# of left-leaning domains: 249
# of low-credibility domains: 220
# of traditional domains: 505


### load popularity list (uncomment this block if we want to weight by popularity)

Note (032424): the current version does not use popularity weights.

In [8]:
# Popularity weighting is optional: the loading code below is kept commented
# out, so `popularity_dict` stays empty unless this block is re-enabled.

# # load popularity weights
# popularity_df = pd.read_csv("../../index/domains/domain_popularity.csv")
# popularity_df

# create a popularity dict {domain: weight}
popularity_dict = {}

# NOTE: the two assignments below write to the same key, so if both are
# uncommented the second ("ave_m") overwrites the first ("ave_m_log10");
# keep only the weight you actually want.
# for _,row in popularity_df.iterrows():
#     popularity_dict[row["domain"]] = row["ave_m_log10"]
#     popularity_dict[row["domain"]] = row["ave_m"]

In [9]:
# Sum headline topic vectors (plus bootstrap resamples) for every domain
# subset and both candidates, saving each result as a .npy file.
SUBSET_LABEL = ["", "_lowc", "_trad", "_left", "_center", "_right"]
DOMAIN_LIST = [domains_to_include, lowcs, trads, lefts, centers, rights]
frac = .8  # passed as sample_frac to every bootstrap call

weight_by_popularity = False
normalize_by_snapshot = True

OUTPUT_FOLDER = "headline-filter0.5-nopopw-normsnap"

# Build every output path from the same joined directory. The original code
# mixed "{OUTPUT_FPATH}/{OUTPUT_FOLDER}" and "{OUTPUT_FPATH}{OUTPUT_FOLDER}",
# which only point to the same place when OUTPUT_FPATH ends with a separator.
output_dir = os.path.join(OUTPUT_FPATH, OUTPUT_FOLDER)
bootstrap_dir = os.path.join(output_dir, "bootstrap")
# Creates output_dir as well; a no-op when both folders already exist.
os.makedirs(bootstrap_dir, exist_ok=True)

for this_subset, this_list in zip(SUBSET_LABEL, DOMAIN_LIST):
    print("Aggregating:", this_subset)

    # Keyword arguments common to all four calls for this subset.
    shared_kwargs = dict(
        dictionary=dictionary,
        select_domains=this_list,
        weight_by_popularity=weight_by_popularity,
        popularity_dict=popularity_dict,
        normalize_by_snapshot=normalize_by_snapshot,
    )

    # Point estimates for both candidates first ...
    sum_headline1 = sum_headline_topvec(
        output_df=headline_topics1, raw_df=headlines.df_cand1, cand=cand1,
        print_info=True, **shared_kwargs)
    sum_headline2 = sum_headline_topvec(
        output_df=headline_topics2, raw_df=headlines.df_cand2, cand=cand2,
        print_info=True, **shared_kwargs)

    # ... then the bootstrap resamples, in the same order as before.
    bstr_headline_arr1 = bootstrap_sum_topvec(
        data_source="headline", output_df=headline_topics1, cand=cand1,
        raw_df=headlines.df_cand1, sample_frac=frac, **shared_kwargs)
    bstr_headline_arr2 = bootstrap_sum_topvec(
        data_source="headline", output_df=headline_topics2, cand=cand2,
        raw_df=headlines.df_cand2, sample_frac=frac, **shared_kwargs)

    np.save(os.path.join(output_dir, f"{cand1}{year}_SUM_topvecs{this_subset}.npy"), sum_headline1)
    np.save(os.path.join(bootstrap_dir, f"{cand1}{year}_bstr_SUM_topvecs{this_subset}.npy"), bstr_headline_arr1)

    np.save(os.path.join(output_dir, f"{cand2}{year}_SUM_topvecs{this_subset}.npy"), sum_headline2)
    np.save(os.path.join(bootstrap_dir, f"{cand2}{year}_bstr_SUM_topvecs{this_subset}.npy"), bstr_headline_arr2)

# this would take a while
# COMPLETE HEADLINE DATASET
# ~ 82 mins for 2020 (date 78 mins)
# ~ 51 mins for 2016 (date xx mins)

Aggregating: 
	# of unique domains: 741
	# of unique domains: 784
progress: 0.0
progress: 0.1
progress: 0.2
progress: 0.3
progress: 0.4
progress: 0.5
progress: 0.6
progress: 0.7
progress: 0.8
progress: 0.9
(200, 27)
progress: 0.0
progress: 0.1
progress: 0.2
progress: 0.3
progress: 0.4
progress: 0.5
progress: 0.6
progress: 0.7
progress: 0.8
progress: 0.9
(200, 27)
Aggregating: _lowc
	# of unique domains: 187
	# of unique domains: 210
progress: 0.0
progress: 0.1
progress: 0.2
progress: 0.3
progress: 0.4
progress: 0.5
progress: 0.6
progress: 0.7
progress: 0.8
progress: 0.9
(200, 27)
progress: 0.0
progress: 0.1
progress: 0.2
progress: 0.3
progress: 0.4
progress: 0.5
progress: 0.6
progress: 0.7
progress: 0.8
progress: 0.9
(200, 27)
Aggregating: _trad
	# of unique domains: 481
	# of unique domains: 495
progress: 0.0
progress: 0.1
progress: 0.2
progress: 0.3
progress: 0.4
progress: 0.5
progress: 0.6
progress: 0.7
progress: 0.8
progress: 0.9
(200, 27)
progress: 0.0
progress: 0.1
progress: 0.2
