## Select domains to include for downstream analysis

In [1]:
import pandas as pd 
import numpy as np
from src.utils.data_loader import Headlines
from src.utils.downstream_process import trim_period

import yaml
# Parse the shared project settings (path is relative to the working directory).
with open("../../src/configs.yml", "r") as config_file:
    configs = yaml.safe_load(config_file)

DATAPATH = configs["DATAPATH"]

# Start/end of the analysis window for each year, converted with pd.to_datetime.
START2016, END2016 = (pd.to_datetime(configs[key]) for key in ("START2016", "END2016"))
START2020, END2020 = (pd.to_datetime(configs[key]) for key in ("START2020", "END2020"))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/yijingch/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yijingch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load data and labels

In [2]:
# Read each year's tab-separated headline file and pass it straight through the
# project helper trim_period together with that year's configured start/end.
# NOTE(review): trim_period presumably keeps only rows dated inside
# [start, end] -- confirm against src.utils.downstream_process.
headlines2016 = trim_period(
    pd.read_csv("../../data/headline/headlines_2016_all.tsv", sep="\t"),
    start=START2016,
    end=END2016,
)
headlines2020 = trim_period(
    pd.read_csv("../../data/headline/headlines_2020_all.tsv", sep="\t"),
    start=START2020,
    end=END2020,
)

In [3]:
# Number of distinct dates present in each year's headlines. These counts are
# the denominators used for the per-domain coverage ratio further down.
n_days2016 = headlines2016["date"].nunique()
n_days2020 = headlines2020["date"].nunique()

print("# of unique days")
print("\t2016:", n_days2016)
print("\t2020:", n_days2020)

# Output recorded from a previous run:
# 	2016: 152
# 	2020: 152

# of unique days
	2016: 152
	2020: 152


In [4]:
# One-column frames listing every distinct domain seen in each year.
domains2016 = pd.DataFrame({"domain": headlines2016["domain"].unique()})
domains2020 = pd.DataFrame({"domain": headlines2020["domain"].unique()})

In [5]:
# For every domain, collect the set of distinct dates on which it has at least
# one headline. Result: one row per domain with columns "domain" and "date".
aggr_domain_date2016 = (
    headlines2016.groupby("domain")["date"]
    .agg(set)
    .reset_index()
)
aggr_domain_date2020 = (
    headlines2020.groupby("domain")["date"]
    .agg(set)
    .reset_index()
)

In [6]:
# n_coverage   = number of distinct dates on which the domain appears
# pct_coverage = n_coverage divided by the number of distinct dates observed in
#                that year (a fraction between 0 and 1, not a percentage)
aggr_domain_date2016["n_coverage"] = aggr_domain_date2016["date"].map(len)
aggr_domain_date2016["pct_coverage"] = aggr_domain_date2016["n_coverage"].div(n_days2016)

aggr_domain_date2020["n_coverage"] = aggr_domain_date2020["date"].map(len)
aggr_domain_date2020["pct_coverage"] = aggr_domain_date2020["n_coverage"].div(n_days2020)

In [7]:
# Display the 2016 per-domain table (domain, set of dates, n_coverage, pct_coverage).
aggr_domain_date2016

Unnamed: 0,domain,date,n_coverage,pct_coverage
0,100percentfedup.com,"{2016-10-24 00:00:00, 2016-10-12 00:00:00, 201...",7,0.046053
1,1011now.com,"{2016-10-11 00:00:00, 2016-07-19 00:00:00, 201...",6,0.039474
2,10news.com,{2016-10-17 00:00:00},1,0.006579
3,10tv.com,"{2016-11-13 00:00:00, 2016-11-22 00:00:00, 201...",77,0.506579
4,11thhouronline.com,"{2016-07-19 00:00:00, 2016-11-28 00:00:00, 201...",3,0.019737
...,...,...,...,...
4975,zdnet.com,"{2016-11-13 00:00:00, 2016-09-14 00:00:00, 201...",47,0.309211
4976,zerohedge.com,"{2016-11-13 00:00:00, 2016-08-25 00:00:00, 201...",137,0.901316
4977,zip06.com,"{2016-11-13 00:00:00, 2016-08-21 00:00:00, 201...",12,0.078947
4978,zmescience.com,"{2016-11-13 00:00:00, 2016-11-15 00:00:00, 201...",14,0.092105


In [8]:
# Minimum fraction of observed days on which a domain must appear to be kept.
thres = 0.5

# Keep the domains whose coverage reaches the threshold and renumber the rows.
keep2016 = aggr_domain_date2016.loc[
    aggr_domain_date2016["pct_coverage"].ge(thres)
].reset_index(drop=True)
keep2020 = aggr_domain_date2020.loc[
    aggr_domain_date2020["pct_coverage"].ge(thres)
].reset_index(drop=True)

print("# of qualified domains:")
print("\t2016:", keep2016.shape[0])
print("\t2020:", keep2020.shape[0])

# Output recorded from a previous run (thres = 0.5):
# 	2016: 443
# 	2020: 805

# of qualified domains:
	2016: 443
	2020: 805


In [9]:
# Look up rows in the retained 2020 domains whose name contains "rumormillnews".
keep2020.loc[keep2020["domain"].str.contains("rumormillnews")]

Unnamed: 0,domain,date,n_coverage,pct_coverage
586,rumormillnews.com,"{2020-08-12 00:00:00, 2020-08-17 00:00:00, 202...",151,0.993421


In [10]:
# Remove rumormillnews.com from the retained domains of both years.
# The original row labels are kept (the index is not renumbered here).
domains_to_exclude = ["rumormillnews.com"]
keep2016 = keep2016.loc[~keep2016["domain"].isin(domains_to_exclude)]
keep2020 = keep2020.loc[~keep2020["domain"].isin(domains_to_exclude)]

In [11]:
# Sanity check after the exclusion step: no "rumormillnews" row should remain in
# EITHER year. The domain was found in keep2020 before filtering, so inspecting
# keep2016 alone would not confirm that the filter worked where it mattered.
# Expected display: an empty frame.
pd.concat([keep2016, keep2020]).loc[
    lambda frame: frame["domain"].str.contains("rumormillnews")
]

Unnamed: 0,domain,date,n_coverage,pct_coverage


In [12]:
# Write the retained domains for each year to CSV without the row index.
# The threshold value is part of the file name, so each threshold setting
# writes to its own file.
out_path2016 = f"../../index/domains/domains_to_keep2016_coverage{thres}.csv"
out_path2020 = f"../../index/domains/domains_to_keep2020_coverage{thres}.csv"

keep2016.to_csv(out_path2016, index=False)
keep2020.to_csv(out_path2020, index=False)