# Count of URLs scraped per outlet by publication year (Table 4)

In [1]:
import os
import pandas as pd

from usrightmedia.shared.media_references import get_mediacloud_outlet_ids

In [2]:
from inca import Inca
# INCA client shared by the rest of the notebook; `retrieve_urls` reads stored
# documents through its `database` attribute.
myinca = Inca()

INFO:INCA:Providing verbose output


In [3]:
# matplotlib messages are still logged even though disable_existing_loggers=yes
# is set in logging_config.yaml (see https://stackoverflow.com/a/51529172/7016397).
# Workaround: raise matplotlib's logger level to WARNING *before* the project
# logger is created below.
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from usrightmedia.shared.loggers import get_logger
# NOTE(review): the log filename '01-mediacloud-dataset' does not obviously match
# this notebook's topic (Table 4 URL counts) — confirm it was not carried over
# from another notebook.
LOGGER = get_logger(filename = '01-mediacloud-dataset', logger_type='main')

# Relative path of the directory that receives the exported table files.
dir_notes = os.path.join('..', '..', 'data', '04-notes')

## 1.0 Retrieve Outlets' Media Cloud IDs

In [4]:
# Fetch the outlets with their Media Cloud IDs via the project helper, then
# display the resulting table.
df_outlets = get_mediacloud_outlet_ids()
df_outlets

Unnamed: 0,outlet,media_id
0,American Renaissance,26186
1,Breitbart,19334
2,Daily Caller,18775
3,Daily Stormer,113988
4,Fox News,1092
5,Gateway Pundit,25444
6,InfoWars,18515
7,Newsmax,25349
8,One America News,127733
9,Rush Limbaugh,24669


## 2.0 Retrieve URLs

In [5]:
# Doctype names, one per outlet; each entry is handed to
# `myinca.database.doctype_generator` by `retrieve_urls`.
outlet_doctypes = [
    "americanrenaissance",
    "breitbart",
    "dailycaller",
    "dailystormer",
    "foxnews",
    "gatewaypundit",
    "infowars",
    "newsmax",
    "oneamericanews",
    "rushlimbaugh",
    "seanhannity",
    "vdare",
    "washingtonexaminer",
]

INCA US media scraper module: https://github.com/wlmwng/inca/blob/usrightmedia/base/inca/scrapers/usmedia_scraper.py

- `publish_date` is stored in INCA as UTC (converted from implicit EST); see: \
https://github.com/wlmwng/us-right-media/blob/develop/usrightmedia/shared/datetime_utils.py \
https://github.com/wlmwng/us-right-media/blob/develop/usrightmedia/code/03-mediacloud/01-mediacloud-prep-urls.ipynb

In [6]:
def retrieve_urls(outlet_doctypes, database=None):
    """Collect the scraped URL records of the given outlets into one DataFrame.

    Parameters
    ----------
    outlet_doctypes : iterable of str
        Doctype names to read; each one is passed to
        ``database.doctype_generator``.
    database : object, optional
        Object exposing ``doctype_generator(doctype)``, which yields documents
        carrying a ``"_source"`` mapping. Defaults to the notebook-level
        ``myinca.database``.

    Returns
    -------
    pandas.DataFrame
        Columns ``doctype``, ``url_id``, ``url`` and ``publish_year``, sorted by
        ``publish_year`` with a fresh 0..n-1 index. Empty (but with these
        columns) when no documents are found.

    Raises
    ------
    KeyError
        If a document's ``"_source"`` lacks one of the required fields.

    Notes
    -----
    ``publish_year`` is read directly from the stored ``publish_date``.
    NOTE(review): the notebook states that ``publish_date`` is stored as UTC, so
    an article published late on 31 December local time would be counted in the
    following year — confirm this is acceptable for the table.
    """
    if database is None:
        # Fall back to the INCA client created at the top of the notebook.
        database = myinca.database

    source_fields = ["doctype", "url_id", "url", "publish_date"]

    # One flat list of records across all doctypes, in the order the doctypes
    # were given; only the fields needed for the table are copied.
    records = [
        {field: doc["_source"][field] for field in source_fields}
        for doctype in outlet_doctypes
        for doc in database.doctype_generator(doctype)
    ]

    # Passing `columns` keeps the expected columns even when `records` is empty,
    # so the steps below also work for an empty selection.
    df_all = pd.DataFrame(records, columns=source_fields)
    df_all["publish_date"] = pd.to_datetime(df_all["publish_date"])
    df_all["publish_year"] = df_all["publish_date"].dt.year
    df_all = df_all[["doctype", "url_id", "url", "publish_year"]]
    df_all = df_all.sort_values("publish_year").reset_index(drop=True)
    return df_all

In [7]:
# Read the URL records of every listed outlet doctype into a single DataFrame.
df = retrieve_urls(outlet_doctypes)

100%|██████████| 9838/9838 [00:04<00:00, 1978.14it/s]
100%|██████████| 149241/149241 [00:59<00:00, 2520.01it/s]
100%|██████████| 121822/121822 [00:46<00:00, 2623.10it/s]
100%|██████████| 15823/15823 [00:05<00:00, 2836.08it/s]
100%|██████████| 264620/264620 [01:50<00:00, 2391.25it/s]
100%|██████████| 39401/39401 [00:14<00:00, 2750.15it/s]
100%|██████████| 28453/28453 [00:11<00:00, 2529.41it/s]
100%|██████████| 71146/71146 [00:27<00:00, 2598.67it/s]
100%|██████████| 117287/117287 [00:42<00:00, 2736.99it/s]
100%|██████████| 9396/9396 [00:05<00:00, 1587.52it/s]
100%|██████████| 5647/5647 [00:02<00:00, 2734.37it/s]
100%|██████████| 19643/19643 [00:10<00:00, 1946.04it/s]
100%|██████████| 71710/71710 [00:28<00:00, 2529.00it/s]


In [8]:
# Preview the combined table (doctype, url_id, url, publish_year).
df

Unnamed: 0,doctype,url_id,url,publish_year
0,dailycaller,908481477,http://dailycaller.com/2016/11/10/trumps-trans...,2016
1,newsmax,621116989,http://www.newsmax.com/Newsfront/loretta-lynch...,2016
2,newsmax,621086517,http://www.newsmax.com/Politics/trump-mnuchin-...,2016
3,dailycaller,453280686,http://dailycaller.com/2016/04/26/sara-sampaio...,2016
4,dailycaller,459325972,http://dailycaller.com/2016/05/03/has-america-...,2016
...,...,...,...,...
924022,foxnews,1606913375,https://www.foxnews.com/us/some-michigan-law-e...,2020
924023,foxnews,1606896898,https://www.foxnews.com/world/mexicos-cartels-...,2020
924024,foxnews,1606896894,https://www.foxnews.com/lifestyle/should-runne...,2020
924025,foxnews,1607218561,http://feeds.foxnews.com/~r/foxnews/health/~3/...,2020


## 3.0 Summarize by outlet and publish year

In [9]:
# Count URLs per outlet (rows) and publication year (columns); margins=True
# adds the "All" totals.
url_counts = df.pivot_table(
    index="doctype",
    columns="publish_year",
    values="url",
    aggfunc="count",
    margins=True,
)

# Lift the row-display limit only for this output so no outlet is elided.
with pd.option_context("display.max_rows", None):
    display(url_counts)

publish_year,2016,2017,2018,2019,2020,All
doctype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
americanrenaissance,1693,1795,1749,1825,2776,9838
breitbart,30083,31771,24591,25908,36888,149241
dailycaller,19359,26241,25856,21268,29098,121822
dailystormer,4361,2953,1107,3025,4377,15823
foxnews,47737,48922,45156,47632,75173,264620
gatewaypundit,4810,8095,5988,7195,13313,39401
infowars,2094,2392,8071,7537,8359,28453
newsmax,10568,11093,7706,9566,32213,71146
oneamericanews,7568,21018,22732,23282,42687,117287
rushlimbaugh,1471,2098,2082,2017,1728,9396


## 4.0 Export

In [10]:
# Persist the URL-level table (one row per scraped URL) as pickle and CSV.
# NOTE(review): the file name advertises the Table 4 *counts*, but what is
# written here is the row-level `df`, not the pivot table shown in section 3.0 —
# confirm that this is intended.
table_4_stem = 'Table_4__count_of_URLs_scraped_per_outlet_by_publication_year'

# Create the output directory if needed so the export does not fail on a fresh
# checkout where it is missing.
os.makedirs(dir_notes, exist_ok=True)

df.to_pickle(os.path.join(dir_notes, f'{table_4_stem}.pkl'))
df.to_csv(os.path.join(dir_notes, f'{table_4_stem}.csv'), index=False)