# Sample of Media Cloud URLs
**Purpose**:
- Check whether URLs from Media Cloud are accessible
- Check whether NewsPlease is able to retrieve content

In [1]:
import warnings
# Ignore UserWarning messages from here on. NOTE: this filter matches every UserWarning, not only the pandas one quoted below.
warnings.simplefilter(action='ignore', category=UserWarning)
# When urlExpander is given a timezone-aware datetime for 'publish_date', it makes date buckets (year, quarter, month, week).
# These buckets don't retain timezone info, which leads to the following warning:
# /home/[username]/.virtualenvs/urlExpander/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py:1143:
# UserWarning: Converting to PeriodArray/Index representation will drop timezone information.

import pandas as pd
import os
import json
import urlexpander

In [2]:
# Main logger for this notebook, created through the project's shared logging helper.
from usrightmedia.shared.loggers import get_logger

LOGGER = get_logger(
    filename='02-mediacloud-sample',
    logger_type='main',
)

In [3]:
# Relative path of the folder that holds the Media Cloud sample files.
_sample_path_parts = ('..', '..', 'data', '02-intermediate', '03-mediacloud-sample')
dir_sample = os.path.join(*_sample_path_parts)

In [4]:
# Load the pickled sample of Media Cloud URLs from the sample folder.
path_sample_pkl = os.path.join(dir_sample, 'mediacloud_urls_sampled.pkl')
df_sample = pd.read_pickle(path_sample_pkl)

In [5]:
# Convert the sample into a list with one dict per row.
urls_to_fetch = df_sample.to_dict(orient="records")

#### Fetch sample

In [6]:
# Name of the JSON-lines file that holds the fetched sample.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
fn_sample = "mediacloud_urls_sampled_fetched.jsonl"

```
url 0, fetch_url: https://www.thegatewaypundit.com/2020/04/joe-biden-appears-to-be-reading-from-note-cards-during-media-spot-video/
url 1, fetch_url: http://feedproxy.google.com/~r/breitbart/~3/bh9JQvQPihk/
url 2, fetch_url: https://www.amren.com/news/2019/03/zimbabwe-needs-r3-2-billion-in-aid-donations-to-feed-its-people-un/
url 3, fetch_url: https://vdare.com/posts/verified-twitter-users-wrongfully-blame-white-supremacy-for-dayton-shooting
url 4, fetch_url: http://dailycaller.com/2016/05/25/state-dept-admits-that-hillary-clinton-failed-to-turn-over-secretive-email/
url 5, fetch_url: https://www.infowars.com/ex-fbi-chief-strzok-belongs-in-prison-for-fabricating-information/
...

```

In [7]:
# NOTE(review): the fetch call below is commented out — presumably so that re-running the notebook
# does not fetch every URL again; confirm. The cells that follow read `fn_sample` from `dir_sample`,
# so that file must already exist while this call stays disabled.
# urlexpander.fetch_urls_to_file(
#     urls=urls_to_fetch,
#     fetch_func=urlexpander.fetch_url,
#     path=dir_sample,
#     filename=fn_sample,
#     write_mode="a",
# )

In [8]:
# Read the fetched records back from the file and decode each one from JSON.
g_sample = urlexpander.load_fetched_from_file(path=dir_sample, filename=fn_sample)
r_sample = list(map(json.loads, g_sample))

In [9]:
# Number of fetched records that were loaded from the file.
len(r_sample)

361

In [10]:
# Tabulate the decoded records: one row per record.
df_fetched = pd.DataFrame(data=r_sample)

In [11]:
# List the columns available on the fetched records.
df_fetched.columns

Index(['original_url', 'url_id', 'outlet', 'publish_year', 'title',
       'ap_syndicated', 'themes', 'publish_date', 'publish_quarter',
       'publish_month', 'publish_week', 'resolved_url', 'resolved_domain',
       'resolved_text', 'response_url', 'response_code', 'response_reason',
       'fetch_error', 'newsplease_maintext', 'resolved_netloc',
       'resolved_url_v1', 'resolved_url_v2', 'resolved_url_v3',
       'resolved_url_v4', 'is_generic_url', 'FETCH_FUNC', 'FETCH_AT'],
      dtype='object')

In [12]:
# '600' is a custom response code from an old version of urlExpander; the public fork of urlExpander uses NaN.
# dropna=False keeps groups whose key is NaN. By default groupby silently drops such rows, which would
# hide fetches with a NaN response code from this tally.
response_counts = (
    df_fetched
    .groupby(['outlet', 'FETCH_FUNC', 'response_code'], dropna=False)
    .size()
    .to_frame('count')
)
# Lift the row limit only while rendering so the full table is shown.
with pd.option_context('display.max_rows', None):
    display(response_counts)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
outlet,FETCH_FUNC,response_code,Unnamed: 3_level_1
American Renaissance,try_urlexpander,200,25
Breitbart,try_urlexpander,200,30
Daily Caller,try_urlexpander,200,30
Daily Stormer,try_urlexpander,200,18
Daily Stormer,try_waybackpy,200,8
Daily Stormer,try_waybackpy,600,1
Daily Stormer,try_waybackpy,602,1
Fox News,try_urlexpander,200,29
Fox News,try_waybackpy,200,1
Gateway Pundit,try_urlexpander,200,29


In [34]:
# Rows whose NewsPlease main text is the literal string "None",
# counted per outlet / original URL / resolved URL / response code.
is_maintext_none = df_fetched['newsplease_maintext'] == "None"
(
    df_fetched.loc[is_maintext_none]
    .groupby(['outlet', 'original_url', 'resolved_url', 'response_code'])
    .size()
    .to_frame('count')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
outlet,original_url,resolved_url,response_code,Unnamed: 4_level_1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-AllContent/~3/8IWd6krep9M/newest_outrage_trump_jokes_about_crying_baby,https://www.rushlimbaugh.com/daily/2016/08/02/newest_outrage_trump_jokes_about_crying_baby/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-AllContent/~3/E89GzLT2k88/bernie_got_schlonged_on_delegates,https://www.rushlimbaugh.com/daily/2016/02/10/bernie_got_schlonged_on_delegates/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-AllContent/~3/PfrR_G3zOEM/why_trump_look_at_europe,https://www.rushlimbaugh.com/daily/2016/11/04/why_trump_look_at_europe/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-AllContent/~3/wHFpC2zIgp8/the_fbi_vs_apple,https://www.rushlimbaugh.com/daily/2016/02/17/the_fbi_vs_apple/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-Transcripts/~3/M2CGU-wlNYY/,https://www.rushlimbaugh.com/daily/2020/05/15/the-chicoms-vow-to-meddle-in-the-2020-election/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,http://feedproxy.google.com/~r/RushLimbaugh-Transcripts/~3/_oDegznCWyQ/,https://www.rushlimbaugh.com/daily/2020/05/12/explaining-trumps-answer-to-the-cbs-news-provocateur/?utm_source=feedburner&utm_medium=feed,200,1
Rush Limbaugh,https://www.rushlimbaugh.com/daily/2017/01/19/a-dangerous-act-of-irresponsibility-on-cnn/,https://www.rushlimbaugh.com/daily/2017/01/19/a-dangerous-act-of-irresponsibility-on-cnn/,200,1
Rush Limbaugh,https://www.rushlimbaugh.com/daily/2017/04/14/trumps-policies-are-popular/,https://www.rushlimbaugh.com/daily/2017/04/14/trumps-policies-are-popular/,200,1
Rush Limbaugh,https://www.rushlimbaugh.com/daily/2017/05/23/mick-mulvaney-unveils-the-revolutionary-trump-budget/,https://www.rushlimbaugh.com/daily/2017/05/23/mick-mulvaney-unveils-the-revolutionary-trump-budget/,200,1
Rush Limbaugh,https://www.rushlimbaugh.com/daily/2017/06/20/frank-luntz-focus-group-of-trump-voters-shocks-washington/,https://www.rushlimbaugh.com/daily/2017/06/20/frank-luntz-focus-group-of-trump-voters-shocks-washington/,200,1


In [14]:
# Count the distinct NewsPlease main-text values among the Rush Limbaugh rows.
rush_rows = df_fetched.loc[df_fetched['outlet'] == 'Rush Limbaugh']
rush_maintext_counts = (
    rush_rows
    .groupby(['outlet', 'newsplease_maintext'])
    .size()
    .to_frame('count')
)
with pd.option_context('display.max_rows', None):
    display(rush_maintext_counts)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
outlet,newsplease_maintext,Unnamed: 2_level_1
Rush Limbaugh,,1
Rush Limbaugh,,24


In [50]:
# Washington Examiner rows whose NewsPlease main text is the literal string "None"; show their columns.
examiner_none_mask = (
    (df_fetched['outlet'] == 'Washington Examiner')
    & (df_fetched['newsplease_maintext'] == "None")
)
with pd.option_context('display.max_rows', None, 'display.max_colwidth', 200):
    display(df_fetched.loc[examiner_none_mask].columns)

Index(['original_url', 'url_id', 'outlet', 'publish_year', 'title',
       'ap_syndicated', 'themes', 'publish_date', 'publish_quarter',
       'publish_month', 'publish_week', 'resolved_url', 'resolved_domain',
       'resolved_text', 'response_url', 'response_code', 'response_reason',
       'fetch_error', 'newsplease_maintext', 'resolved_netloc',
       'resolved_url_v1', 'resolved_url_v2', 'resolved_url_v3',
       'resolved_url_v4', 'is_generic_url', 'FETCH_FUNC', 'FETCH_AT'],
      dtype='object')

In [37]:
# Resolved text of the row labelled 41, taken from the Washington Examiner rows whose NewsPlease
# main text is the literal string "None".
# A single .loc[mask, column] selection replaces the chained df.loc[mask][column][41] lookup, and the
# trailing .loc[41] makes it explicit that 41 is an index label rather than a position.
# As before, a KeyError is raised if row 41 does not satisfy the filter.
examiner_none_mask = (
    (df_fetched['outlet'] == 'Washington Examiner')
    & (df_fetched['newsplease_maintext'] == "None")
)
d = df_fetched.loc[examiner_none_mask, 'resolved_text'].loc[41]