In [618]:
# Import statements
import urllib
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from dateutil.parser import parse
from dateutil import parser
from collections import defaultdict
import re
import pickle
import os
import pandas as pd
import numpy as np

In [1192]:
from urllib.error import URLError

def soupify(url):
    if url[:8] != 'https://':
        url = 'https://'+url
        
    try:
        req = urllib.request.Request(url, headers={'User-Agent' : "Magic Browser"}) 
        con = urllib.request.urlopen( req )
        html = con.read()
        soup = BeautifulSoup(html,'html.parser')
        return soup
    except URLError:
        return None

# Contents:
* [Get URLs](#Get-URLs)
    * [Climate change](#Climate-change)
        * [URLs from search](#URLs-from-search)
        * [MediaCloud URLs](#MediaCloud-urls)
        * [Reid Google search URLs](#Reid-Urls)
    * [Vaccines](#Vaccines)
* [Post-processing](#Post-processing)
    * [Filter out non-(relevant)-article URLs](#Filter-out-non-(relevant)-article-URLs)
    * [Check consistency of coding](#Check-consistency-of-coding)
    * [Deduplicate](#Dedup)
* [Scraping fulltext and missing meta info](#Scraping-fulltext-and-missing-meta-info)
* [Dedup via title similarity](#Dedup-via-title-similarity)
* [Summary stats](#Summary-stats)

# Get URLs

We use SerpApi (https://serpapi.com/search-api) to scrape Google search results from querying climate change-related keywords on various websites.

In [58]:
SERP_API_KEY = "481df24348cbec5d00f65baa55986d30b3b1ef2b09c5ab9de0f667dd43ce51d2"
from serpapi.google_search_results import GoogleSearchResults

The following query parameters restrict searches to desktop, US-based, English-language results:

In [60]:
query_params = {"location":"United States", "device":"desktop", "hl":"en", "gl":"us", "serp_api_key":SERP_API_KEY}
client = GoogleSearchResults(query_params)

In [224]:
def do_serpapi(domain,keyword):
    keyword = keyword.replace('_',' ').replace('+',' ').replace('-',' ')
    client.params_dict["q"] = "site:{} {}".format(domain,keyword) # Update query to restrict to particular site
    print('Searching w/ query: {}...'.format(client.params_dict["q"]))
    page_no = 1 
    client.params_dict["start"] = (page_no-1)*10                  # Update pagination
    
    dict_list = []
    while 'error' not in client.get_dict(): # Get results as long as more pages exist
        dict_list.append(client.get_dict())
        page_no += 1
        client.params_dict["start"] = (page_no-1)*10 
    
    return dict_list

In [234]:
def parse_serpapi_results(d_list):
    meta = []
    for d in d_list:
        if 'error' in d:
            print(d['error'])
        elif d['search_metadata']['status'] == 'Success':
            res = d['organic_results']
            page_no = d['search_information']['page_number'] if 'page_number' in d['search_information'] else 1
            print('Number of results on page {}: {}'.format(page_no,len(res)))
            meta.extend([(x['title'],x['link'],x['date']) if 'date' in x
                        else (x['title'],x['link']) for x in res])
        else:
            print("API get failure")
    return meta

## Climate change

In [239]:
# Keywords to use for climate change searches
CC_KEYWORDS = ['climate_change','global_warming','fossil_fuels','carbon_dioxide','co2']

The below 3 sections generate the following data structures:
    * google_search_res_climate_change.pkl, a dictionary with outer keys for domains and inner keys for search terms;
    * mediacloud_df.pkl, a dataframe w/ output from MediaCloud;
    * reid_urls, a dictionary with urls from each of 6 domains searched via Google by Reid

### URLs from search

In [202]:
# Initialize default nested dict with outer keys for each media domain and inner keys for each keyword.
#URLS_PER_DOMAIN = defaultdict(dict)

In [108]:
# Domains to search for climate change articles
DOMAINS = ['www.foxnews.com','www.breitbart.com','www.theblaze.com','www.pjmedia.com','www.nationalreview.com',
           'www.thenation.com','www.buzzfeednews.com','www.vox.com','www.washingtonpost.com','www.progressive.org',
          'www.nytimes.com','www.motherjones.com','www.democracynow.org']

In [235]:
# Query each domain for each keyword
for DOMAIN in DOMAINS:
    for KW in CC_KEYWORDS:
        dl = do_serpapi(DOMAIN,KW)
        results = parse_serpapi_results(dl)
        URLS_PER_DOMAIN[DOMAIN][KW] = results

Searching w/ query: site:www.theblaze.com climate change...
Number of results on page 1: 10
Number of results on page 2: 10
Number of results on page 3: 10
Number of results on page 4: 10
Number of results on page 5: 10
Number of results on page 6: 10
Number of results on page 7: 10
Number of results on page 8: 10
Number of results on page 9: 10
Number of results on page 10: 10
Number of results on page 11: 10
Number of results on page 12: 10
Number of results on page 13: 10
Number of results on page 14: 10
Number of results on page 15: 10
Number of results on page 16: 10
Number of results on page 17: 10
Number of results on page 18: 10
Number of results on page 19: 10
Number of results on page 20: 10
Number of results on page 21: 10
Number of results on page 22: 10
Number of results on page 23: 10
Number of results on page 24: 10
Number of results on page 25: 10
Number of results on page 26: 10
Number of results on page 27: 10
Number of results on page 28: 10
Number of results on page

Number of results on page 1: 10
Number of results on page 2: 10
Number of results on page 3: 10
Number of results on page 4: 10
Number of results on page 5: 10
Number of results on page 6: 10
Number of results on page 7: 10
Number of results on page 8: 10
Number of results on page 9: 9
Number of results on page 10: 10
Number of results on page 11: 10
Number of results on page 12: 10
Number of results on page 13: 10
Number of results on page 14: 10
Number of results on page 15: 10
Number of results on page 16: 10
Number of results on page 17: 10
Number of results on page 18: 10
Number of results on page 19: 10
Number of results on page 20: 10
Number of results on page 21: 10
Number of results on page 22: 10
Number of results on page 23: 10
Number of results on page 24: 10
Number of results on page 25: 10
Number of results on page 26: 10
Number of results on page 27: 10
Number of results on page 28: 1
Searching w/ query: site:www.buzzfeednews.com climate change...
Number of results on pa

Number of results on page 1: 10
Number of results on page 2: 10
Number of results on page 3: 10
Number of results on page 4: 10
Number of results on page 5: 10
Number of results on page 6: 10
Number of results on page 7: 10
Number of results on page 8: 10
Number of results on page 9: 10
Number of results on page 10: 10
Number of results on page 11: 10
Number of results on page 12: 10
Number of results on page 13: 10
Number of results on page 14: 10
Number of results on page 15: 10
Number of results on page 16: 10
Number of results on page 17: 10
Number of results on page 18: 10
Number of results on page 19: 10
Number of results on page 20: 10
Number of results on page 21: 10
Number of results on page 22: 10
Number of results on page 23: 10
Number of results on page 24: 10
Number of results on page 25: 10
Number of results on page 26: 10
Number of results on page 27: 10
Number of results on page 28: 10
Number of results on page 29: 10
Number of results on page 30: 10
Number of results o

Number of results on page 1: 10
Number of results on page 2: 10
Number of results on page 3: 9
Number of results on page 4: 10
Number of results on page 5: 10
Number of results on page 6: 10
Number of results on page 7: 10
Number of results on page 8: 10
Number of results on page 9: 10
Number of results on page 10: 10
Number of results on page 11: 10
Number of results on page 12: 10
Number of results on page 13: 10
Number of results on page 14: 10
Number of results on page 15: 10
Number of results on page 16: 10
Number of results on page 17: 10
Number of results on page 18: 10
Number of results on page 19: 10
Number of results on page 20: 10
Number of results on page 21: 10
Number of results on page 22: 10
Number of results on page 23: 10
Number of results on page 24: 10
Number of results on page 25: 10
Number of results on page 26: 10
Number of results on page 27: 10
Number of results on page 28: 3
Searching w/ query: site:www.democracynow.org climate change...
Number of results on pa

In [241]:
# Save nested dict
pickle.dump(URLS_PER_DOMAIN,open('google_search_res_climate_change.pkl','wb'))

In [242]:
URLS_PER_DOMAIN.keys()

dict_keys(['www.foxnews.com', 'www.breitbart.com', 'www.theblaze.com', 'www.pjmedia.com', 'www.nationalreview.com', 'www.thenation.com', 'www.buzzfeednews.com', 'www.vox.com', 'www.washingtonpost.com', 'www.progressive.org', 'www.nytimes.com', 'www.motherjones.com', 'www.democracynow.org'])

### MediaCloud URLs

We use the MediaCloud Python client (https://github.com/mitmedialab/MediaCloud-API-Client) to fetch stories from a larger set of media outlets.

In [244]:
import datetime
import mediacloud.api
mc = mediacloud.api.MediaCloud('feb32a16d870132da7e7d93a0414d796fec95edd30a55d453075927d083a807b')

These are the fields that we're interested in getting:

In [245]:
mc_metadata = ['ap_syndicated','language','media_id','media_name','publish_date','title','guid','url','word_count']

Read in tab-separated file with outlet id and stance information.

In [282]:
mc_ids = pd.read_csv('mediacloud_ids.txt',sep='\t',header=None)
mc_ids.columns=['id','outlet_name','stance']
mc_ids.head()

Unnamed: 0,id,outlet_name,stance
0,1,new_york_times,pro
1,2,washington_post,pro
2,3,christian_science_monitor,between
3,4,usa_today,pro
4,1092,fox,anti


In [273]:
# Collect stories from each outlet
for curr_outlet_ix in mc_ids.index:
    curr_outlet_id = mc_ids.iloc[curr_outlet_ix]['id']
    curr_outlet_stance = mc_ids.iloc[curr_outlet_ix]['stance']
    fetch_size = 500
    stories = []
    last_processed_stories_id = 0
    for start_year in range(2000,2021,5): # Start collecting stories from Jan. 1, 2000 
        while len(stories) < 2000:
            fetched_stories = mc.storyList('(climate AND chang*) OR (global AND warming) OR (carbon AND dioxide) OR (co2) AND media_id:{}'.format(curr_outlet_id), 
                                           solr_filter=mc.publish_date_query(datetime.date(start_year,1,1), datetime.date(start_year+4,12,31)),
                                           last_processed_stories_id=last_processed_stories_id, rows= fetch_size)
            stories.extend(fetched_stories)
            if len( fetched_stories) < fetch_size:
                break
            last_processed_stories_id = stories[-1]['processed_stories_id']
    if len(stories) > 0:
        df = pd.DataFrame({key: [s[key] for s in stories] for key in mc_metadata})
        df['topic'] = ['cc']*len(df)
        df['stance'] = curr_outlet_stance
        df.sort_values(by='publish_date')

        OUTLET_NAME = df['media_name'].iloc[0].lower().replace(' ','_')
        df.to_pickle(os.path.join('mediacloud','{}_df.pkl'.format(OUTLET_NAME)))
        print('Done fetching stories from {} (outlet id = {}).'.format(OUTLET_NAME,curr_outlet_id))

Done fetching stories from new_york_times (outlet id = 1).
Done fetching stories from washington_post (outlet id = 2).
Done fetching stories from christian_science_monitor (outlet id = 3).


Now, merge into a single df; filter out stories not in English; clean titles.

In [274]:
dfs = []
for filename in os.listdir('mediacloud'):
    df = pd.read_pickle(os.path.join('mediacloud',filename))
    dfs.append(df)
df_all = pd.concat(dfs,ignore_index=True)

In [275]:
df_all = df_all[df_all.language == 'en']
df_all['clean_title'] = df_all.title.apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower()))

In [280]:
df_all

Unnamed: 0,ap_syndicated,language,media_id,media_name,publish_date,title,guid,url,word_count,topic,stance,clean_title
0,False,en,18468,activistpost.com,2011-05-28 17:32:00,Activist Post: David Cameron Says Non-Violent ...,http://www.activistpost.com/2014/09/david-came...,http://www.activistpost.com/2014/09/david-came...,,cc,anti,activist post david cameron says nonviolent co...
1,False,en,18468,activistpost.com,2012-09-29 05:00:00,Activist Post: Is the CDC's Mandated Vaccine ...,http://www.activistpost.com/2012/09/is-cdcs-ma...,http://www.activistpost.com/2012/09/is-cdcs-ma...,,cc,anti,activist post is the cdcs mandated vaccine sc...
2,False,en,18468,activistpost.com,2012-04-14 05:00:00,Activist Post: Haters of Humanity: The Church ...,http://www.activistpost.com/2012/04/haters-of-...,http://www.activistpost.com/2012/04/haters-of-...,,cc,anti,activist post haters of humanity the church of...
3,False,en,18468,activistpost.com,2011-04-17 05:00:00,Activist Post: Masters Of The World Meet To Pl...,http://www.activistpost.com/2011/04/masters-of...,http://www.activistpost.com/2011/04/masters-of...,,cc,anti,activist post masters of the world meet to pla...
4,False,en,18468,activistpost.com,2012-04-01 08:00:00,Is The CIA Manipulating the Weather?,https://www.activistpost.com/2016/07/is-the-ci...,https://www.activistpost.com/2016/07/is-the-ci...,,cc,anti,is the cia manipulating the weather
...,...,...,...,...,...,...,...,...,...,...,...,...
10442,False,en,2,Washington Post,2020-02-23 19:53:13,The anti-Greta: A conservative think tank take...,https://www.washingtonpost.com/climate-environ...,https://www.washingtonpost.com/climate-environ...,,cc,pro,the antigreta a conservative think tank takes ...
10443,False,en,2,Washington Post,2020-03-06 12:00:00,It was only a matter of time. Lab-created ‘mol...,https://www.washingtonpost.com/lifestyle/food/...,https://www.washingtonpost.com/lifestyle/food/...,,cc,pro,it was only a matter of time labcreated molecu...
10444,False,en,2,Washington Post,2020-03-06 03:27:58,Global crises have spurred declines in emissio...,https://www.washingtonpost.com/climate-environ...,https://www.washingtonpost.com/climate-environ...,,cc,pro,global crises have spurred declines in emissio...
10445,False,en,2,Washington Post,2020-03-09 07:56:07,The Energy 202: Three charts that explain what...,https://www.washingtonpost.com/politics/the-en...,https://www.washingtonpost.com/politics/the-en...,,cc,pro,the energy 202 three charts that explain what ...


In [283]:
df_all[df_all.stance=='anti'].shape

(3407, 12)

In [284]:
df_all[df_all.stance=='pro'].shape

(6428, 12)

In [285]:
df_all.to_pickle('mediacloud_df.pkl')

### Reid URLs

In [287]:
reid_urls = {}

In [286]:
load_dir = '/Users/yiweiluo/Dropbox/research/QP2/reid_urls/'

In [289]:
for filename in os.listdir(load_dir):
    domain = '_'.join(filename.split('_')[:-1])
    urls = pd.read_csv(load_dir+filename,header=None)
    reid_urls[domain] = urls

In [290]:
reid_urls.keys()

dict_keys(['buzzfeed_news', 'democracy_now', 'the_nation', 'the_progressive', 'vox', 'washington_post'])

In [301]:
pickle.dump(reid_urls,open('reid_urls.pkl','wb'))

## Vaccines

This section will create a file ```temp_vax_blog_urls.pkl``` containing urls from each blog.

In [599]:
advocacy_blogs = ['https://www.voicesforvaccines.org/blog/',
                 'https://adultvaccinesnow.org/blog/',
                 'https://shotofprevention.com/',
                 'https://immunizationevidence.org/featured_issues/',
                 'https://www.nfid.org/blog/',
                 'https://vaxopedia.org/category/blog/',
                 'https://www.familiesfightingflu.org/insights-on-influenza/'
                 ]

Create datastructure to store post urls from each blog:

In [619]:
#blog_dict = defaultdict(list)

### Anti-vaccine blogs

Get URLs from each blog.

In [621]:
# Children's Health Defense
root_url = 'https://childrenshealthdefense.org/kennedy-news-views/page/'
for p_no in range(1,17):
    url = root_url + str(p_no)
    soup = soupify(url)
    articles = soup.find_all('section',attrs={"class":'knv-section'})
    urls_and_meta = []
    for art in articles:
        art_meta = art.find('figcaption').find('h4').find('a')
        art_date = art.find('figcaption').find('span').text.strip()
        art_title = art_meta.text.strip()
        art_url = art_meta['href']
        urls_and_meta.append((art_url,art_title,art_date))
    blog_dict['CHD'].extend(urls_and_meta)

In [660]:
# Vaccine Safety Commission
root_url = 'https://vaccinesafetycommission.org/studies.html'
soup = soupify(root_url)
panel_bodies = soup.find_all('div',attrs={'class':'panel-body'})
print(len(panel_bodies))
text = ""
for pb in panel_bodies:
    text += pb.text.strip()
blog_dict['vax_safety_commission'].append((root_url,'50 Studies the AAP Avoided to Mention',None))

17


### Vaccine-advocacy blogs

Get URLs from each blog.

In [620]:
# Voices for Vaccines
for page_no in range(1,21):
    url = 'https://www.voicesforvaccines.org/blog/page/{}/'.format(page_no)
    soup = soupify(url)
    posts = soup.find('div',attrs={'class':'collection posts view-as-grid two-thirds'})
    ul = posts.find_all('ul',recursive=False)
    assert len(ul) == 1
    lis = ul[0].find_all('article')
    articles = [x.find('h3').find('a') for x in lis]
    urls_and_meta = [(x['href'],x['title']) for x in articles]
    blog_dict['https://www.voicesforvaccines.org/blog/'].extend(urls_and_meta)
    
# Adult Vaccine Access Coalition
for page_no in range(1,5):
    url = 'https://adultvaccinesnow.org/blog/page/{}/'.format(page_no)
    soup = soupify(url)
    main = soup.find('div',attrs={'class':'x-main full'})
    arts = main.find_all('article')
    art_objs = [a.find('h2',attrs={'class':True}).find('a') for a in arts]
    urls_and_titles = [(x['href'],x.text) for x in art_objs]
    dates = [parser.parse(a.find('header',attrs={'class':'entry-header'}).find('span').text)
         for a in arts]
    urls_and_meta = [(x[0],x[1],dates[ix]) for ix,x in enumerate(urls_and_titles)]
    blog_dict['https://adultvaccinesnow.org/blog/'].extend(urls_and_meta)
    
# Shot of Prevention
n_per_cat = {'science-research':22, 
            'testimonials-personal-stories':19,
            'questions':12,
            'policy-advocacy':27,
            'expert-insights-and-commentary':32,
            'news-outbreaks':27}

for category in ['news-outbreaks','science-research','expert-insights-and-commentary',
                'policy-advocacy','questions','testimonials-personal-stories']:
    for page_no in range(1,n_per_cat[category]+1):
        url = 'https://shotofprevention.com/category/{}/page/{}/'.format(category,page_no)
        soup = soupify(url)
        container = soup.find('div',attrs={'class':'category-container'})
        art_objs = container.find_all('div',attrs={'class':'single-box'})
        articles = [a.find('h3') for a in art_objs]
        urls_and_meta = [(x.find('a')['href'],x.text) for x in articles]
        blog_dict['https://shotofprevention.com/'].extend(urls_and_meta)
        
# VoICE https://immunizationevidence.org
for page_no in range(1,3):
    url = 'https://immunizationevidence.org/featured_issues/page/{}/'.format(page_no)
    soup = soupify(url)
    issues = soup.find_all('div',attrs={'class','featuredIssueHighlights'})
    titles = [x.find('h3',attrs={'class':'featuredTitle'}).text for x in issues]
    urls = [x.find('a',attrs={'class':'blueButton'})['href'] for x in issues]
    urls_and_meta = list(zip(urls,titles))
    blog_dict['https://immunizationevidence.org/featured_issues/'].extend(urls_and_meta)
    
# National Foundation for Infectious Diseases https://www.nfid.org/blog/
for page_no in range(1,3):
    url = 'https://www.nfid.org/blog/page/{}/'.format(page_no)
    soup = soupify(url)

In [661]:
pickle.dump(blog_dict,open('temp_vax_blog_urls.pkl','wb'))

In [662]:
test = pickle.load(open('temp_vax_blog_urls.pkl','rb'))
for k in test:
    print(k,len(test[k]))

https://www.voicesforvaccines.org/blog/ 193
https://adultvaccinesnow.org/blog/ 40
https://shotofprevention.com/ 1365
https://immunizationevidence.org/featured_issues/ 13
CHD 469
vax_safety_commission 1


# Post-processing

In [954]:
google_cc_urls = pickle.load(open('google_search_res_climate_change.pkl','rb')) # domain, keyword, title, url, date
mediacloud_cc_urls = pd.read_pickle('mediacloud_df.pkl') # ap_syndicated, domain, title, url, date
reid_cc_urls = pd.read_pickle('reid_urls.pkl') # url only
vax_urls = pickle.load(open('temp_vax_blog_urls.pkl','rb'))

In [955]:
def get_google_res_stance(x):
    if 'foxnews.com' in x:
        return 'anti'
    elif 'breitbart.com' in x:
        return 'anti'
    elif 'blaze.com' in x:
        return 'anti'
    elif 'pjmedia.com' in x:
        return 'anti'
    elif 'nationalreview.com' in x:
        return 'anti'
    else:
        return 'pro'

## Filter out non-(relevant)-article URLs

The following are NYT tags that we deem indicate that an article is irrelevant.

In [1257]:
NYT_SECTIONS_TO_REMOVE = set(['/automobiles/','/autoreviews/','/autoshow/','/business/','/campaign-stops/',
                          '/crosswords/',
               '/booming/','/giving/','/gmcvb/','/jobs/','/lens/','/letters/','/newyorktoday/',
               '/nutrition/','/sept-11-reckoning/','/smallbusiness/',
               '/sunday-review/','/garden/','/arts/','/theater/','/sports/','/dining/','/books/','/weekinreview/','/your-money/',
                         '/movies/','/fashion/','/technology/','/pageoneplus/','/travel/','/nytnow/',
                         '/public-editor/','/education/','/learning/','/podcasts/','/style/','/t-magazine/',
                         '/reader-center/','/awardsseason/','/briefing/','/dealbook/','/es/',
                          '/greathomesanddestinations/','/interactive/','/media/',
                         '/mutfund/','/obituaries/','/personaltech/','/realestate/',
                          '/smarter-living/','/todayspaper/','/your-money/','/yourtaxes/',
                             '/slideshow/','/interactive/'])

 The following are URL tags that indicate a given URL is not truly a text article.

In [1194]:
BLACKLIST_URL_STRS = set(['/tag/','/author/','/clips/','/podcasts/','/subject/','/authors/',
                         '/category/','/person/','/category/','/shows/','/video/','/topic/',
                         '/es/','/topics/','/de/','/tags/','/slideshow/',
                         '/interactive/','/transcripts/'])

In [1201]:
BLACKLIST_URL_INIT_STRS = set(['rss.','feeds.','rssfeeds.'])

In [1258]:
def is_rss(url):
    for xx in BLACKLIST_URL_INIT_STRS:
        if url[:len(xx)] == xx:
            return True
    return False

In [1260]:
def is_blacklist(url):
    for xx in BLACKLIST_URL_STRS:
        if xx in url:
            return True
    return False

In [1273]:
not is_blacklist(combined_df.url.loc[4539]) and \
not is_rss(combined_df.url.loc[4539])

True

Create a dataframe combining all data structures with urls, that filters according to above criteria.

In [1289]:
# url, title, date, domain, is_AP
filtered_urls = []
filtered_titles = []
filtered_dates = []
filtered_domains = []
filtered_stances = []
filtered_topics = []
filtered_is_AP = []

In [1290]:
for key in google_cc_urls:
    for keyword in google_cc_urls[key]:
        for item in google_cc_urls[key][keyword]:
            url = item[1]
            if not is_rss(url) and not is_blacklist(url):
                title = item[0]
                date = item[2] if len(item) > 2 else None
                stance = get_google_res_stance(url)
                topic = 'cc'
                is_AP = None

                if ' | ' not in title:
                    filtered_urls.append(url)
                    filtered_titles.append(title)
                    filtered_dates.append(date)
                    filtered_domains.append(key)
                    filtered_stances.append(stance)
                    filtered_topics.append(topic)
                    filtered_is_AP.append(is_AP)

In [1291]:
for ix in mediacloud_cc_urls.index:
    row = mediacloud_cc_urls.loc[ix]
    url = row['url'] if 'http' in row['url'] else row['guid']
    if not is_rss(url) and not is_blacklist(url):
        title = row['clean_title']
        date = row['publish_date']
        domain = row['media_name']
        stance = row['stance']
        topic = row['topic']
        is_AP = row['ap_syndicated']

        if ' | ' not in title:
            filtered_urls.append(url)
            filtered_titles.append(title)
            filtered_dates.append(date)
            filtered_domains.append(domain)
            filtered_stances.append(stance)
            filtered_topics.append(topic)
            filtered_is_AP.append(is_AP)

In [1292]:
for key in reid_urls:
    for url in reid_urls[key][0].values:
        if not is_rss(url) and not is_blacklist(url):
            filtered_urls.append(url)
            filtered_titles.append(None)
            filtered_dates.append(None)
            filtered_domains.append(key)
            filtered_stances.append('pro')
            filtered_topics.append('cc')
            filtered_is_AP.append(None)

In [1293]:
for key in vax_urls:
    for item in vax_urls[key]:
        url = item[0]
        if not is_blacklist(url) and not is_rss(url):
            title = item[1]
            item[2] if len(item) > 2 else None
            stance = 'anti' if key == 'CHD' or key == 'vax_safety_commission' else 'pro'
            is_AP = False
        
            filtered_urls.append(url)
            filtered_titles.append(title)
            filtered_dates.append(date)
            filtered_domains.append(key)
            filtered_stances.append(stance)
            filtered_topics.append('vax')
            filtered_is_AP.append(is_AP)

In [1294]:
combined_df = pd.DataFrame({'url':filtered_urls,
                              'title':filtered_titles,
                              'date':filtered_dates,
                              'domain':filtered_domains,
                              'stance':filtered_stances,
                              'topic':filtered_topics,
                              'is_AP':filtered_is_AP})

In [1295]:
combined_df.shape

(27706, 7)

In [1296]:
combined_df.head()

Unnamed: 0,url,title,date,domain,stance,topic,is_AP
0,https://www.foxnews.com/science/todays-climate...,Today's Climate Change Is Worse Than Anything ...,"Jul 25, 2019",www.foxnews.com,anti,cc,
1,https://www.foxnews.com/science/climate-change...,Climate change could destroy half of Earth's a...,"Feb 13, 2020",www.foxnews.com,anti,cc,
2,https://www.foxnews.com/media/david-webb-clima...,David Webb: 'Climate change is the religion of...,"Sep 24, 2019",www.foxnews.com,anti,cc,
3,https://www.foxnews.com/science/half-worlds-be...,Half of world's beaches will disappear by 2100...,"Mar 2, 2020",www.foxnews.com,anti,cc,
4,https://www.foxnews.com/media/mattis-climate-s...,Mattis turns up heat on climate change deniers...,"Sep 5, 2019",www.foxnews.com,anti,cc,


## Check consistency of coding

We apply a standardization function on the ``domain`` field:

In [1297]:
def standardize_domain(x):
    if x == 'Guardian US':
        return 'guardian_us'
    elif 'washingtonpost.com' in x:
        return 'wapo'
    elif 'vox.com' in x:
        return 'vox'
    elif 'breitbart.com' in x:
        return 'breitbart'
    elif 'nytimes.com' in x:
        return 'nyt'
    elif 'motherjones.com' in x:
        return 'mj'
    elif x == 'democracy_now':
        return 'dem_now'
    elif 'foxnews.com' in x:
        return 'fox'
    elif 'buzzfeednews.com' in x:
        return 'buzzfeed'
    elif x == 'Daily Caller':
        return 'daily_caller'
    elif x == 'Washington Post':
        return 'wapo'
    elif 'theblaze.com' in x:
        return 'blaze'
    elif 'democracynow.org' in x:
        return 'dem_now'
    elif x == 'Grist':
        return 'grist'
    elif x == 'New York Times':
        return 'nyt'
    elif 'nationalreview.com' in x:
        return 'nat_review'
    elif 'thenation.com' in x:
        return 'nation'
    elif x == 'Breitbart':
        return 'breitbart'
    elif x == 'Christian Science Monitor':
        return 'cs_monitor'
    elif x == 'buzzfeed_news':
        return 'buzzfeed'
    elif x == 'washington_post':
        return 'wapo'
    elif x == 'FOX News':
        return 'fox'
    elif x == 'USA Today':
        return 'usa_today'
    elif x == 'Mother Jones':
        return 'mj'
    elif x == 'NBC News':
        return 'nbc'
    elif x == 'Democracy Now!':
        return 'dem_now'
    elif x == 'National Review':
        return 'nat_review'
    elif x == 'CNS News':
        return 'cns'
    elif x == 'Buzzfeed':
        return 'buzzfeed'
    elif x == 'The Nation':
        return 'nation'
    elif 'pjmedia.com' in x:
        return 'pj'
    elif 'pajamas_media' in x:
        return 'pj'
    else:
        return x.lower().strip().replace(' ','_').replace('.com','')

In [1298]:
combined_df['domain'] = combined_df.domain.apply(standardize_domain)

In [1299]:
combined_df.domain.value_counts()

wapo                     2763
dem_now                  2133
vox                      2122
breitbart                1918
nyt                      1912
                         ... 
bipartisanreport            4
charismanews                3
cbn                         3
conservative_review         2
vax_safety_commission       1
Name: domain, Length: 66, dtype: int64

And we strip extra whitespace around titles:

In [1300]:
combined_df.title = combined_df.title.apply(lambda x: x.strip() if x 
                                           is not None else x)
combined_df.to_pickle('temp_combined_df.pkl')

## Dedup 

First, we remove initial 'http(s):' from urls.

In [1301]:
def strip_url(x):
    return x.split('http://')[-1].split('https://')[-1]

In [1302]:
combined_df.url = combined_df.url.apply(strip_url)

In [1303]:
combined_df.index

RangeIndex(start=0, stop=27706, step=1)

In [1304]:
combined_df.shape

(27706, 7)

Second, we sort ```combined_df``` by ```title``` and ```date``` so that when we drop duplicate URLs, we keep the one that doesn't have a null value for these fields.

In [1305]:
combined_df = combined_df.sort_values(['title','date'],axis=0)#,ignore_index=True)
combined_df.shape

(27706, 7)

In [1306]:
combined_df = combined_df.drop_duplicates(subset='url',keep='first')#,ignore_index=True)
combined_df.shape

(22077, 7)

# Scraping fulltext and missing meta info

We use newspaper3k (https://newspaper.readthedocs.io/en/latest/) to scrape article information including fulltext and titles.

In [1085]:
from newspaper import Article
from newspaper import ArticleException

Wrapper functions for using newspaper3k:

In [1131]:
from urllib.error import HTTPError

In [1313]:
def newspaper_parse(url):
    if url[:8] != 'https://':
        url = 'https://'+url
        
    try:
        article = Article(url)
        article.download()
        article.parse()
        return (article.title,
                article.text.replace('\n',' '))
    except ArticleException:
        return (None,None)

In [1180]:
from nltk.tokenize import sent_tokenize

def get_fulltext(url,domain):
    stop_ix,title,text = None,None,None
    if domain == 'alternet':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'american_conservative':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'https://adultvaccinesnow.org/blog/':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'activistpost':
        title,text = newspaper_parse(url)
        stop_ix = -4
    elif domain == 'american_thinker':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'blaze':
        title,text = newspaper_parse(url)
    elif domain == 'boston_globe':
        title,text = newspaper_parse(url)
    elif domain == 'breitbart':
        title,text = newspaper_parse(url)
    elif domain == 'buzzfeed':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'cbn':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'charismanews':
        title,text = newspaper_parse(url)
        stop_ix = -14
    elif domain == 'chd':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'cns':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'commdiginews':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'conservative_review':
        title,text = newspaper_parse(url)
    elif domain == 'conservative_treehouse':
        title,text = newspaper_parse(url)
    elif domain == 'conservativedailynews':
        title,text = newspaper_parse(url)
        stop_ix = -4
    elif domain == 'conservativefiringline':
        title,text = newspaper_parse(url)
        stop_ix = -10
    elif domain == 'cs_monitor':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'daily_caller':
        title,text = newspaper_parse(url)
        stop_ix = -3
    elif domain == 'daily_dot':
        title,text = newspaper_parse(url)
    elif domain == 'dem_now':
        soup = soupify(url)
        try:
            ps = soup.find('div',attrs={'itemprop':'articleBody'}).find_all('p')
        except AttributeError:
            ps = soup.find('div',attrs={'class':'story_summary'}).find_all('p')
        text = ' '.join([p.text.replace('\n', ' ') for p in ps])
    elif domain == 'drudgereport':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'fox':
        title,text = newspaper_parse(url)
    elif domain == 'gateway_pundit':
        title,text = newspaper_parse(url)
    elif domain == 'gawker':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'grabien':
        title,text = newspaper_parse(url)
    elif domain == 'grist':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'guardian_us':
        title,text = newspaper_parse(url)
    elif domain == 'hot_air':
        title,text = newspaper_parse(url)
    elif domain == 'https://adultvaccinesnow.org/blog/':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'https://immunizationevidence.org/featured_issues/':
        title,text = newspaper_parse(url)
    elif domain == 'https://shotofprevention/':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'https://www.voicesforvaccines.org/blog/':
        title,text = newspaper_parse(url)
    elif domain == 'independentsentinel':
        title,text = newspaper_parse(url)
    elif domain == 'infowars':
        title,text = newspaper_parse(url)
    elif domain == 'inthesetimes':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'libertyunyielding':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'mj':
        title,text = newspaper_parse(url)
    elif domain == 'nat_review':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'nation':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'nbc':
        soup = soupify(url)
        ps = soup.find('div',attrs={'class':'article-body__content'}).\
        find_all('p',attrs={'class':'endmarkEnabled'})
        text = ' '.join([p.text.replace('\n',' ') for p in ps])
    elif domain == 'new_york_magazine':
        title,text = newspaper_parse(url)
    elif domain == 'newsweek':
        soup = soupify(url)
        ps = soup.find('div',attrs={'class':'article-content'}).find_all('p')
        text = ' '.join([p.text.replace('\n', ' ') for p in ps])
    elif domain == 'newswithviews':
        title,text = newspaper_parse(url)
        stop_ix = -10
    elif domain == 'nyt':
        try:
            soup = soupify(url)
            if soup is not None:
                ps = soup.find('section',attrs={'itemprop':'articleBody'}).find_all('p')#,recursive=False)
                text = ' '.join([p.text.replace('\n', ' ') for p in ps])
                stop_ix = -5
        except HTTPError:
            pass
    elif domain == 'pajamas_media':
        title,text = newspaper_parse(url)
    elif domain == 'pj':
        title,text = newspaper_parse(url)
    elif domain == 'progressivestoday':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'quartz':
        title,text = newspaper_parse(url)
    elif domain == 'rare.us':
        title,text = newspaper_parse(url)
    elif domain == 'reason':
        title,text = newspaper_parse(url)
        stop_ix = -3
    elif domain == 'redstate':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'sgtreport':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'shoebat':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'sonsoflibertymedia':
        title,text = newspaper_parse(url)
        stop_ix = -1
    elif domain == 'the_american_conservative':
        title,text = newspaper_parse(url)
    elif domain == 'the_american_spectator':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'the_nation':
        title,text = newspaper_parse(url)
    elif domain == 'the_progressive':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'the_verge':
        title,text = newspaper_parse(url)
        stop_ix = 0
    elif domain == 'the_week':
        title,text = newspaper_parse(url)
    elif domain == 'usa_today':
        title,text = newspaper_parse(url)
    elif domain == 'vax_safety_commission':
        root_url = 'https://vaccinesafetycommission.org/studies.html'
        soup = soupify(root_url)
        panel_bodies = soup.find_all('div',attrs={'class':'panel-body'})
        #print(len(panel_bodies))
        text = ""
        for pb in panel_bodies:
            text += pb.text.strip()
    elif domain == 'vice':
        title,text = newspaper_parse(url)
    elif domain == 'vox':
        title,text = newspaper_parse(url)
    elif domain == 'wapo':
        title,text = newspaper_parse(url)
    else:
        print('Unknown domain!')
    
    if text is not None and len(text) > 0:
        text = text.strip()
        sent_tokens = sent_tokenize(text)
    
        # Remove final 2 sentences (usually about social media)
        sent_tokens = sent_tokens[:stop_ix] if stop_ix is not None else sent_tokens[:-2]
        text = ' '.join(sent_tokens)
    
    return (title,text)

Now, we go through every article and scrape its fulltext. If its title is null, or shorter than the title that Newspaper finds, we replace that field with the Newspaper title.<br>
We define a special separator token, ```SEP_TOK```, to use to replace the "/" character in the urls to identify each article uniquely, and add this unique key to ```combined_df```. We then save the fulltext in a directory called "fulltexts" which has a .txt file named with each unique key.

In [1308]:
SEP_TOK = '[SEP]'

In [1310]:
#urls_needed = []
#url_unique_keys = {}

In [None]:
for n,ix in enumerate(combined_df.index):
    row = combined_df.loc[ix]
    url = row['url']
    domain = row['domain']
    title = row['title']
    try:
        newspaper_title,ft = get_fulltext(url,domain)
    
        # Replace title w/ newspaper title if it's longer 
        if newspaper_title is not None and \
        title is not None and \
        len(newspaper_title) > len(title):
            title = newspaper_title
        # Replace title w/ newspaper title if the former is null but
        # not the latter
        elif newspaper_title is not None and title is None:
            title = newspaper_title
        else:
            pass

        if ft is not None:
            save_url = SEP_TOK.join(url.split('/'))
            try:
                with open('./fulltexts/{}.txt'.format(save_url),'w') as f:
                    f.write(ft)
                url_unique_keys[url] = save_url
            except OSError:
                with open('./fulltexts/{}.txt'.format(save_url[:90]),'w') as f:
                    f.write(ft)
                url_unique_keys[url] = save_url[:90]
        else:
            urls_needed.append(ix)
    except AttributeError:
        urls_needed.append(ix)
        
    if n % 100 == 0:
        print(n)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100


In [1249]:
pickle.dump(urls_needed,open('fulltext_needed_urls.pkl','wb'))
#urls_needed = pickle.load(open('./temp/fulltext_needed_urls.pkl','rb'))
print(len(urls_needed))

2786


In [1087]:
combined_df.shape

(22164, 7)

In [1088]:
combined_df.topic.value_counts()

cc     20729
vax     1435
Name: topic, dtype: int64

In [1089]:
combined_df.loc[combined_df.topic=='cc'].stance.value_counts()

pro        14061
anti        6200
between      468
Name: stance, dtype: int64

In [1090]:
combined_df.loc[combined_df.topic=='vax'].stance.value_counts()

pro     965
anti    470
Name: stance, dtype: int64

In [1123]:
# Try to get fulltext of urls missing fulltext, 
#remove urls from combined_df otherwise

In [1236]:
for ix in urls_needed:
    assert combined_df.loc[ix].shape == (7,)

In [1250]:
dem_now_urls_needed = [x for x in urls_needed if 
                      combined_df.loc[x]['domain'] == 'dem_now']
combined_df.loc[dem_now_urls_needed]

Unnamed: 0,url,title,date,domain,stance,topic,is_AP
11979,www.democracynow.org/2011/11/4/headlines/imf_e...,16 Arrested at Occupy Wall Street Protest Outs...,"Nov 4, 2011",dem_now,pro,cc,
11527,www.democracynow.org/2017/11/14/1st_female_pre...,1st Female President of the Marshall Islands &...,"Nov 14, 2017",dem_now,pro,cc,
11563,www.democracynow.org/2019/10/8/extinction_rebe...,700+ Arrested As Extinction Rebellion Fights C...,"Oct 8, 2019",dem_now,pro,cc,
11448,www.democracynow.org/2012/11/1/a_crisis_foreto...,A Crisis Foretold: Studies Warned New York Inf...,"Nov 1, 2012",dem_now,pro,cc,
11432,www.democracynow.org/2018/9/14/a_debate_on_geo...,A Debate on Geoengineering: Should We Delibera...,"Sep 14, 2018",dem_now,pro,cc,
...,...,...,...,...,...,...,...
23998,www.democracynow.org/2007/9/14/climate_porn_si...,,,dem_now,pro,cc,
23999,www.democracynow.org/2007/8/16/the_11th_hour_h...,,,dem_now,pro,cc,
24007,www.democracynow.org/2005/9/1/katrinas_real_na...,,,dem_now,pro,cc,
24008,www.democracynow.org/2005/8/30/dozens_dead_as_...,,,dem_now,pro,cc,


In [1240]:
from collections import Counter
counted_fnames = Counter(os.listdir('fulltexts'))

In [1251]:
print(len(os.listdir('fulltexts')))
print(len(counted_fnames))

16648
16648


In [1196]:
for ix in urls_needed:
    if '/interactive/' in combined_df.loc[ix].url or \
    '/slides/' in combined_df.loc[ix].url or \
    '/transcripts/' in combined_df.loc[ix].url:
        urls_needed.remove(ix)

In [1232]:
#pickle.dump(urls_needed,open('./temp/fulltext_needed_urls_3.pkl','wb'))
urls_needed = pickle.load(open('./temp/fulltext_needed_urls_3.pkl','rb'))
print(len(urls_needed))

1763


In [None]:
# Also combine w/ my existing articles 
# 1(all_url_df--shoot for 10k on each side for cc)

# Dedup via title similarity

First, we need to get missing titles.

In [None]:
def get_title_date(soup,domain):
    if domain == 'buzzfeed':
        title = soup.find('h1').text.strip()
        try:
            date = parse(soup.find('time').text\
                         .split('Posted on ')[-1].split(' - ')[-1].split(', ')[0].strip())
        except AttributeError:
            try:
                date = parse(soup.find('time').text.strip())
            except AttributeError:
                date = parse(soup.find('div',attrs={'class':'news-article-header__timestamps'}).text\
            .split('Posted on ')[-1].split(' - ')[-1].split(', ')[0].strip())
    elif domain == 'dem_now':
        title = soup.find('h1').text.strip()
        date = parse(soup.find('span',attrs={'class':'date'}).text.strip())
    elif domain == 'the_nation':
        title = soup.find('h1',attrs={'class':'title'}).text.strip()
        date = parse(soup.find('h4').text.strip())
    elif domain == 'wapo':
        title = soup.find('h1').text.strip()
        date = parse(soup.find('div',attrs={'class':'display-date'}).text.strip())
    elif domain == 'vox':
        title = soup.find('h1').text.strip()
        try:
            date = soup.find('time')['datetime']
        except TypeError:
            date = None
    elif domain == 'the_progressive':
        title = soup.find('h1').text.strip()
        date = soup.find('time')['datetime']
    else:
        title,date = None,None
        
    return (title,date)

(Information on URLs, media domains, stance, and topic is all complete.)

In [1092]:
assert len(combined_df.loc[pd.isnull(combined_df.url)]) == 0
assert len(combined_df.loc[pd.isnull(combined_df.domain)]) == 0
assert len(combined_df.loc[pd.isnull(combined_df.stance)]) == 0
assert len(combined_df.loc[pd.isnull(combined_df.topic)]) == 0

In [None]:
missing_title_df = combined_df.loc[pd.isnull(combined_df.title)]
missing_title_df.domain.value_counts()

In [None]:
for n,ix in enumerate(missing_title_df.index):
    row = missing_title_df.loc[ix]
    url = row['url']
    domain = row['domain']
    soup = soupify(url)
    title,date = get_title_date(soup,domain)
    combined_df.loc[ix]['title'] = title
    combined_df.loc[ix]['date'] = date
    if n % 100 == 0:
        print(n)

In [None]:
print(combined_df.loc[pd.isnull(combined_df.title)].shape)
print(combined_df.loc[pd.isnull(combined_df.date)].shape)

We use the heuristic as described in Petersen et al. to deduplicate articles based on titles.

In [None]:
def d_l_dist(s1, s2):
    d = {}
    lenstr1 = len(s1)
    lenstr2 = len(s2)
    for i in range(-1,lenstr1+1):
        d[(i,-1)] = i+1
    for j in range(-1,lenstr2+1):
        d[(-1,j)] = j+1

    for i in range(lenstr1):
        for j in range(lenstr2):
            if s1[i] == s2[j]:
                cost = 0
            else:
                cost = 1
            d[(i,j)] = min(
                           d[(i-1,j)] + 1, # deletion
                           d[(i,j-1)] + 1, # insertion
                           d[(i-1,j-1)] + cost, # substitution
                          )
            if i and j and s1[i]==s2[j-1] and s1[i-1] == s2[j]:
                d[(i,j)] = min (d[(i,j)], d[i-2,j-2] + cost) # transposition

    return d[lenstr1-1,lenstr2-1]

def is_same(u1,u2):
    #Djk ≤ 0.2 × Min.[|Tj|,|Tk|]
    # determine Damerau-Levensthtein edit distance
    D_jk = d_l_dist(u1,u2)
    t_j = len(u1)
    t_k = len(u2)
    min_ = min(t_j,t_k)
    return D_jk < 0.2*min_ 

In [None]:
outlet_groups = combined_df.groupby('domain')
outlet_groups.first()

In [None]:
#to_remove = []

In [None]:
# actually, improve how I save indices to remove

In [None]:
for outlet in outlet_groups.first().index[4:]:
    outlet_df = outlet_groups.get_group(outlet)
    print('Processing {} with {} URLS'.format(outlet,len(outlet_df)))
    outlet_titles = outlet_df.title.values
    for ix1 in range(len(outlet_titles)-1):
        for ix2 in range(ix1+1,len(outlet_titles)):
            t1 = outlet_titles[ix1]
            t2 = outlet_titles[ix2]
            #print(t1,t2)
            if is_same(t1,t2):
                to_remove.append((outlet_df.index[outlet_df['title'] == t1],
                                 outlet_df.index[outlet_df['title'] == t2]))

In [None]:
pickle.dump(to_remove,open('dups_to_remove.pkl','wb'))

# Summary stats

In [None]:
# exclude articles w/ empty fulltext in final df