In [1]:
# Imports (third-party requirements: bs4, selectolax, dateutil, nltk)
import urllib
from bs4 import BeautifulSoup 
from selectolax.parser import HTMLParser

from urllib.request import urlparse, urljoin
import json 
from datetime import datetime
from dateutil.parser import parse
import nltk

In [2]:
class Crawler(object):
    """
    Depth-first crawler that stays on the seed page's site and only follows
    links whose anchor text or URL contains one of the configured keywords.

    Every accepted link is accumulated in `internal_urls` (see `get_links`)
    until `reset` is called. `external_urls` is initialised but never filled.
    """

    def __init__(self, keywords, max_urls=50):
        """
        params:
            keywords (iterable of str): a link is kept only if its text or URL
                contains at least one of these (case-sensitive substring match).
            max_urls (int): maximum number of pages to download per crawl, default is 50.
        """
        self.links = []
        self.keywords = keywords
        self.max_urls = max_urls
        self.reset()

    def reset(self):
        """Drops all collected links and the page counter so the crawler can be reused."""
        self.internal_urls = set()
        self.external_urls = set()
        self.total_urls_visited = 0

    def get_links(self):
        """Returns the set of accepted same-site links collected since the last `reset`."""
        return self.internal_urls

    def crawl(self, url):
        """
        Crawls a web page and recursively follows the accepted links found on it.
        You'll find all collected links in the `internal_urls` attribute (or via `get_links`).
        At most `max_urls` pages are downloaded between two calls to `reset`.
        params:
            url (str): page to start crawling from.
        """
        # Check the budget *before* downloading; checking only in the caller's
        # loop lets one page beyond `max_urls` slip through.
        if self.total_urls_visited >= self.max_urls:
            return

        self.total_urls_visited += 1
        print("crawling", self.total_urls_visited, "/", self.max_urls)

        page_links = self.get_all_website_links_selectolax(url)
        # `self.links` is overwritten by every recursive call, so iterate over
        # the local set; the attribute is only kept for backward compatibility.
        self.links = page_links

        for link in page_links:
            if self.total_urls_visited >= self.max_urls:
                break
            self.crawl(link)

    def is_valid(self, url):
        """Returns True if `url` has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    @staticmethod
    def _same_site(netloc, domain_name):
        """Returns True if `netloc` is `domain_name` itself or one of its subdomains."""
        netloc = netloc.lower()
        domain_name = domain_name.lower()
        return netloc == domain_name or netloc.endswith("." + domain_name)

    def get_all_website_links_selectolax(self, url):
        """
        Downloads `url` and returns the set of new, same-site, keyword-matching links on it.
        Every returned link is also added to `internal_urls`.
        Returns an empty set if the page cannot be downloaded or parsed.
        """
        urls = set()

        domain_name = urlparse(url).netloc

        try:
            # The context manager closes the connection even if parsing fails.
            with urllib.request.urlopen(url) as response:
                tree = HTMLParser(response.read())
        except Exception:
            # Best effort: an unreachable page simply contributes no links.
            # `Exception` (not a bare except) so Ctrl-C still stops the crawl.
            return urls

        for a_tag in tree.css("a"):
            href = a_tag.attributes.get("href")
            if not href:
                # missing or empty href
                continue

            try:
                parsed_href = urlparse(urljoin(url, href))
            except ValueError:
                # malformed href (e.g. a broken IPv6 literal)
                continue
            # Rebuild the URL without query string and fragment.
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path

            if not self.is_valid(href):
                # not a valid URL
                continue
            if href in self.internal_urls:
                # already in the set
                continue
            if not self._same_site(parsed_href.netloc, domain_name):
                # external link: compare hosts rather than searching the whole
                # URL, which would also match the domain inside a path
                continue

            a_tag_text = a_tag.text(deep=True, separator='', strip=False)
            if not any(word in a_tag_text or word in href for word in self.keywords):
                # neither the link text nor the link URL contains a keyword
                continue

            urls.add(href)
            self.internal_urls.add(href)
        return urls

In [3]:
class NewsSource(object):
    """
    Base class for one news outlet.

    Constructing an instance immediately crawls the outlet starting at
    `seed_url` (network I/O) and stores the filtered links. Subclasses
    implement `scrape` to turn those links into `self.output`, and may
    override `filter_links` to discard non-article URLs.
    """

    def __init__(self, journal, seed_url, crawler):
        """
        params:
            journal (str): display name of the outlet, stored on every instance.
            seed_url (str): page the crawl starts from.
            crawler: object providing `reset()`, `crawl(url)` and `get_links()`;
                it is reset before use, so one crawler can serve several sources.
        """
        self.journal = journal
        # Stays None until a subclass' scrape() has produced results.
        self.output = None

        print('Crawling for sources using seed...')
        print(seed_url)
        crawler.reset()
        crawler.crawl(seed_url)
        print('Crawling done')

        print('Filtering links...')
        self.links = self.filter_links(crawler.get_links())
        print('Done.')

    def get_output(self):
        """Returns the scraped result, or None if `scrape` has not produced one yet."""
        return self.output

    def get_links(self):
        """Returns the filtered links found by the crawl."""
        return self.links

    def export(self, filename):
        """
        Writes the scraped output as JSON to `<filename>.json`.
        Prints a hint and writes nothing if there is no output yet.
        """
        # Identity check: `== None` would defer to a custom __eq__ on the output.
        if self.output is None:
            print('Run scrape first!')
            return

        with open(filename + '.json', 'w', encoding='utf-8') as outfile:
            json.dump(self.output, outfile)

    def filter_links(self, links):
        """Hook for subclasses to drop unwanted links; the base class keeps them all."""
        return links

    def scrape(self):
        """Announces the scrape; subclasses extend this to fill `self.output`."""
        print('Scraping', len(self.links), 'sources on', self.journal)

In [4]:
class WashingtonPost(NewsSource):
    """
    Scraper for Washington Post articles.

    Besides the crawled article links it also scrapes one fixed live-update
    page, which is a trail of small posts with its own markup.
    """

    # The live-update page is not discovered by crawling; it is scraped explicitly.
    LIVE_UPDATE_URL = "https://www.washingtonpost.com/world/asia_pacific/coronavirus-china-live-updates/2020/02/21/81d2aa50-543e-11ea-b119-4faabac6674f_story.html"

    def __init__(self, seed_url, crawler):
        super().__init__("Washington Post", seed_url, crawler)

    def scrape(self):
        """Scrapes all crawled articles plus the live-update page into `self.output`."""
        super().scrape()
        articles = self.__wp_general_article()
        live_data = self.__wp_live_update()
        self.output = articles + live_data

    @staticmethod
    def _fetch_soup(url):
        """Downloads `url` and returns it parsed, or None if the download fails."""
        # urllib is used (as in the other sources) because `requests` is never
        # imported in this notebook.
        try:
            with urllib.request.urlopen(url) as response:
                return BeautifulSoup(response.read(), 'html.parser')
        except OSError as err:
            # URLError/HTTPError and socket timeouts are all OSError subclasses.
            print('Skipping unreachable page:', url, err)
            return None

    @staticmethod
    def _section_of(node, css_class):
        """Returns the first <section> inside the first div with `css_class`, or None."""
        container = node.find('div', attrs={'class': css_class})
        if container is None:
            return None
        return container.find('section')

    @staticmethod
    def _paragraphs(section):
        """Returns the text of the first <p> of every div found under `section`."""
        texts = []
        for div in section.find_all('div'):
            paragraph = div.find('p')
            if paragraph is not None:
                texts.append(paragraph.text)
        return texts

    #WP's live-update article seems structured differently since it is a trail of small posts.
    def __wp_live_update(self):
        """Returns one story per post of the live-update page; [] if it cannot be read."""
        URL = self.LIVE_UPDATE_URL
        soup = self._fetch_soup(URL)
        if soup is None:
            return []

        previous_articles = self._section_of(soup, 'remainder-content')
        if previous_articles is None:
            print('Skipping live updates, unexpected page layout:', URL)
            return []

        articles = []

        #The inline-story tag is "unique" to the live-update article
        for event in previous_articles.find_all('div', attrs={'data-qa': 'inline-story'}):
            date = event.find('div', attrs={'class': 'display-date'})
            headline = event.find('h2', attrs={'data-qa': 'headline'})
            body = event.find('div', attrs={'data-qa': 'article-body'})
            section = body.find('section') if body is not None else None
            if date is None or headline is None or section is None:
                # incomplete post; skip it instead of failing the whole scrape
                continue

            story = {}
            story['time-stamp'] = date.text
            story['headline'] = headline.text
            story['content'] = self._paragraphs(section)
            story['url'] = URL
            story['journal'] = self.journal

            articles.append(story)

        return articles

    def __wp_general_article(self):
        """Returns one story per crawled link whose page has the expected article layout."""
        articles = []
        for URL in self.links:
            soup = self._fetch_soup(URL)
            if soup is None:
                continue

            article_header = soup.find('header', attrs={'data-qa': 'main-full'})
            headline = None
            if article_header is not None:
                headline = article_header.find('h1', attrs={'data-qa': 'headline'})
            teaser = self._section_of(soup, 'teaser-content')
            body = self._section_of(soup, 'remainder-content')
            date = soup.find('div', attrs={'class': 'display-date'})

            if any(part is None for part in (headline, teaser, body, date)):
                print('Skipping invalid article:', URL)
                continue

            story = {}
            story['content'] = self._paragraphs(teaser) + self._paragraphs(body)
            story['headline'] = headline.text
            story['time-stamp'] = date.text
            story['url'] = URL
            story['journal'] = self.journal

            articles.append(story)

        return articles

In [5]:
class SueddeutscheZeitung(NewsSource):
    """Scraper for Sueddeutsche Zeitung article pages."""

    def __init__(self, seed_url, crawler):
        super().__init__('Sueddeutsche Zeitung', seed_url, crawler)

    def filter_links(self, links):
        """Drops every link containing '/thema/'."""
        return {link for link in links if '/thema/' not in link}

    def scrape(self):
        """
        Downloads every link and stores one story dict per usable article in `self.output`.
        Pages that cannot be downloaded or lack the expected markup are skipped
        with a message instead of aborting the whole run.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            try:
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as err:
                # URLError/HTTPError and socket timeouts are all OSError subclasses.
                print(i+1,'/',total,'Skipping unreachable article:', URL, err)
                continue

            article_header = sll.css_first('.sz-article__header')
            article_intro = sll.css_first('.sz-article__intro')
            main_article = sll.css_first('.sz-article__body')

            if article_header is None or main_article is None:
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            time_tag = article_header.css_first('time')
            timestamp = time_tag.attributes.get('datetime') if time_tag is not None else None
            try:
                published = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
            except (TypeError, ValueError):
                # missing <time>/datetime attribute (None) or unexpected format
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            print(i+1,'/',total,URL)

            # The headline is split over several <span>s inside the header's <h2>.
            title_tag = article_header.css_first('h2')
            title_spans = title_tag.css('span') if title_tag is not None else []

            story = {}
            story['content'] = []
            story['headline'] = ' '.join(
                span.text(deep=True, separator='') for span in title_spans
            ).strip()
            story['time-stamp'] = published.strftime("%m/%d/%Y, %H:%M:%S")
            story['url'] = URL
            story['journal'] = self.journal

            # The intro block is optional; its paragraphs precede the body.
            if article_intro is not None:
                for paragraph in article_intro.css('p'):
                    story['content'].append(paragraph.text(deep=True, separator=''))

            for paragraph in main_article.css('p'):
                story['content'].append(paragraph.text(deep=True, separator=''))

            articles.append(story)

        self.output = articles
        
        

In [6]:
class BBC(NewsSource):
    """Scraper for BBC News article pages."""

    def __init__(self, seed_url, crawler):
        super().__init__('BBC', seed_url, crawler)

    def filter_links(self, links):
        """Drops every link containing '/topics/' or '/av/'."""
        return {link for link in links
                if '/topics/' not in link and '/av/' not in link}

    @staticmethod
    def _extract_timestamp(sll):
        """
        Returns the publication time as a datetime, or None if it cannot be determined.
        Prefers the `.date` element's `data-seconds` attribute and otherwise reads
        the `rnews:datePublished` meta tag.
        """
        try:
            date_line = sll.css_first('.date')
            if date_line is not None:
                # NOTE(review): fromtimestamp() yields local time — confirm that is intended.
                return datetime.fromtimestamp(int(date_line.attributes['data-seconds']))
            meta = sll.css_first('meta[property="rnews:datePublished"]')
            if meta is None:
                return None
            return datetime.strptime(meta.attributes['content'], '%Y/%m/%d %H:%M:%S')
        except (KeyError, TypeError, ValueError):
            # attribute missing/empty or in an unexpected format
            return None

    def scrape(self):
        """
        Downloads every link and stores one story dict per usable article in `self.output`.
        Pages that cannot be downloaded or lack the expected markup are skipped
        with a message instead of aborting the whole run.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            try:
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as err:
                # URLError/HTTPError and socket timeouts are all OSError subclasses.
                print(i+1,'/',total,'Skipping unreachable article:', URL, err)
                continue

            # Several selectors per element, presumably for different page templates.
            main_article = sll.css_first('.story-body__inner,.story-body')
            headline = sll.css_first('.story-body__h1,.story-headline,.unit__title')
            timestamp = self._extract_timestamp(sll)

            if main_article is None or headline is None or timestamp is None:
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            print(i+1,'/',total,URL)

            story = {}
            story['content'] = []
            story['headline'] = headline.text(deep=True, separator='')
            story['time-stamp'] = timestamp.strftime("%m/%d/%Y, %H:%M:%S")
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article.css('p'):
                story['content'].append(paragraph.text(deep=True, separator=''))

            articles.append(story)

        self.output = articles

In [7]:
class FoxNews(NewsSource):
    """Scraper for Fox News article pages."""

    def __init__(self, seed_url, crawler):
        super().__init__('FoxNews', seed_url, crawler)

    def filter_links(self, links):
        """Drops every link containing '/category/' or '.print'."""
        return {link for link in links
                if '/category/' not in link and '.print' not in link}

    def scrape(self):
        """
        Downloads every link and stores one story dict per usable article in `self.output`.
        Pages that cannot be downloaded or lack the expected markup are skipped
        with a message instead of aborting the whole run.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            try:
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as err:
                # URLError/HTTPError and socket timeouts are all OSError subclasses.
                print(i+1,'/',total,'Skipping unreachable article:', URL, err)
                continue

            # Headline and date come from the page's meta tags.
            title_meta = sll.css_first('meta[name="dc.title"]')
            created_meta = sll.css_first('meta[name="dcterms.created"]')
            main_article = sll.css_first('.article-body')

            if title_meta is None or created_meta is None or main_article is None:
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            try:
                headline = title_meta.attributes['content']
                timestamp = parse(created_meta.attributes['content'])
            except (KeyError, TypeError, ValueError, OverflowError):
                # content attribute missing/empty, or a date dateutil cannot parse
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            print(i+1,'/',total,URL)

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp.strftime("%m/%d/%Y, %H:%M:%S")
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article.css('p'):
                story['content'].append(paragraph.text(deep=True, separator=''))

            articles.append(story)

        self.output = articles

In [12]:
class Breitbart(NewsSource):
    """Scraper for Breitbart article pages."""

    def __init__(self, seed_url, crawler):
        super().__init__('Breitbart', seed_url, crawler)

    def filter_links(self, links):
        """Drops every link containing '/tag/'."""
        return {link for link in links if '/tag/' not in link}

    def scrape(self):
        """
        Downloads every link and stores one story dict per usable article in `self.output`.
        Pages that cannot be downloaded or lack the expected markup are skipped
        with a message instead of aborting the whole run.
        """
        super().scrape()
        articles = []
        total = len(self.links)

        for i, URL in enumerate(self.links):
            try:
                with urllib.request.urlopen(URL) as r:
                    sll = HTMLParser(r.read())
            except OSError as err:
                # URLError/HTTPError and socket timeouts are all OSError subclasses.
                print(i+1,'/',total,'Skipping unreachable article:', URL, err)
                continue

            title_meta = sll.css_first('meta[property="og:title"]')
            main_article = sll.css_first('.entry-content')
            # Two alternative meta tags may carry the publication date.
            date_meta = sll.css_first('meta[property="article:published_time"],meta[name="pubdate"]')

            if title_meta is None or main_article is None or date_meta is None:
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            try:
                headline = title_meta.attributes['content']
                timestamp = parse(date_meta.attributes['content'])
            except (KeyError, TypeError, ValueError, OverflowError):
                # content attribute missing/empty, or a date dateutil cannot parse
                print(i+1,'/',total,'Skipping invalid article:', URL)
                continue

            print(i+1,'/',total,URL)

            story = {}
            story['content'] = []
            story['headline'] = headline
            story['time-stamp'] = timestamp.strftime("%m/%d/%Y, %H:%M:%S")
            story['url'] = URL
            story['journal'] = self.journal

            for paragraph in main_article.css('p'):
                story['content'].append(paragraph.text(deep=True, separator=''))

            articles.append(story)

        self.output = articles

In [13]:
# One crawler instance is shared by all news sources below.
crawler = Crawler(
    keywords=['coronavirus', 'Coronavirus', 'virus'],
    max_urls=200,
)

In [None]:
# Sueddeutsche Zeitung: crawl from the seed article, scrape the hits, write sdz.json.
sdz_url = 'https://www.sueddeutsche.de/politik/coronavirus-italien-quarantaene-1.4816985'
sdz = SueddeutscheZeitung(seed_url=sdz_url, crawler=crawler)
sdz.scrape()
sdz.export(filename='sdz')

In [None]:
# BBC: crawl from the seed article, scrape the hits, write bbc.json.
bbc_url = 'https://www.bbc.com/news/world-us-canada-51637481'
bbc = BBC(seed_url=bbc_url, crawler=crawler)
bbc.scrape()
bbc.export(filename='bbc')

In [None]:
# Fox News: crawl from the seed article, scrape the hits, write fox.json.
fox_url = 'https://www.foxnews.com/politics/whistleblower-says-hhs-sent-workers-to-handle-possible-coronavirus-patients-without-gear-training'
fox = FoxNews(seed_url=fox_url, crawler=crawler)
fox.scrape()
fox.export(filename='fox')

In [14]:
# Breitbart: crawl from the seed article, scrape the hits, write breitbart.json.
breitbart_url = 'https://www.breitbart.com/national-security/2020/02/27/report-police-stealing-food-donated-to-chinese-city-under-coronavirus-lockdown/'
breitbart = Breitbart(seed_url=breitbart_url, crawler=crawler)
breitbart.scrape()
breitbart.export(filename='breitbart')

Crawling for sources using seed...
https://www.breitbart.com/national-security/2020/02/27/report-police-stealing-food-donated-to-chinese-city-under-coronavirus-lockdown/
crawling 1 / 200
crawling 2 / 200
crawling 3 / 200
crawling 4 / 200
crawling 5 / 200
crawling 6 / 200
crawling 7 / 200
crawling 8 / 200
crawling 9 / 200
crawling 10 / 200
crawling 11 / 200
crawling 12 / 200
crawling 13 / 200
crawling 14 / 200
crawling 15 / 200
crawling 16 / 200
crawling 17 / 200
crawling 18 / 200
crawling 19 / 200
crawling 20 / 200
crawling 21 / 200
crawling 22 / 200
crawling 23 / 200
crawling 24 / 200
crawling 25 / 200
crawling 26 / 200
crawling 27 / 200
crawling 28 / 200
crawling 29 / 200
crawling 30 / 200
crawling 31 / 200
crawling 32 / 200
crawling 33 / 200
crawling 34 / 200
crawling 35 / 200
crawling 36 / 200
crawling 37 / 200
crawling 38 / 200
crawling 39 / 200
crawling 40 / 200
crawling 41 / 200
crawling 42 / 200
crawling 43 / 200
crawling 44 / 200
crawling 45 / 200
crawling 46 / 200
crawling 47

37 / 266 https://www.breitbart.com/national-security/2020/02/21/china-bullies-amazon-over-coronavirus-made-in-china-t-shirt/
38 / 266 https://www.breitbart.com/national-security/2020/02/27/japan-to-close-schools-nationwide-in-bid-to-contain-coronavirus/
39 / 266 https://www.breitbart.com/europe/2020/02/26/sweden-not-prepared-for-major-coronavirus-outbreak-says-whistle-blower-doctor/
40 / 266 https://www.breitbart.com/border/2016/05/19/texas-medical-branch-clones-zika-virus-for-potential-vaccine/
41 / 266 https://www.breitbart.com/national-security/2020/02/17/xi-jinping-demands-obedience-to-battle-coronavirus-shifts-blame-to-local-officials/
42 / 266 https://www.breitbart.com/radio/2020/02/25/steven-mosher-most-likely-explanation-for-coronavirus-is-wuhan-bioweapons-lab/
43 / 266 https://www.breitbart.com/big-government/2016/04/01/cdc-urges-u-s-to-prepare-for-arrival-of-zika-virus/
44 / 266 https://www.breitbart.com/border/2020/02/26/83-new-yorkers-in-voluntary-isolation-for-coronavirus-

104 / 266 https://www.breitbart.com/asia/2020/02/24/study-china-lied-wuhan-virus-did-not-originate-in-meat-market/
105 / 266 https://www.breitbart.com/national-security/2015/10/17/pope-francis-hypocrisy-ambiguity-virus/
106 / 266 https://www.breitbart.com/health/2020/02/26/report-flight-attendant-working-from-lax-diagnosed-with-coronavirus/
107 / 266 https://www.breitbart.com/economy/2020/02/21/coronavirus-drag-on-u-s-economy-appears-bigger-than-expected/
108 / 266 https://www.breitbart.com/national-security/2020/02/21/china-applauds-pregnant-ill-women-returning-to-coronavirus-front-lines/
109 / 266 https://www.breitbart.com/national-security/2016/01/13/zika-virus/
110 / 266 https://www.breitbart.com/politics/2020/01/28/philly-school-ends-chinese-exchange-program-as-u-s-schools-confront-coronavirus/
111 / 266 https://www.breitbart.com/border/2016/05/02/new-zika-virus-test-available/
112 / 266 https://www.breitbart.com/tech/2020/01/31/facebook-cracks-down-on-coronavirus-conspiracy-theor

172 / 266 https://www.breitbart.com/middle-east/2020/02/25/netanyahu-coronvirus-no-reason-to-postpone-elections/
173 / 266 https://www.breitbart.com/politics/2020/02/26/watch-live-president-donald-trump-gives-coronavirus-update/
174 / 266 https://www.breitbart.com/middle-east/2020/01/30/israel-bars-flights-from-china-over-coronavirus-fears/
175 / 266 https://www.breitbart.com/europe/2020/02/25/italy-expects-at-least-e5-billion-loss-in-tourism-over-coronavirus/
176 / 266 https://www.breitbart.com/politics/2020/02/25/mitt-romney-trump-administration-substantially-unprepared-for-coronavirus/
177 / 266 https://www.breitbart.com/clips/2020/02/26/bloomberg-on-coronavirus-we-are-as-exposed-to-this-kind-of-thing-as-weve-ever-been/
178 / 266 https://www.breitbart.com/politics/2020/02/25/cdc-warns-coronavirus-spread-usa-disruption-to-everyday-life-might-be-severe/
179 / 266 https://www.breitbart.com/middle-east/2020/02/26/hassan-rouhani-iran-to-make-thousands-of-dubious-homemade-coronavirus-test

239 / 266 https://www.breitbart.com/politics/2020/02/27/scientist-coronavirus-not-airborne-but-extremely-contagious/
240 / 266 https://www.breitbart.com/middle-east/2020/02/27/saudi-arabia-suspends-entry-for-pilgrims-over-coronavirus/
241 / 266 https://www.breitbart.com/health/2020/02/25/italy-coronavirus-death-toll-rises-to-seven-as-confirmed-cases-stand-at-283/
242 / 266 https://www.breitbart.com/europe/2020/02/25/amid-italy-coronavirus-outbreak-ideology-driven-left-push-open-borders/
243 / 266 https://www.breitbart.com/national-security/2020/02/19/report-china-deploys-hundreds-internet-censors-shut-down-coronavirus-talk/
244 / 266 https://www.breitbart.com/national-security/2020/02/26/pompeo-u-s-deeply-concerned-iran-suppressing-coronavirus-information/
245 / 266 https://www.breitbart.com/health/2020/02/07/cruise-ship-passengers-hospitalized-nj-caution-coronavirus/
246 / 266 https://www.breitbart.com/news/xi-says-china-facing-big-test-with-virus-global-impact-spreads/
247 / 266 http