In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess
import datetime

In [2]:
# Today's local calendar date as a compact YYYYMMDD string; the spider classes
# below embed it in their output file names.
date = datetime.date.today().strftime('%Y%m%d')

In [3]:
class APSpider(scrapy.Spider):
    """Spider that starts at the apnews.com front page and scrapes linked stories.

    Each scraped story is emitted as a dict with the story URL (``id``) and
    its concatenated paragraph text (``article``). The feed location is set
    per-spider through ``custom_settings['FEED_URI']``.
    """

    name = 'Associated Press'
    uri = 'data/ap_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}
    start_urls = ['https://www.apnews.com']

    def parse(self, response):
        """Yield one request per story anchor found on the start page.

        The absolute story URL is passed along in ``meta['article_id']`` so
        that ``parse_article`` can use it as the item id.
        """
        hrefs = response.xpath(
            '//a[@class="main-story-extract" or @class="content-container"]/@href'
        ).extract()
        for href in hrefs:
            story_url = response.urljoin(href)
            yield scrapy.Request(
                story_url,
                callback=self.parse_article,
                meta={'article_id': story_url},
            )

    def parse_article(self, response):
        """Yield a single ``{'id', 'article'}`` item for one story page.

        Only text nodes with the same number of opening (U+201C) and closing
        (U+201D) curly quotes are kept -- presumably to drop fragments of
        quotations that were split across nodes (TODO confirm intent).
        """
        paragraphs = response.xpath('//div[@class="Article"]/p/text()').extract()
        balanced = [
            text for text in paragraphs
            if text.count('\u201c') == text.count('\u201d')
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [4]:
class ReutSpider(scrapy.Spider):
    """Spider that starts at the reuters.com front page and scrapes linked stories.

    Each scraped story is emitted as a dict with the story URL (``id``) and
    its concatenated paragraph text (``article``). The feed location is set
    per-spider through ``custom_settings['FEED_URI']``.
    """

    name = 'Reuters'
    uri = 'data/reut_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.reuters.com']

    def parse(self, response):
        """Yield one request per headline link found on the start page.

        Both branches of the XPath union must start with ``//``. Without it
        the ``h2`` branch is evaluated relative to the document root and only
        matches an ``h2`` that is a direct child of the root, so those
        headlines were silently skipped.
        """
        headline_links = response.xpath(
            '//h3[@class="article-heading"]/a/@href'
            ' | //h2[@class="story-title"]/a/@href'
        ).extract()
        for link in headline_links:
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single ``{'id', 'article'}`` item for one story page.

        Only text nodes with the same number of opening (U+201C) and closing
        (U+201D) curly quotes are kept -- presumably to drop fragments of
        quotations that were split across nodes (TODO confirm intent).
        """
        paragraphs = response.xpath('//div[@class="StandardArticleBody_body"]/p/text()').extract()
        balanced = [
            para for para in paragraphs
            if para.count('\u201c') == para.count('\u201d')
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [5]:
class CSMSpider(scrapy.Spider):
    """Spider that starts at csmonitor.com/USA and scrapes linked stories.

    Each scraped story is emitted as a dict with the story URL (``id``) and
    its concatenated paragraph text (``article``). The feed location is set
    per-spider through ``custom_settings['FEED_URI']``.
    """

    name = 'Christian Science Monitor'
    uri = 'data/csm_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.csmonitor.com/USA']

    def parse(self, response):
        """Yield one request per story headline, skipping list and gallery URLs."""
        for href in response.xpath('//h3[@class="story_headline"]/a/@href').extract():
            story_url = response.urljoin(href)
            # URLs containing either marker are not followed.
            if "csmlists" in story_url or "Photo-Galleries" in story_url:
                continue
            yield scrapy.Request(
                story_url,
                callback=self.parse_article,
                meta={'article_id': story_url},
            )

    def parse_article(self, response):
        """Yield a single ``{'id', 'article'}`` item for one story page.

        Text is taken from every ``<p>`` on the page. Only text nodes with the
        same number of opening (U+201C) and closing (U+201D) curly quotes are
        kept -- presumably to drop fragments of quotations that were split
        across nodes (TODO confirm intent).
        """
        paragraphs = response.xpath('//p/text()').extract()
        balanced = [
            text for text in paragraphs
            if text.count('\u201c') == text.count('\u201d')
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [6]:
class NPRSpider(scrapy.Spider):
    """Spider that starts at npr.org/sections/news and scrapes linked stories.

    Each scraped story is emitted as a dict with the story URL (``id``) and
    its concatenated paragraph text (``article``). The feed location is set
    per-spider through ``custom_settings['FEED_URI']``.
    """

    name = 'National Public Radio'
    uri = 'data/npr_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.npr.org/sections/news']

    def parse(self, response):
        """Yield one request per story title link found on the start page."""
        for link in response.xpath('//h2[@class="title"]/a/@href').extract():
            # Resolve against the page URL like the other spiders in this file:
            # absolute hrefs pass through unchanged, while a relative href would
            # otherwise make scrapy.Request fail for lack of a URL scheme.
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single ``{'id', 'article'}`` item for one story page.

        Text is taken from every ``<p>`` on the page. Only text nodes with the
        same number of opening (U+201C) and closing (U+201D) curly quotes are
        kept -- presumably to drop fragments of quotations that were split
        across nodes (TODO confirm intent).
        """
        paragraphs = response.xpath('//p/text()').extract()
        balanced = [
            para for para in paragraphs
            if para.count('\u201c') == para.count('\u201d')
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [7]:
class HillSpider(scrapy.Spider):
    """Spider that starts at the thehill.com front page and scrapes linked stories.

    Each scraped story is emitted as a dict with the story URL (``id``) and
    its concatenated paragraph text (``article``). The feed location is set
    per-spider through ``custom_settings['FEED_URI']``.
    """

    # Previously 'National Public Radio' (copied from NPRSpider), which gave two
    # spiders the same name and mislabelled this one.
    name = 'The Hill'
    uri = 'data/hill_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://thehill.com']

    def parse(self, response):
        """Yield one request per story link found on the start page.

        Links are gathered from three places: anchors inside
        ``span.field-content``, anchors inside ``h4`` headings, and every
        anchor under ``ul.more_headlines``.
        """
        # Adjacent string literals keep the XPath on several lines without a
        # backslash continuation inside the string.
        story_links = response.xpath(
            '//span[@class="field-content"]/a/@href'
            ' | //h4/a/@href'
            ' | //ul[@class="more_headlines"]//a/@href'
        ).extract()
        for link in story_links:
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single ``{'id', 'article'}`` item for one story page.

        Text is taken from every ``<p>`` on the page. Only text nodes with the
        same number of opening (U+201C) and closing (U+201D) curly quotes are
        kept -- presumably to drop fragments of quotations that were split
        across nodes (TODO confirm intent).
        """
        paragraphs = response.xpath('//p/text()').extract()
        balanced = [
            para for para in paragraphs
            if para.count('\u201c') == para.count('\u201d')
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [8]:
# Process-wide Scrapy settings shared by every spider scheduled below; each
# spider adds its own FEED_URI through custom_settings.
process = CrawlerProcess(
    settings={
        'FEED_FORMAT': 'json',
        'LOG_ENABLED': False,
        'AUTOTHROTTLE_ENABLED': True,
        'HTTPCACHE_ENABLED': True,
        'ROBOTSTXT_OBEY': True,
        'USER_AGENT': 'Vince Giorno (Thinkful Data Science Bootcamp)',
    }
)

# Only the Hill spider is scheduled in this run; uncomment a line to add
# another source.
#process.crawl(APSpider)
#process.crawl(ReutSpider)
#process.crawl(CSMSpider)
#process.crawl(NPRSpider)
process.crawl(HillSpider)

# NOTE(review): per the Scrapy docs start() blocks until the scheduled crawls
# finish. The message below is printed unconditionally once it returns and
# does not check that anything was actually scraped.
process.start()
print('Success!')

https://thehill.com/homenews/administration/419864-george-w-bush-chokes-back-tears-in-eulogy-for-father
https://thehill.com/homenews/administration/419838-trump-joins-presidents-club-for-the-first-time-shakes-hand-with
https://thehill.com/blogs/in-the-know/in-the-know/419847-george-w-bush-michelle-obama-share-moment-at-state-funeral
https://thehill.com/homenews/morning-report/419777-the-hills-morning-report
https://thehill.com/policy/national-security/419769-mueller-requests-no-prison-time-for-flynn-citing-his-substantial
https://thehill.com/homenews/administration/419797-trump-touts-china-actions-on-trade-one-day-after-markets-crumble
https://thehill.com/homenews/campaign/419755-california-primary-threatens-to-change-game-for-democrats
https://thehill.com/homenews/state-watch/419793-wisconsin-senate-passes-limits-on-incoming-dem-governor
https://thehill.com/homenews/senate/419760-criminal-justice-reform-splits-2020-democrats
https://thehill.com/policy/defense/419751-trump-gop-rift-gro