In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess
import datetime

In [2]:
# Today's local date as a compact YYYYMMDD string.
date = '{:%Y%m%d}'.format(datetime.datetime.today())

In [3]:
class APSpider(scrapy.Spider):
    """Spider that follows story links found on the AP News start page.

    The feed location is set per-spider through ``custom_settings`` so the
    scraped items go to ``data/ap_<date>.json``, where ``<date>`` is the
    module-level ``date`` string.
    """
    name = 'Associated Press'
    uri = 'data/ap_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}
    start_urls = ['https://www.apnews.com']

    def parse(self, response):
        """Yield one request per story link matched on the start page."""
        hrefs = response.xpath('//a[@class="main-story-extract" or @class="content-container"]/@href').extract()
        for href in hrefs:
            # The href exactly as it appears in the page (before it is made
            # absolute) is what gets carried along as the article id.
            request_meta = {'article_id': href}
            yield scrapy.Request(response.urljoin(href), callback=self.parse_article, meta=request_meta)

    def parse_article(self, response):
        """Yield a single item holding the article id and its joined text."""
        paragraphs = response.xpath('//div[@class="Article"]/p/text()').extract()
        # Only keep text nodes whose opening and closing curly quotes balance.
        balanced = [text for text in paragraphs if text.count('\u201c') == text.count('\u201d')]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [4]:
class ReutSpider(scrapy.Spider):
    """Spider that follows headline links found on the Reuters start page.

    The feed location is set per-spider through ``custom_settings`` so the
    scraped items go to ``data/reut_<date>.json``, where ``<date>`` is the
    module-level ``date`` string.
    """
    name = 'Reuters'
    uri = 'data/reut_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.reuters.com']

    def parse(self, response):
        """Yield one request per headline link matched on the start page.

        Each request carries the absolute article URL as ``article_id`` in
        its ``meta`` so ``parse_article`` can attach it to the item.
        """
        # Both branches of the union must start with `//`. Without it the
        # second branch is a relative path evaluated from the document root
        # and never matches the nested <h2 class="story-title"> elements.
        headline_xpath = (
            '//h3[@class="article-heading"]/a/@href'
            ' | //h2[@class="story-title"]/a/@href'
        )
        for link in response.xpath(headline_xpath).extract():
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single item holding the article id and its joined text."""
        paragraphs = response.xpath('//div[@class="StandardArticleBody_body"]/p/text()').extract()
        # Only keep text nodes whose opening and closing curly quotes balance.
        balanced = [para for para in paragraphs if para.count('\u201c') == para.count('\u201d')]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [5]:
class CSMSpider(scrapy.Spider):
    """Spider that follows headline links on the CS Monitor USA page.

    The feed location is set per-spider through ``custom_settings`` so the
    scraped items go to ``data/csm_<date>.json``, where ``<date>`` is the
    module-level ``date`` string.
    """
    name = 'Christian Science Monitor'
    uri = 'data/csm_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.csmonitor.com/USA']

    def parse(self, response):
        """Yield one request per headline link, skipping excluded URLs."""
        hrefs = response.xpath('//h3[@class="story_headline"]/a[not(@title="ID  jtPhotos")]/@href').extract()
        for href in hrefs:
            url = response.urljoin(href)
            # URLs containing either marker are not followed.
            if "csmlists" in url or "Photo-Galleries" in url:
                continue
            yield scrapy.Request(url, callback=self.parse_article, meta={'article_id': url})

    def parse_article(self, response):
        """Yield a single item holding the article id and its joined text."""
        # Every <p> text node on the page is considered, not just a body div.
        paragraphs = response.xpath('//p/text()').extract()
        # Only keep text nodes whose opening and closing curly quotes balance.
        balanced = [text for text in paragraphs if text.count('\u201c') == text.count('\u201d')]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(balanced).strip()
        }

In [6]:
# Settings shared by every spider; each spider supplies its own FEED_URI
# through its `custom_settings`.
crawler_settings = {
    'FEED_FORMAT': 'json',
    'LOG_ENABLED': False,
    'AUTOTHROTTLE_ENABLED': True,
    'HTTPCACHE_ENABLED': True,
    'ROBOTSTXT_OBEY': True,
    'USER_AGENT': 'Vince Giorno (Thinkful Data Science Bootcamp)'
}
process = CrawlerProcess(crawler_settings)

# Register the spiders in the same order as before, then run the crawl.
for spider_cls in (APSpider, ReutSpider, CSMSpider):
    process.crawl(spider_cls)
process.start()
print('Success!')

Success!
