In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess
import datetime

In [2]:
# Today's local date as YYYYMMDD; the spider classes embed it in their output file names.
date = '{:%Y%m%d}'.format(datetime.datetime.today())

In [3]:
class APSpider(scrapy.Spider):
    """Collect article text from stories linked on the Associated Press home page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/ap_<date>.json``.
    """
    name = 'Associated Press'
    uri = 'data/ap_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}
    start_urls = ['https://www.apnews.com']

    def parse(self, response):
        """Issue one request per story link found on the start page."""
        hrefs = response.xpath(
            '//a[@class="main-story-extract" or @class="content-container"]/@href'
        ).extract()
        for href in hrefs:
            story_url = response.urljoin(href)
            # The absolute URL rides along in meta so parse_article can use it as the id.
            yield scrapy.Request(story_url, callback=self.parse_article,
                                 meta={'article_id': story_url})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        # A text node containing any of these substrings is discarded.
        unwanted = ('___', 'AP\u2019', 'AP ', 'http', 'Associated Press')
        kept = []
        for text in response.xpath('//div[@class="Article"]/p//text()').extract():
            if any(marker in text for marker in unwanted):
                continue
            # Discard nodes with mismatched curly quotes...
            if text.count('\u201c') != text.count('\u201d'):
                continue
            # ...or an odd number of straight double quotes.
            if text.count('"') % 2:
                continue
            kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [4]:
class ReutSpider(scrapy.Spider):
    """Collect article text from stories linked on the Reuters home page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/reut_<date>.json``.
    """
    name = 'Reuters'
    uri = 'data/reut_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.reuters.com']

    def parse(self, response):
        """Issue one request per headline link found on the start page."""
        # Both union branches are anchored with '//' so each one searches the
        # whole document; a branch without it is evaluated relative to the
        # document root and selects nothing.
        headline_links = response.xpath(
            '//h3[@class="article-heading"]/a/@href | //h2[@class="story-title"]/a/@href'
        ).extract()
        for link in headline_links:
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        kept = []
        for para in response.xpath('//div[@class="StandardArticleBody_body"]/p//text()').extract():
            # Skip nodes mentioning "tmsnrt", nodes with mismatched curly
            # quotes, and nodes with an odd number of straight double quotes.
            if "tmsnrt" not in para and para.count('\u201c') == para.count('\u201d') and para.count('"') % 2 == 0:
                kept.append(para)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [5]:
class CSMSpider(scrapy.Spider):
    """Collect article text from stories linked on the Christian Science Monitor USA page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/csm_<date>.json``.
    """
    name = 'Christian Science Monitor'
    uri = 'data/csm_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.csmonitor.com/USA']

    def parse(self, response):
        """Issue one request per story headline, skipping list and gallery URLs."""
        for href in response.xpath('//h3[@class="story_headline"]/a/@href').extract():
            target = response.urljoin(href)
            # URLs containing these substrings are not followed.
            if "csmlists" in target or "Photo-Galleries" in target:
                continue
            yield scrapy.Request(target, callback=self.parse_article,
                                 meta={'article_id': target})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered text."""
        # A text node containing any of these phrases is discarded.
        boilerplate = ('Link copied', 'Get unlimited', 'Log in', 'Less noise',
                       'Already a Monitor', 'unlimited digital access')
        pieces = [
            text
            for text in response.xpath('//p[not(@id="contact-dialog-text")]//text()').extract()
            if not any(phrase in text for phrase in boilerplate)
            # Keep only nodes with matched curly quotes and an even number of
            # straight double quotes.
            and text.count('\u201c') == text.count('\u201d')
            and text.count('"') % 2 == 0
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(pieces).strip()
        }

In [6]:
class NPRSpider(scrapy.Spider):
    """Collect article text from stories linked on the NPR news section page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/npr_<date>.json``.
    """
    name = 'National Public Radio'
    uri = 'data/npr_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.npr.org/sections/news']

    def parse(self, response):
        """Issue one request per story title link found on the start page."""
        for link in response.xpath('//h2[@class="title"]/a/@href').extract():
            # Resolve against the page URL so a relative href still produces a
            # valid request URL; absolute hrefs pass through.
            link = response.urljoin(link)
            yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        kept = []
        for para in response.xpath('//div[@id="storytext"]/p//text()').extract():
            # Keep only nodes with matched curly quotes and an even number of
            # straight double quotes.
            if para.count('\u201c') == para.count('\u201d') and para.count('"') % 2 == 0:
                kept.append(para)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [7]:
class HillSpider(scrapy.Spider):
    """Collect article text from stories linked on thehill.com.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/hill_{date}.json``.
    """
    # Scrapy expects each spider to carry its own unique name.
    name = 'The Hill'
    uri = 'data/hill_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://thehill.com']

    def parse(self, response):
        """Issue one request per story link, skipping overnight and morning-report URLs."""
        for link in response.xpath('//span[@class="field-content"]/a/@href | //h4/a/@href \
                                   | //ul[@class="more_headlines"]//a/@href').extract():
            if 'overnights' not in link and 'morning-report' not in link:
                link = response.urljoin(link)
                yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        kept = []
        # The union lists the specific nesting paths under "field-items" whose
        # text nodes are collected.
        for para in response.xpath('//div[@class="field-items"]/div/p/text() \
                                   | //div[@class="field-items"]/div/p/a/text() \
                                   | //div[@class="field-items"]/div/p/span/text() \
                                   | //div[@class="field-items"]/div/p/span/a/text() \
                                   | //div[@class="field-items"]/div/p/span/span/a/text() \
                                   | //div[@class="field-items"]/div/div/text() \
                                   | //div[@class="field-items"]/div/div/a/text() \
                                   | //div[@class="field-items"]/div/div/span/text() \
                                   | //div[@class="field-items"]/div/div/span/a/text()').extract():
            # Keep only nodes with matched curly quotes and an even number of
            # straight double quotes.
            if para.count('\u201c') == para.count('\u201d') and para.count('"') % 2 == 0:
                kept.append(para)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [8]:
class NRSpider(scrapy.Spider):
    """Collect article text from stories linked on the National Review home page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/nr_<date>.json``.
    """
    name = 'National Review'
    uri = 'data/nr_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.nationalreview.com']

    def parse(self, response):
        """Issue one request per on-site headline link, skipping non-article sections."""
        # Links containing any of these substrings are not followed.
        skipped_sections = ("photos", "videos", "magazine", "interview", "podcasts")
        for href in response.xpath('//h4/a/@href').extract():
            # Only links that mention the site's own domain are considered.
            if "nationalreview.com" not in href:
                continue
            if any(section in href for section in skipped_sections):
                continue
            yield scrapy.Request(href, callback=self.parse_article,
                                 meta={'article_id': href})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        pieces = [
            text
            for text in response.xpath('//div[@class="article-content"]/p//text()').extract()
            # Keep only nodes with matched curly quotes and an even number of
            # straight double quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(pieces).strip()
        }

In [9]:
class ABCSpider(scrapy.Spider):
    """Collect article text from stories linked on the ABC News home page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/abc_{date}.json``.
    """
    # Scrapy expects each spider to carry its own unique name.
    name = 'ABC News'
    uri = 'data/abc_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://abcnews.go.com/']

    def parse(self, response):
        """Issue one request per <h1> link, skipping "void" hrefs and the bare site root."""
        for link in response.xpath('//h1/a/@href').extract():
            if "void" not in link and link != "https://abcnews.go.com":
                # Resolve against the page URL so a relative href still
                # produces a valid request URL; absolute hrefs pass through.
                link = response.urljoin(link)
                yield scrapy.Request(link, callback=self.parse_article, meta={'article_id': link})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        kept = []
        # Only paragraphs without <em> or <strong> children contribute text.
        for para in response.xpath('//div[@class="article-copy"]/p[count(em)=0 and count(strong)=0]//text()').extract():
            # A run of three em dashes ends collection; nothing after it is kept.
            if '\u2014\u2014\u2014' in para:
                break
            # Keep only nodes with matched curly quotes and an even number of
            # straight double quotes.
            if para.count('\u201c') == para.count('\u201d') and para.count('"') % 2 == 0:
                kept.append(para)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [10]:
class PolcoSpider(scrapy.Spider):
    """Collect article text from stories linked on the Politico home page.

    Every scraped article is yielded as a dict with the article URL under
    ``id`` and the filtered, space-joined body text under ``article``.
    ``custom_settings`` points ``FEED_URI`` at ``data/polco_<date>.json``.
    """
    name = 'Politico'
    uri = 'data/polco_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.politico.com/']

    def parse(self, response):
        """Issue one request per <h1> link, skipping excluded URL patterns and the home page."""
        # Hrefs containing any of these substrings are not followed.
        excluded = ("gallery", "list", "nerdcast", "video", "news-tips")
        for href in response.xpath('//h1/a/@href').extract():
            if href == "https://www.politico.com/":
                continue
            if any(term in href for term in excluded):
                continue
            story_url = response.urljoin(href)
            yield scrapy.Request(story_url, callback=self.parse_article,
                                 meta={'article_id': story_url})

    def parse_article(self, response):
        """Yield a single item with the article URL and its filtered body text."""
        # Two page layouts are covered by the union; "story-continued"
        # paragraphs are excluded from both.
        text_nodes = response.xpath('//article/div[1]/section[2]/div/div/p[not(@class="story-continued") and \
                                   not(count(em)) and not(count(i))]//text() | \
                        //div[@class="story-text has-sidebar"]/p[not(@class="story-continued")]//text()').extract()
        pieces = []
        for text in text_nodes:
            # Discard nodes with mismatched curly quotes...
            if text.count('\u201c') != text.count('\u201d'):
                continue
            # ...or an odd number of straight double quotes.
            if text.count('"') % 2:
                continue
            pieces.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(pieces).strip()
        }

In [11]:
# Process-wide crawler settings applied to every spider scheduled on `process`.
process = CrawlerProcess(dict(
    FEED_FORMAT='json',
    LOG_ENABLED=False,
    AUTOTHROTTLE_ENABLED=True,
    HTTPCACHE_ENABLED=True,
    ROBOTSTXT_OBEY=True,
    USER_AGENT='Vince Giorno (Thinkful Data Science Bootcamp)',
))

# Every spider registration below is commented out, so as written no spider is
# scheduled before start() is called. Uncomment the ones to run.
#process.crawl(APSpider)
#process.crawl(ReutSpider)
#process.crawl(CSMSpider)
#process.crawl(NPRSpider)
#process.crawl(HillSpider)
#process.crawl(NRSpider)
#process.crawl(ABCSpider)
#process.crawl(PolcoSpider)

process.start()
# Printed as soon as start() returns; it does not depend on any crawl outcome.
print('Success!')

Success!
