In [1]:
import scrapy
import re
from scrapy.crawler import CrawlerProcess
import datetime

In [2]:
# Run-date stamp in YYYYMMDD form; the spider classes embed it in their output file names.
date = '{:%Y%m%d}'.format(datetime.datetime.today())

In [3]:
class APSpider(scrapy.Spider):
    """Spider that starts at the Associated Press home page and emits one item per linked story.

    Each item is a dict holding the story URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'Associated Press'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/ap_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}
    start_urls = ['https://www.apnews.com']

    def parse(self, response):
        """Yield a request for every anchor classed as a story link on the start page."""
        hrefs = response.xpath(
            '//a[@class="main-story-extract" or @class="content-container"]/@href'
        ).extract()
        for href in hrefs:
            story_url = response.urljoin(href)
            # The resolved URL doubles as the item's identifier.
            yield scrapy.Request(story_url, callback=self.parse_article, meta={'article_id': story_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single story page."""
        # A text node containing any of these substrings is dropped.
        banned = ('___', 'AP\u2019', 'AP ', 'http', 'Associated Press')
        kept = []
        for text in response.xpath('//div[@class="Article"]/p[text()]//text()').extract():
            if any(marker in text for marker in banned):
                continue
            # Drop nodes whose curly quotes do not pair up ...
            if text.count('\u201c') != text.count('\u201d'):
                continue
            # ... or that contain an odd number of straight quotes.
            if text.count('"') % 2:
                continue
            kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [4]:
class ReutSpider(scrapy.Spider):
    """Spider that starts at the Reuters home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'Reuters'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/reut_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.reuters.com']

    def parse(self, response):
        """Yield a request for every headline link found on the start page."""
        # Both union branches must begin with `//`. A bare `h2[...]` step is
        # evaluated relative to the document node and therefore only inspects
        # its direct children, so headings nested in the page would never match.
        headline_hrefs = response.xpath(
            '//h3[@class="article-heading"]/a/@href | //h2[@class="story-title"]/a/@href'
        ).extract()
        for href in headline_hrefs:
            article_url = response.urljoin(href)
            # The resolved URL doubles as the item's identifier.
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = []
        for text in response.xpath('//div[@class="StandardArticleBody_body"]/p[text()]//text()').extract():
            # Skip nodes mentioning "tmsnrt", and nodes with unpaired curly
            # quotes or an odd number of straight quotes.
            if "tmsnrt" in text:
                continue
            if text.count('\u201c') != text.count('\u201d') or text.count('"') % 2 != 0:
                continue
            kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [5]:
class CSMSpider(scrapy.Spider):
    """Spider that walks several Christian Science Monitor section pages.

    Emits one item per linked story: the story URL under ``'id'`` and the
    retained paragraph text under ``'article'``.
    """

    name = 'Christian Science Monitor'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/csm_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    # One start URL per section slug.
    start_urls = [
        'https://www.csmonitor.com/{}'.format(section)
        for section in ['USA', 'World', 'Commentary', 'Business',
                        'Science', 'Environment', 'Technology', 'The-Culture']
    ]

    def parse(self, response):
        """Yield a request for each story headline link that passes the URL filters."""
        for href in response.xpath('//h3[@class="story-headline"]/a/@href').extract():
            story_url = response.urljoin(href)
            if "csmlists" in story_url or "Photo-Galleries" in story_url:
                continue
            # Only URLs with more than four slashes are followed.
            if story_url.count("/") <= 4:
                continue
            yield scrapy.Request(story_url, callback=self.parse_article, meta={'article_id': story_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single story page."""
        # A text node containing any of these phrases is dropped.
        boilerplate = ('Link copied', 'Get unlimited', 'Log in', 'Less noise',
                       'Already a Monitor', 'unlimited digital access')
        kept = []
        for text in response.xpath('//p[not(@id="contact-dialog-text")]//text()').extract():
            has_boilerplate = any(phrase in text for phrase in boilerplate)
            curly_paired = text.count('\u201c') == text.count('\u201d')
            straight_paired = text.count('"') % 2 == 0
            if not has_boilerplate and curly_paired and straight_paired:
                kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [6]:
class NPRSpider(scrapy.Spider):
    """Spider that starts at the NPR news section and emits one item per linked story.

    Each item is a dict holding the story URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'National Public Radio'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/npr_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.npr.org/sections/news']

    def parse(self, response):
        """Yield a request for every story title link on the section page."""
        for href in response.xpath('//h2[@class="title"]/a/@href').extract():
            # Resolve against the page URL so a relative href still produces a
            # usable request URL instead of failing inside this generator and
            # cutting the rest of the listing short.
            story_url = response.urljoin(href)
            yield scrapy.Request(story_url, callback=self.parse_article, meta={'article_id': story_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single story page."""
        kept = [
            text
            for text in response.xpath('//div[@id="storytext"]/p[text()]//text()').extract()
            # Keep only nodes with paired curly quotes and an even number of
            # straight quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [7]:
class HillSpider(scrapy.Spider):
    """Spider that starts at The Hill's home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    text under ``'article'``.
    """

    # This spider crawls thehill.com, so it carries its own name rather than
    # the NPR label it previously duplicated.
    name = 'The Hill'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/hill_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://thehill.com']

    def parse(self, response):
        """Yield a request for each headline link, skipping two recurring digests."""
        link_xpath = (
            '//span[@class="field-content"]/a/@href'
            ' | //h4/a/@href'
            ' | //ul[@class="more_headlines"]//a/@href'
        )
        for href in response.xpath(link_xpath).extract():
            # Links whose href mentions either digest are not followed.
            if 'overnights' in href or 'morning-report' in href:
                continue
            article_url = response.urljoin(href)
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        # Text is taken from a fixed set of element paths below each
        # field-items child div; the union is assembled from the shared prefix.
        body_root = '//div[@class="field-items"]/div'
        text_parents = (
            'p', 'p/a', 'p/span', 'p/span/a', 'p/span/span/a',
            'div', 'div/a', 'div/span', 'div/span/a',
        )
        text_xpath = ' | '.join(
            '{}/{}/text()'.format(body_root, parent) for parent in text_parents
        )
        kept = []
        for text in response.xpath(text_xpath).extract():
            # Keep only nodes with paired curly quotes and an even number of
            # straight quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0:
                kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [8]:
class NRSpider(scrapy.Spider):
    """Spider that starts at the National Review home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'National Review'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/nr_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.nationalreview.com']

    def parse(self, response):
        """Yield a request for each on-site headline link outside the skipped sections."""
        skipped = ("photos", "videos", "magazine", "interview", "podcasts")
        for href in response.xpath('//h4/a/@href').extract():
            # Only hrefs that mention the site's own domain are considered.
            if "nationalreview.com" not in href:
                continue
            if any(fragment in href for fragment in skipped):
                continue
            yield scrapy.Request(href, callback=self.parse_article, meta={'article_id': href})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = []
        for text in response.xpath('//div[@class="article-content"]/p[text()]//text()').extract():
            curly_paired = text.count('\u201c') == text.count('\u201d')
            straight_paired = text.count('"') % 2 == 0
            if curly_paired and straight_paired:
                kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [9]:
class ABCSpider(scrapy.Spider):
    """Spider that starts at the ABC News home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    # This spider crawls abcnews.go.com, so it carries its own name rather
    # than the National Review label it previously duplicated.
    name = 'ABC News'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/abc_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://abcnews.go.com/']

    def parse(self, response):
        """Yield a request for each top-level headline link that passes the filters."""
        for href in response.xpath('//h1/a/@href').extract():
            # Skip hrefs containing "void" and the bare home-page link.
            if "void" in href or href == "https://abcnews.go.com":
                continue
            # Resolve against the page URL so relative hrefs are requestable.
            article_url = response.urljoin(href)
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = []
        for text in response.xpath('//div[@class="article-copy"]/p[text()]//text()').extract():
            # Three consecutive em dashes end collection; nothing after them
            # is kept (presumably a trailing separator - confirm against live pages).
            if '\u2014\u2014\u2014' in text:
                break
            # Keep only nodes with paired curly quotes and an even number of
            # straight quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0:
                kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [10]:
class PolcoSpider(scrapy.Spider):
    """Spider that starts at the Politico home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'Politico'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/polco_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.politico.com/']

    def parse(self, response):
        """Yield a request for each headline link that passes the filters."""
        # An href containing any of these fragments is not followed.
        excluded = ("gallery", "list", "nerdcast", "video", "news-tips", "permalink")
        for href in response.xpath('//h1/a/@href').extract():
            if any(fragment in href for fragment in excluded):
                continue
            if href == "https://www.politico.com/":
                continue
            article_url = response.urljoin(href)
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        text_nodes = response.xpath(
            '//article//p[text() and not(@class="byline" or @class="timestamp")]//text()'
        ).extract()
        kept = [
            text for text in text_nodes
            # Keep only nodes with paired curly quotes and an even number of
            # straight quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [11]:
class CBSSpider(scrapy.Spider):
    """Spider that starts at the CBS News home page and emits one item per linked article.

    Articles are found both directly on the start page and on the pages
    reached through its "More" links. Each item is a dict holding the article
    URL under ``'id'`` and the retained paragraph text under ``'article'``.
    """

    name = 'CBS News'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/cbs_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.cbsnews.com']

    def parse(self, response):
        """Yield article requests from the start page, then requests for its "More" pages."""
        # An href containing any of these fragments is not followed.
        excluded = ("pictures", "video", "interview", "transcript", "/live/")
        for href in response.xpath('//article/a/@href').extract():
            if any(fragment in href for fragment in excluded):
                continue
            article_url = response.urljoin(href)
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

        # Anchors wrapping a span whose text contains "More" lead to further
        # listing pages, which parse_2 handles.
        for href in response.xpath('//a[span[contains(text(), "More")]]/@href').extract():
            listing_url = response.urljoin(href)
            if listing_url == 'https://www.cbsnews.com':
                continue
            yield scrapy.Request(listing_url, callback=self.parse_2)

    def parse_2(self, response):
        """Yield article requests from a listing page reached through a "More" link."""
        listing_xpath = '//h3/a/@href | //li[h2]/a/@href | //a[div/h4]/@href | //a[h3]/@href'
        for href in response.xpath(listing_xpath).extract():
            if "/video/" not in href and "/pictures/" not in href:
                article_url = response.urljoin(href)
                yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = [
            text
            for text in response.xpath('//article//p[text() and not(@*)]//text()').extract()
            # Keep only nodes with paired curly quotes and an even number of
            # straight quotes.
            if text.count('\u201c') == text.count('\u201d') and text.count('"') % 2 == 0
        ]
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [12]:
class NBCSpider(scrapy.Spider):
    """Spider that starts at the NBC News home page and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'NBC News'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/nbc_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    start_urls = ['https://www.nbcnews.com']

    def _article_requests(self, response):
        """Yield an article request for each qualifying headline link on ``response``."""
        # An href containing any of these fragments is not followed.
        excluded = ("video/", "watch/", "youtu.be")
        for href in response.xpath('//h2[count(../*)=1]/a/@href | //h3/a/@href').extract():
            # Only hrefs that mention the site's own domain are considered.
            if "nbcnews.com" not in href:
                continue
            # hrefs with exactly three slashes are skipped along with the
            # excluded fragments.
            if any(fragment in href for fragment in excluded) or href.count("/") == 3:
                continue
            article_url = response.urljoin(href)
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse(self, response):
        """Yield article requests found on the start page."""
        for request in self._article_requests(response):
            yield request
        # Following "More" links into parse_2 is currently disabled:
        #for link in response.xpath('//a[contains(text(), "More")]/@href').extract():
            #if link.count("/") <= 3 and "sports" not in link:
                #link = response.urljoin(link)
                #yield scrapy.Request(link, callback=self.parse_2)

    def parse_2(self, response):
        """Yield article requests from a listing page, using the same rules as ``parse``."""
        for request in self._article_requests(response):
            yield request

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        text_nodes = response.xpath(
            '//article//p[text() and not(contains(concat(" ", @class, " "), "articleByline__bio"))]//text()'
        ).extract()
        kept = []
        for text in text_nodes:
            curly_paired = text.count('\u201c') == text.count('\u201d')
            straight_paired = text.count('"') % 2 == 0
            if curly_paired and straight_paired:
                kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [13]:
class AlJaSpider(scrapy.Spider):
    """Spider that walks several Al Jazeera section pages and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'Al Jazeera'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/alja_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    # One start URL per section path.
    start_urls = ['https://aljazeera.com/{}'.format(section) for section in
                  ['news', 'investigations', 'indepth/features', 'indepth/opinion']]

    def parse(self, response):
        """Yield a request for each article link that passes the URL filters."""
        # Every branch selects the href of the anchor itself. The `a[h4]`
        # branch previously used `//@href`, which also collected href
        # attributes of elements nested inside the anchor.
        link_xpath = (
            '//h1/a/@href'
            ' | //h2/a/@href'
            ' | //a[h2]/@href'
            ' | //a[h4]/@href'
            ' | //a[img]/@href'
            ' | //div[@class="aside-container"]//a/@href'
        )
        # A URL containing any of these fragments is not followed.
        excluded = ("podcasts", "remix", "index.html", "profile/",
                    "topics/", "inpictures", "programmes/")
        for href in response.xpath(link_xpath).extract():
            article_url = response.urljoin(href)
            # Only URLs that contain "html" are followed.
            if "html" not in article_url:
                continue
            if any(fragment in article_url for fragment in excluded):
                continue
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = []
        for text in response.xpath('//div[@class="article-p-wrapper"]/p//text()').extract():
            # Drop nodes containing either of these phrases.
            if 'editorial stance' in text or 'reports from' in text:
                continue
            # Drop nodes with unpaired curly quotes or an odd number of
            # straight quotes.
            if text.count('\u201c') != text.count('\u201d') or text.count('"') % 2 != 0:
                continue
            kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [14]:
class BuzzSpider(scrapy.Spider):
    """Spider that walks three BuzzFeed / BuzzFeed News listing pages and emits one item per linked article.

    Each item is a dict holding the article URL under ``'id'`` and the retained
    paragraph text under ``'article'``.
    """

    name = 'Buzzfeed News'
    # Per-day output path, handed to Scrapy through the FEED_URI setting.
    uri = 'data/buzz_{}.json'.format(date)
    custom_settings = {'FEED_URI': uri}

    # The tails expand to two buzzfeednews.com pages and one buzzfeed.com page.
    start_urls = ['https://www.buzzfeed{}'.format(tail)
                  for tail in ['news.com', 'news.com/section/world', '.com/investigations']]

    def parse(self, response):
        """Yield a request for every article link found on a listing page."""
        link_xpath = (
            '//article/div[1]/a/@href'
            ' | //div[@data-module="card-article" or @data-module="featured-package"]/a/@href'
            ' | //footer/div/a/@href'
        )
        for href in response.xpath(link_xpath).extract():
            article_url = response.urljoin(href)
            # The resolved URL doubles as the item's identifier.
            yield scrapy.Request(article_url, callback=self.parse_article, meta={'article_id': article_url})

    def parse_article(self, response):
        """Yield ``{'id': ..., 'article': ...}`` for a single article page."""
        kept = []
        for text in response.xpath('//p[text() and count(@*)=0]//text()').extract():
            # Drop nodes with unpaired curly quotes or an odd number of
            # straight quotes.
            unbalanced = (text.count('\u201c') != text.count('\u201d')
                          or text.count('"') % 2 != 0)
            # Drop the page's no-JavaScript notice as well. The original
            # condition left its opening parenthesis unclosed, so the class
            # could not be defined at all.
            if unbalanced or "doesn't support JavaScript" in text:
                continue
            kept.append(text)
        yield {
            'id': response.meta['article_id'],
            'article': ' '.join(kept).strip()
        }

In [15]:
# One crawler process with the settings applied to whichever spiders are
# scheduled below; each spider adds its own FEED_URI via custom_settings.
process = CrawlerProcess(settings=dict(
    FEED_FORMAT='json',
    LOG_ENABLED=False,
    AUTOTHROTTLE_ENABLED=True,
    ROBOTSTXT_OBEY=True,
    USER_AGENT='Vince Giorno (Thinkful Data Science Bootcamp)',
))

# Uncomment a line to schedule that spider for this run.
#process.crawl(APSpider)
#process.crawl(ReutSpider)
#process.crawl(CSMSpider)
#process.crawl(NPRSpider)
#process.crawl(HillSpider)
#process.crawl(NRSpider)
#process.crawl(ABCSpider)
#process.crawl(PolcoSpider)
#process.crawl(CBSSpider)
#process.crawl(NBCSpider)
#process.crawl(AlJaSpider)
process.crawl(BuzzSpider)

# Run the scheduled crawl(s); the message is printed once start() returns.
process.start()
print('Success!')

Success!
