In [None]:
import requests as rq  # WS mod
from bs4 import BeautifulSoup
import re

In [None]:
def getPage(url):
    """
    Utility function used to get a Beautiful Soup object from a given URL.

    The request is sent with browser-like headers (a Chrome-on-Mac-OS-X
    ``User-Agent`` and an HTML-first ``Accept`` header) and the response
    text is parsed with the 'html.parser' parser.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup object built from the response text, or None when
        the request raises ``rq.exceptions.RequestException``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    # Use the session as a context manager so it is closed after the request.
    with rq.Session() as session:
        try:
            req = session.get(url, headers=headers)
        except rq.exceptions.RequestException:
            # requests is imported under the alias `rq`; the bare name
            # `requests` is not bound in this notebook.
            return None
        return BeautifulSoup(req.text, 'html.parser')

## Dealing with different website layouts

In [None]:
class Content:
    """Holds what was scraped from one page: its URL, title and body text."""

    def __init__(self, url, title, body):
        # All three fields are stored unchanged, in one unpacking assignment.
        self.url, self.title, self.body = url, title, body

def getPage(url):
    """Download ``url`` and return its text parsed as a BeautifulSoup object."""
    html = rq.get(url).text
    return BeautifulSoup(html, 'html.parser')

def scrapeNYTimes(url):
    """
    Scrape a New York Times article page into a Content object.

    The title is the text of the first <h1>. The body is the text of every
    element matching 'div.StoryBodyCompanionColumn div p', joined with
    newlines.
    """
    page = getPage(url)
    headline = page.find('h1').text
    paragraphs = page.select('div.StoryBodyCompanionColumn div p')
    article_text = '\n'.join(paragraph.text for paragraph in paragraphs)
    return Content(url, headline, article_text)

def scrapeBrookings(url):
    """
    Scrape a Brookings article page into a Content object.

    The title is the text of the first <h1>; the body is the text of the
    first <div> whose class is 'post-body'.

    Args:
        url: Address of the Brookings article.

    Returns:
        Content holding the URL, title and body text.
    """
    bs    = getPage(url)
    title = bs.find('h1').text
    # The attribute filter must be a dict mapping attribute name to value.
    # The previous set literal {'class', 'post-body'} was not an attribute
    # filter at all; it only happened to locate the element.
    body  = bs.find('div', {'class': 'post-body'}).text
    return Content(url, title, body)

In [None]:
# Scrape one Brookings blog post and print the extracted title, URL and body.
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

In [None]:
# Scrape one New York Times opinion piece and print the extracted fields.
# WS: this article is behind a paywall.
# NOTE(review): presumably the paywalled page lacks the expected elements, so
# the scrape may fail or return an empty body; verify before relying on it.
url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

In [None]:
class Content:
    """
    Scraped article/page: keeps the URL, the title and the body text.
    """
    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """
        Write the URL, title and body to standard output, one labelled
        entry after another.
        """
        labelled = (
            ('URL: {}', self.url),
            ('TITLE: {}', self.title),
            ('BODY:\n{}', self.body),
        )
        for template, value in labelled:
            print(template.format(value))

class Website:
    """
    Describes how to scrape one website: its name, its URL and the
    selectors used for the title and the body.
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name, self.url = name, url
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [None]:
class Crawler:
    """Downloads pages and prints the title/body found with a site's selectors."""

    def getPage(self, url):
        """
        Fetch ``url`` and return it as a BeautifulSoup object, or None when
        the request raises a RequestException.
        """
        try:
            response = rq.get(url)
        except rq.exceptions.RequestException:
            return None
        else:
            return BeautifulSoup(response.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Return the text of every element of ``pageObj`` matching
        ``selector``, joined with newlines. When nothing matches, the
        result is an empty string.
        """
        matches = pageObj.select(selector)
        if not matches:
            return ''
        return '\n'.join(match.get_text() for match in matches)

    def parse(self, site, url):
        """
        Download ``url`` and print the content extracted with the title and
        body selectors of ``site``; print a short notice when the page or
        either field is missing.
        """
        page = self.getPage(url)
        if page is None:
            print('url not found')  # WS
            return
        title = self.safeGet(page, site.titleTag)
        body = self.safeGet(page, site.bodyTag)
        if title == '' or body == '':
            print('title and body not found')  # WS
            return
        Content(url, title, body).print()

In [None]:
# A single crawler instance is reused by the parsing cells.
crawler = Crawler()

# One row per site: name, home URL, title selector, body selector.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]
# WS: each row is unpacked positionally into Website(name, url, titleTag, bodyTag).
websites = [Website(*row) for row in siteData]

In [None]:
# Parse an O'Reilly product page with the O'Reilly selectors (websites[0]).
# NOTE(review): marked as failing; the cause is not visible here. Confirm
# whether the URL or the selectors are out of date.
crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')  # this fails

In [None]:
# Parse a Reuters article with the Reuters selectors (websites[1]).
# NOTE(review): marked as failing; the cause is not visible here. Confirm
# whether the URL or the selectors are out of date.
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')  # this fails

In [None]:
# Parse a Brookings blog post with the Brookings selectors (websites[2]).
# This one works.
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')

In [None]:
# Parse a New York Times article with the NYT selectors (websites[3]).
# This one fails.
# NOTE(review): the cause is not visible here. Confirm whether the URL or
# the selectors are out of date.
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')

## Crawling through sites with search

In [None]:
class Content:
    """Article/page found while searching: topic, URL, title and body text."""
    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """
        Write the topic, URL and title to standard output, followed by a
        notice that body printing is switched off.
        """
        header = (
            ('New article found for topic: {}', self.topic),
            ('URL: {}', self.url),
            ('TITLE: {}', self.title),
        )
        for template, value in header:
            print(template.format(value))
        # WS: the body is deliberately not printed while testing.
        print('WS turned off printing body for testing\n')
        # print('BODY:\n{}'.format(self.body))

In [None]:
class Website:
    """Describes a searchable website: where to search and which selectors to use."""
    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        # Identity of the site.
        self.name, self.url = name, url
        # Search settings: URL prefix for a query, selector for each result
        # and selector for the link inside a result.
        self.searchUrl, self.resultListing, self.resultUrl = searchUrl, resultListing, resultUrl
        # Whether result links are used as-is (True) or prefixed with the site URL.
        self.absoluteUrl = absoluteUrl
        # Selectors for the title and body of an article page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [None]:
class Crawler:
    """Searches websites for a topic and prints every result page it can parse."""

    def getPage(self, url):
        """
        Download ``url`` and parse it.

        Returns:
            A BeautifulSoup object, or None when the request raises
            ``rq.exceptions.RequestException``.
        """
        try:
            req = rq.get(url)
        except rq.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Return the text of the first element of ``pageObj`` matching
        ``selector``, or an empty string when nothing matches.
        """
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()
        return ''

    def search(self, topic, site):
        """
        Searches a given website for a given topic and prints all pages found.

        Args:
            topic: Search term appended to ``site.searchUrl``.
            site: Object providing ``searchUrl``, ``resultListing``,
                ``resultUrl``, ``absoluteUrl``, ``url``, ``titleTag`` and
                ``bodyTag``.

        A result that has no usable link, or whose page cannot be
        downloaded, is reported and skipped; the remaining results are
        still processed.
        """
        searchPage = self.getPage(site.searchUrl + topic)
        if searchPage is None:
            # getPage returns None on a failed request; without this guard
            # the select() call below raises AttributeError.
            print('Something was wrong with the search page. Skipping!')
            return
        for result in searchPage.select(site.resultListing):
            links = result.select(site.resultUrl)
            url = links[0].attrs.get('href') if links else None
            if not url:
                print('something wrong with finding URL')  # WS
                continue
            # Check to see whether it's a relative or an absolute URL
            pageUrl = url if site.absoluteUrl else site.url + url
            bs = self.getPage(pageUrl)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                # Skip only this result instead of abandoning the whole search.
                continue
            title = self.safeGet(bs, site.titleTag)
            body  = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, url, title, body)  # WS fixed order
                content.print()
            else:
                print('title and body not found')  # WS

In [None]:
# A single crawler instance is reused for every search.
crawler = Crawler()

# One row per site: name, home URL, search URL prefix, result selector,
# link selector inside a result, absolute-URL flag, title selector, body selector.
siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
# WS: each row is unpacked positionally into the Website constructor.
sites = [Website(*row) for row in siteData]

In [None]:
# Search every configured site for every topic; crawler.search prints
# whatever it finds for each (topic, site) pair.
topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        # WS: progress line showing which site is being searched
        print('checking site {}.'.format(targetSite.name)) # WS
        crawler.search(topic, targetSite)

## Crawling sites through links

In [None]:
class Website:
    """Describes a site crawled through its links: which links to follow and which selectors to use."""

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        # Identity of the site.
        self.name, self.url = name, url
        # Regular expression that link hrefs must match, and whether those
        # hrefs are already absolute.
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        # Selectors for the title and body of a target page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

class Content:
    """Scraped page: URL, title and body text."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL and title, then a notice that body printing is off."""
        for template, value in (('URL: {}', self.url), ('TITLE: {}', self.title)):
            print(template.format(value))
        # WS: the body is deliberately not printed while testing.
        print('WS turned off printing body for testing\n')
        #print('BODY:\n{}'.format(self.body))

In [None]:
class Crawler:
    """Crawls one website by following the home-page links that match its target pattern."""

    def __init__(self, site):
        """
        Args:
            site: Object providing ``url``, ``targetPattern``,
                ``absoluteUrl``, ``titleTag`` and ``bodyTag``.
        """
        self.site    = site
        # hrefs already handled, in the order they were first seen
        self.visited = []

    def getPage(self, url):
        """
        Download ``url`` and parse it.

        Returns:
            A BeautifulSoup object, or None when the request raises
            ``rq.exceptions.RequestException``.
        """
        try:
            req = rq.get(url)
        except rq.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Return the text of every element of ``pageObj`` matching
        ``selector``, joined with newlines, or an empty string when
        nothing matches.
        """
        selectedElems = pageObj.select(selector)
        if selectedElems:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        """
        Download ``url`` and print its content when both a title and a body
        are found with the site's selectors; otherwise do nothing.
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page.

        Every <a> whose href matches ``site.targetPattern`` is parsed once;
        hrefs are prefixed with ``site.url`` unless ``site.absoluteUrl`` is
        true. If the home page cannot be downloaded, a message is printed
        and nothing is crawled.
        """
        bs = self.getPage(self.site.url)
        if bs is None:
            # getPage returns None on a failed request; without this guard
            # the link lookup below raises AttributeError.
            print('Could not load home page: {}'.format(self.site.url))
            return
        # find_all is the current name of the deprecated findAll alias.
        targetPages = bs.find_all('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)

In [None]:
# Crawl the Reuters home page: follow links whose href starts with /article/
# (relative hrefs, hence absoluteUrl=False) and print each parsed article.
reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',
                  False, 'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()  # this produces nothing: try Brookings, or study a reuters page to fix
# TODO (noted 10/27/22): work stopped here; pick up from this cell.

## Crawling multiple page types

In [None]:
class Website:
    """
    Common base class for website descriptions.

    Args:
        name: Name of the website.
        url: URL of the website.
        titleTag: Selector for the page title.
        bodyTag: Selector for the page body. Optional (defaults to None)
            so that subclasses without a body selector can call this
            initializer with only the first three values.
    """

    def __init__(self, name, url, titleTag, bodyTag=None):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [None]:
class Product(Website):
    """
    Contains information for scraping a product page.

    Args:
        name: Name of the website.
        url: URL of the website.
        titleTag: Selector for the product title.
        productNumber: Stored as ``productNumberTag``; presumably the
            selector for the product number (TODO confirm).
        price: Stored as ``priceTag``; presumably the selector for the
            price (TODO confirm).
    """

    def __init__(self, name, url, titleTag, productNumber, price):
        # A product description carries no body selector, so None is passed
        # explicitly for the base class's bodyTag.
        Website.__init__(self, name, url, titleTag, None)
        # The parameters are named productNumber/price; the previous body
        # read the undefined names TitleTag, productNumberTag and priceTag.
        self.productNumberTag = productNumber
        self.priceTag = price

class Article(Website):
    """
    Contains information for scraping an article page.

    Args:
        name: Name of the website.
        url: URL of the website.
        titleTag: Selector for the article title.
        bodyTag: Selector for the article body.
        dateTag: Stored as ``dateTag``; presumably the selector for the
            publication date (TODO confirm).
    """

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        # The base initializer takes bodyTag as well; passing it here
        # avoids the missing-argument TypeError of the three-value call.
        Website.__init__(self, name, url, titleTag, bodyTag)
        self.dateTag = dateTag

In [None]:

def parsePage(url):
    
    if '/ideas/' in url:
        

# O'Reilly site description: title selector 'h1' and an empty body selector.
# The comma between 'h1' and '' is required; without it the two adjacent
# string literals are concatenated into one argument.
oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1', '')