<a href="https://colab.research.google.com/github/tuananh1006/Web-Scraping-Book/blob/main/CourseraCrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from bs4 import BeautifulSoup
from urllib.request import urlopen


In [9]:
class Website:
    """Selector and URL settings that describe one searchable website.

    Every constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(
        self,
        name,
        url,
        searchUrl,
        resultListing,
        resultUrl,
        absoluteUrl,
        titleTag,
        bodyTag,
    ):
        # Site identity and root address.
        self.name, self.url = name, url
        # Search endpoint plus the selectors for the result list and the
        # link inside each result.
        self.searchUrl, self.resultListing, self.resultUrl = (
            searchUrl,
            resultListing,
            resultUrl,
        )
        # Whether result links are already absolute URLs.
        self.absoluteUrl = absoluteUrl
        # Selectors for the title and body of a content page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

In [10]:
class Content:
    """Scraped data for a single article/page found for a topic."""

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self):
        """
        Write the topic, URL, title and body to standard output,
        one labelled field after another
        """
        labelled_fields = (
            ('New article found for topic: {}', self.topic),
            ('URL: {}', self.url),
            ('TITLE: {}', self.title),
            ('BODY:\n{}', self.body),
        )
        for template, value in labelled_fields:
            print(template.format(value))

In [11]:
class Crawler:
    """Searches one website for a topic and scrapes every result page.

    The crawler is configured by a site description object that provides
    ``url``, ``searchUrl``, ``resultListing``, ``resultUrl``,
    ``absoluteUrl``, ``titleTag`` and ``bodyTag``.
    """

    def __init__(self, website):
        self.site = website
        # Maps result URL -> Content so each page is fetched only once.
        self.found = {}

    @staticmethod
    def getPage(url):
        """
        Download a page and parse it with BeautifulSoup's 'html.parser'.

        Returns the parsed document, or None (after printing the error)
        when the page cannot be retrieved.
        """
        try:
            # The context manager closes the HTTP response after reading.
            with urlopen(url) as response:
                html = response.read()
        except Exception as e:
            # Deliberately broad: a single unreachable or malformed URL
            # should not abort the whole crawl.
            print(e)
            return None
        return BeautifulSoup(html, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Utility function used to get a content string from a Beautiful Soup
        object and a selector. Returns an empty string if no object
        is found for the given selector; otherwise the text of all matches
        joined by newlines.
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, topic, url):
        """
        Extract content from a given page URL.

        Returns a Content whose title and body are empty strings when the
        page could not be fetched.
        """
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(topic, url, '', '')
        title = Crawler.safeGet(bs, self.site.titleTag)
        body = Crawler.safeGet(bs, self.site.bodyTag)
        return Content(topic, url, title, body)

    def search(self, topic):
        """
        Searches a given website for a given topic and records all pages found.

        Every result is printed, including results already present in
        ``self.found``. Nothing is recorded when the search page itself
        cannot be fetched.
        """
        bs = Crawler.getPage(self.site.searchUrl + topic)
        if bs is None:
            return
        for result in bs.select(self.site.resultListing):
            links = result.select(self.site.resultUrl)
            # Skip result entries without a usable link instead of crashing.
            if not links or not links[0].attrs.get('href'):
                continue
            url = links[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            url = url if self.site.absoluteUrl else self.site.url + url
            if url not in self.found:
                self.found[url] = self.getContent(topic, url)
            self.found[url].print()
            print('*'*20)

In [12]:
# One row per site, in the order expected by Website.__init__:
# name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag.
# NOTE(review): the CSS class names below look auto-generated and may change
# whenever the site's markup is rebuilt — confirm before relying on them.
siteData = [
    ['Coursera', 'http://coursera.org',
     'https://www.coursera.org/search?query=',
     'div.css-1n87rlq', 'div.cds-ProductCard-content a',
     False, 'h1.cds-119.cds-Typography-base.css-1xy8ceb.cds-121', 'div.content-inner p']]


In [13]:
# Turn every configuration row into a Website description.
sites = []
for name, url, search, rListing, rUrl, absUrl, tt, bt in siteData:
    sites += [Website(name, url, search, rListing, rUrl, absUrl, tt, bt)]

# One crawler per configured site.
crawlers = list(map(Crawler, sites))
topics = ['communication']

# Run every topic against every crawler, topic by topic.
for topic in topics:
    for crawler in crawlers:
        crawler.search(topic)

New article found for topic: communication
URL: http://coursera.org/learn/communication
TITLE: Introduction to Communication Science
BODY:
Since Antiquity, scholars have appreciated the importance of communication: as social beings, we cannot exist without communication. We need to interact with people around us, to make sense of the world and to position ourselves in a wider social and cultural reality. In this course, we look at how and why communication evolved as a science and reflect on today’s dominant paradigms. The course also extends beyond the boundaries of communication science itself, exploring dimensions of history, sociology and psychology. Join our class, together with people all over the world.
Introduction to Communication Science explores some of the basic theories, models and concepts from the fields of mass, interpersonal and intrapersonal communication. The course begins with a consideration of several basic models, subsequently progressing to the history of commun

In [14]:
# Display the first crawler's `found` dict (result URL -> Content object).
crawlers[0].found

{'http://coursera.org/learn/communication': <__main__.Content at 0x7e4fc20ff880>,
 'http://coursera.org/learn/active-listening-enhancing-communication-skills': <__main__.Content at 0x7e4fc20ffca0>,
 'http://coursera.org/specializations/effective-business-communication': <__main__.Content at 0x7e4fc17eec20>,
 'http://coursera.org/learn/wharton-communication-skills': <__main__.Content at 0x7e4fc1ee14b0>,
 'http://coursera.org/specializations/improve-english': <__main__.Content at 0x7e4fc1af0ac0>,
 'http://coursera.org/learn/communication-in-the-workplace': <__main__.Content at 0x7e4fc2742e90>,
 'http://coursera.org/specializations/corporatecommunications': <__main__.Content at 0x7e4fc1a80f40>,
 'http://coursera.org/learn/teamwork-skills-effective-communication': <__main__.Content at 0x7e4fc187e710>,
 'http://coursera.org/specializations/business-english': <__main__.Content at 0x7e4fc13cd600>,
 'http://coursera.org/specializations/communicationforeveryone': <__main__.Content at 0x7e4fc19d

In [17]:
class Website:
    """Settings for crawling the links found on a site's home page.

    Every constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        # Site identity and the page the crawl starts from.
        self.name, self.url = name, url
        # Pattern selecting which links to follow, and whether those links
        # are already absolute URLs.
        self.targetPattern, self.absoluteUrl = targetPattern, absoluteUrl
        # Selectors for the title and body of a target page.
        self.titleTag, self.bodyTag = titleTag, bodyTag

class Content:
    """Scraped URL, title and body of a single page."""

    def __init__(self, url, title, body):
        self.url, self.title, self.body = url, title, body

    def print(self):
        """Write the URL, title and body to standard output, one labelled field after another."""
        output_lines = (
            f'URL: {self.url}',
            f'TITLE: {self.title}',
            f'BODY:\n{self.body}',
        )
        for line in output_lines:
            print(line)

In [33]:
import re
class Crawler:
    """Crawls the links on a site's home page that match a target pattern.

    The crawler is configured by a site description object that provides
    ``url``, ``targetPattern``, ``absoluteUrl``, ``titleTag`` and
    ``bodyTag``.
    """

    def __init__(self, site):
        self.site = site
        # Maps page URL -> Content so each page is fetched only once.
        self.visited = {}

    @staticmethod
    def getPage(url):
        """
        Download a page and parse it with BeautifulSoup's 'html.parser'.

        Returns the parsed document, or None (after printing the error)
        when the page cannot be retrieved.
        """
        try:
            # The context manager closes the HTTP response after reading.
            with urlopen(url) as response:
                html = response.read()
        except Exception as e:
            # Deliberately broad: a single unreachable or malformed URL
            # should not abort the whole crawl.
            print(e)
            return None
        return BeautifulSoup(html, 'html.parser')

    @staticmethod
    def safeGet(bs, selector):
        """
        Return the text of every element matching the selector, joined by
        newlines, or an empty string when nothing matches.
        """
        selectedElems = bs.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def getContent(self, url):
        """
        Extract content from a given page URL.

        Returns a Content whose title and body are empty strings when the
        page could not be fetched.
        """
        bs = Crawler.getPage(url)
        if bs is None:
            return Content(url, '', '')
        title = Crawler.safeGet(bs, self.site.titleTag)
        body = Crawler.safeGet(bs, self.site.bodyTag)
        return Content(url, title, body)

    def crawl(self):
        """
        Get pages from website home page.

        Each link whose href matches the site's target pattern is fetched
        once, stored in ``self.visited`` and printed. Nothing happens when
        the home page itself cannot be fetched.
        """
        bs = Crawler.getPage(self.site.url)
        if bs is None:
            return
        targetPages = bs.find_all('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            url = targetPage.attrs['href']
            # Relative links are prefixed with the site's root URL.
            url = url if self.site.absoluteUrl else f'{self.site.url}{url}'
            if url not in self.visited:
                self.visited[url] = self.getContent(url)
                self.visited[url].print()

In [35]:
# Crawl every home-page link whose href contains "/browse/".
# The pattern is a raw string so that "\/" reaches the regex engine as
# written instead of being an invalid string escape sequence.
# NOTE(review): the name `brookings` does not match the site configured
# here (Coursera); it is kept so any later references keep working.
brookings = Website(
    'Coursera', 'https://coursera.org', r'\/(browse)\/',
     False, 'h1', 'div.product-offerings-wrapper')
crawler = Crawler(brookings)
crawler.crawl()

URL: https://coursera.org/browse/data-science
TITLE: Data Science
BODY:
Earn Your DegreeBlack left arrow icon indicating previous slideUniversity of PittsburghMaster of Data ScienceEarn a degreeDegreeUniversity of MichiganMaster of Applied Data ScienceEarn a degreeDegreeUniversity of Illinois Urbana-ChampaignMaster of Computer Science in Data ScienceEarn a degreeDegreeUniversity of Colorado BoulderMaster of Science in Data ScienceEarn a degreeDegreeNortheastern University Master of Science in Data Analytics EngineeringEarn a degreeDegreeIllinois TechMaster of Data ScienceEarn a degreeDegreeBall State UniversityMaster of Science in Data ScienceEarn a degreeDegreeO.P. Jindal Global UniversityMBA in Business AnalyticsEarn a degreeDegreePontificia Universidad Católica de ChileMagíster en Ciencia de DatosEarn a degreeDegreeUniversity of LeedsMSc Data Science (Statistics)Earn a degreeDegreeIndian Institute of Technology GuwahatiBachelor of Science in Data Science & AIEarn a degreeDegreeImper

In [36]:
class Website:
    """Base description of a site: its name, URL, selectors and page type.

    Every constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(self, name, url, titleTag, bodyTag, pageType):
        self.name, self.url = name, url
        self.titleTag, self.bodyTag = titleTag, bodyTag
        self.pageType = pageType

In [37]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumberTag, priceTag):
        # Website.__init__ also requires bodyTag and pageType; omitting them
        # raised TypeError on every instantiation. A product page has no
        # body selector, so bodyTag is None.
        # NOTE(review): 'product' is an assumed pageType label — confirm
        # against whatever code consumes Website.pageType.
        super().__init__(name, url, titleTag, bodyTag=None, pageType='product')
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag

class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        # Website.__init__ also requires bodyTag and pageType; omitting them
        # raised TypeError on every instantiation. The base class stores
        # bodyTag, so it is passed through rather than assigned here.
        # NOTE(review): 'article' is an assumed pageType label — confirm
        # against whatever code consumes Website.pageType.
        super().__init__(name, url, titleTag, bodyTag=bodyTag, pageType='article')
        self.dateTag = dateTag