# Inheriting the Spider

A class is roughly a collection of related variables and functions housed together. Sometimes one class likes to use methods from another class, and so we will inherit methods from a different class. That's what we do in the spider class.

In [1]:
# # Import scrapy library
# import scrapy

# # Create the spider class
# class YourSpider(scrapy.Spider):
#   name = "your_spider"
#   # start_requests method
#   def start_requests(self):
#     pass
#   # parse method
#   def parse(self, response):
#     pass
  
# # Inspect Your Class
# inspect_class(YourSpider)

# Hurl the URLs

We've written a function inspect_class which will print out the list of elements you have in the urls variable within the start_requests method.

In [2]:
# # Import scrapy library
# import scrapy

# # Create the spider class
# class YourSpider( scrapy.Spider ):
#   name = "your_spider"
#   # start_requests method
#   def start_requests( self ):
#     urls = ["https://www.datacamp.com" ,"https://scrapy.org"]
#     for url in urls:
#       yield url
#   # parse method
#   def parse( self, response ):
#     pass
  
# # Inspect Your Class
# inspect_class( YourSpider )

# Self Referencing is Classy

Within the spider class, we always pass the argument self to the start_requests and parse methods (just look at the sample code in this exercise!). This allows us to refer to one method from another within the class. That is, if we want to refer to the method parse within the start_requests method, we need to write self.parse rather than just parse; writing self tells the code: "Look in the same class as start_requests for a method called parse to use."

In [3]:
# # Import scrapy library
# import scrapy

# # Create the spider class
# class YourSpider( scrapy.Spider ):
#   name = "your_spider"
#   # start_requests method
#   def start_requests( self ):
#     self.print_msg( "Hello World!" )
#   # parse method
#   def parse( self, response ):
#     pass
#   # print_msg method
#   def print_msg( self, msg ):
#     print( "Calling start_requests in YourSpider prints out:", msg )
  
# # Inspect Your Class
# inspect_class( YourSpider )

# Starting with Start Requests

We have another toy-model spider which doesn't actually scrape anything, but gives you a chance to play with the start_requests method so that you can start becoming familiar with the arguments to pass into the scrapy.Request call within start_requests.

In [4]:
# # Import scrapy library
# import scrapy

# # Create the spider class
# class YourSpider( scrapy.Spider ):
#   name = "your_spider"
#   # start_requests method
#   def start_requests( self ):
#     yield scrapy.Request( url = "https://www.datacamp.com", callback = self.parse )
#   # parse method
#   def parse( self, response ):
#     pass
  
# # Inspect Your Class
# inspect_class( YourSpider )

# Pen Names

Your job will be to create the list of extracted author names in the parse method of the spider.

Two things you should know:

- You will be using the response object and the css method here.
- The course author names are defined by the text within the paragraph p elements belonging to the class course-block__author-name.

In [5]:
# # Import the scrapy library
# import scrapy

# # Create the Spider class
# class DCspider( scrapy.Spider ):
#   name = 'dcspider'
#   # start_requests method
#   def start_requests( self ):
#     yield scrapy.Request( url = url_short, callback = self.parse )
#   # parse method
#   def parse( self, url ):
#     # Create an extracted list of course author names
#     author_names = url.css('p.course-block__author-name::text').extract()
#     # Here we will just return the list of Authors
#     return author_names
  
# # Inspect the spider
# inspect_spider( DCspider )

# Crawler Time

This is your first chance to play with a spider which will crawl between sites (by first collecting links from one site, and following those links to parse new sites). This spider starts at the shortened DataCamp course directory, then extracts the links of the courses in the parse method; from there, it will follow those links to extract the course descriptions from each course page in the parse_descr method, and put these descriptions into the list course_descrs. Your job is to complete the code so that the spider runs as desired!

In [6]:
# # Import the scrapy library
# import scrapy

# # Create the Spider class
# class DCdescr( scrapy.Spider ):
#   name = 'dcdescr'
#   # start_requests method
#   def start_requests( self ):
#     yield scrapy.Request( url = url_short, callback = self.parse )
  
#   # First parse method
#   def parse( self, response ):
#     links = response.css( 'div.course-block > a::attr(href)' ).extract()
#     # Follow each of the extracted links
#     for link in links:
#       yield response.follow( url = link, callback = self.parse_descr )
      
#   # Second parsing method
#   def parse_descr( self, response ):
#     # Extract course description
#     course_descr = response.css( 'p.course__description::text' ).extract_first()
#     # For now, just yield the course description
#     yield course_descr


# # Inspect the spider
# inspect_spider( DCdescr )

# Time to Run

We will go through creating an entire web-crawler to access course information from each course in the DataCamp course directory.

In [None]:
# # Import scrapy
# import scrapy

# # Import the CrawlerProcess: for running the spider
# from scrapy.crawler import CrawlerProcess

# # Create the Spider class
# class DC_Chapter_Spider(scrapy.Spider):
#   name = "dc_chapter_spider"
#   # start_requests method
#   def start_requests(self):
#     yield scrapy.Request(url = url_short,
#                          callback = self.parse_front)
#   # First parsing method
#   def parse_front(self, response):
#     course_blocks = response.css('div.course-block')
#     course_links = course_blocks.xpath('./a/@href')
#     links_to_follow = course_links.extract()
#     for url in links_to_follow:
#       yield response.follow(url = url,
#                             callback = self.parse_pages)
#   # Second parsing method
#   def parse_pages(self, response):
#     crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
#     crs_title_ext = crs_title.extract_first().strip()
#     ch_titles = response.css('h4.chapter__title::text')
#     ch_titles_ext = [t.strip() for t in ch_titles.extract()]
#     dc_dict[ crs_title_ext ] = ch_titles_ext

# # Initialize the dictionary **outside** of the Spider class
# dc_dict = dict()

# # Run the Spider
# process = CrawlerProcess()
# process.crawl(DC_Chapter_Spider)
# process.start()

# # Print a preview of courses
# previewCourses(dc_dict)

# DataCamp Descriptions

You are asked to create a CSS Locator string that leads directly to the text of the course description. All you need to know is that from the course page, the course description text is within a paragraph p element which belongs to the class course__description (two underscores).

In [7]:
# # Import scrapy
# import scrapy

# # Import the CrawlerProcess: for running the spider
# from scrapy.crawler import CrawlerProcess

# # Create the Spider class
# class DC_Description_Spider(scrapy.Spider):
#   name = "dc_chapter_spider"
#   # start_requests method
#   def start_requests(self):
#     yield scrapy.Request(url = url_short,
#                          callback = self.parse_front)
#   # First parsing method
#   def parse_front(self, response):
#     course_blocks = response.css('div.course-block')
#     course_links = course_blocks.xpath('./a/@href')
#     links_to_follow = course_links.extract()
#     for url in links_to_follow:
#       yield response.follow(url = url,
#                             callback = self.parse_pages)
#   # Second parsing method
#   def parse_pages(self, response):
#     # Create a SelectorList of the course titles text
#     crs_title = response.xpath('//h1[contains(@class,"title")]/text()')
#     # Extract the text and strip it clean
#     crs_title_ext = crs_title.extract_first().strip()
#     # Create a SelectorList of course descriptions text
#     crs_descr = response.css( 'p.course__description::text' )
#     # Extract the text and strip it clean
#     crs_descr_ext = crs_descr.extract_first().strip()
#     # Fill in the dictionary
#     dc_dict[crs_title_ext] = crs_descr_ext

# # Initialize the dictionary **outside** of the Spider class
# dc_dict = dict()

# # Run the Spider
# process = CrawlerProcess()
# process.crawl(DC_Description_Spider)
# process.start()

# # Print a preview of courses
# previewCourses(dc_dict)

# Capstone Crawler

You will write the parse function for a spider and then fill in a few blanks to finish off the spider. Everything you need to know is:

- The course titles are defined by the text within an h4 element whose class contains the string block__title (double underscore).
- The short course descriptions are defined by the text within a paragraph p element whose class contains the string block__description (double underscore).

In [8]:
# # parse method
# def parse(self, response):
#   # Extracted course titles
#   crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
#   # Extracted course descriptions
#   crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
#   # Fill in the dictionary
#   for crs_title, crs_descr in zip(crs_titles, crs_descrs):
#     dc_dict[crs_title] = crs_descr

In [9]:
# # Import scrapy
# import scrapy

# # Import the CrawlerProcess
# from scrapy.crawler import CrawlerProcess

# # Create the Spider class
# class YourSpider(scrapy.Spider):
#   name = 'yourspider'
#   # start_requests method
#   def start_requests( self ):
#     yield scrapy.Request(url = url_short, callback = self.parse)
      
#   def parse(self, response):
#     # My version of the parser you wrote in the previous part
#     crs_titles = response.xpath('//h4[contains(@class,"block__title")]/text()').extract()
#     crs_descrs = response.xpath('//p[contains(@class,"block__description")]/text()').extract()
#     for crs_title, crs_descr in zip( crs_titles, crs_descrs ):
#       dc_dict[crs_title] = crs_descr
    
# # Initialize the dictionary **outside** of the Spider class
# dc_dict = dict()

# # Run the Spider
# process = CrawlerProcess()
# process.crawl(YourSpider)
# process.start()

# # Print a preview of courses
# previewCourses(dc_dict)