In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

class RentsSpider(scrapy.Spider):
    """Scrape student-housing listings from apartments.com, one city at a time.

    One request is issued per URL in ``start_urls``. Every results page yields
    one item per listing card (location, price, beds) and then follows the
    "next" link, until ``max_items_per_city`` listings have been emitted for
    that city. Items are exported to ``rents.csv`` via the ``FEEDS`` setting.

    The city a response belongs to travels with the request (``cb_kwargs``)
    instead of living in shared spider state, because Scrapy schedules
    requests asynchronously and responses from different cities can interleave.
    """

    # Give the spider a name
    name = 'rents'

    # Search-result pages the crawl starts from (one per city).
    start_urls = [
        'https://www.apartments.com/austin-tx/student-housing/',
        'https://www.apartments.com/saint-louis-mo/student-housing/',
        # Disabled cities; uncomment to include them in the crawl.
        # 'https://www.apartments.com/minneapolis-mn/student-housing/',
        # 'https://www.apartments.com/tampa-fl/student-housing/',
        # 'https://www.apartments.com/new-york-ny/student-housing/',
    ]

    # Export every yielded item to rents.csv, replacing any previous file.
    custom_settings = {
        'FEEDS': {
            'rents.csv': {
                'format': 'csv',
                'overwrite': True
            }
        }
    }

    # Known city slugs; a URL is labelled with the slug found in its path.
    locations = ["austin-tx",
                 "saint-louis-mo",
                 "minneapolis-mn",
                 "tampa-fl",
                 "new-york-ny"]

    # Upper bound on listings emitted per city.
    # NOTE(review): the original compared a single counter against 100 and
    # never reset it; a per-city cap is assumed to be the intent.
    max_items_per_city = 100

    # Legacy attributes, kept so existing references do not break. They no
    # longer drive the crawl.
    city_number = 0
    pages = 0

    # Running total of listings emitted across all cities. It used to start
    # at 100, which made the first `count > 100` check abort every page.
    count = 0

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-city bookkeeping."""
        super().__init__(*args, **kwargs)
        # location slug -> number of listings already emitted for that city
        self._items_per_city = {}

    def _location_for(self, url):
        """Return the city slug for ``url``.

        Prefers an entry of ``locations`` that appears as a path segment of
        the URL; otherwise falls back to the first path segment, and finally
        to the URL itself if it has no path.
        """
        for location in self.locations:
            if f"/{location}/" in url:
                return location
        # Local import: stdlib only, keeps this class usable without touching
        # the notebook's import cell.
        from urllib.parse import urlparse
        first_segment = urlparse(url).path.strip("/").split("/")[0]
        return first_segment or url

    def start_requests(self):
        """Yield the first request for every city in ``start_urls``.

        Each request carries its city slug in ``cb_kwargs`` so ``parse`` can
        label items without relying on shared state.
        """
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={"location": self._location_for(url)},
            )

    def parse(self, response, location=None):
        """Extract listings from one results page and follow pagination.

        For each ``section.placard-content`` card, yields a dict with the
        city slug, the text of ``p.property-pricing`` and the text of
        ``p.property-beds`` (either may be ``None`` when the element is
        missing). Stops emitting, and stops paginating, once
        ``max_items_per_city`` listings have been produced for the city.

        Args:
            response: the downloaded results page.
            location: city slug for this page; derived from ``response.url``
                when omitted (e.g. if the callback is invoked without
                ``cb_kwargs``).
        """
        if location is None:
            location = self._location_for(response.url)

        for rent in response.css('section.placard-content'):
            emitted = self._items_per_city.get(location, 0)
            if emitted >= self.max_items_per_city:
                break
            # Update counters before yielding so the state is consistent even
            # if the consumer stops iterating early.
            self._items_per_city[location] = emitted + 1
            self.count += 1
            yield {
                "location": location,
                "price": rent.css("p.property-pricing::text").get(),
                "beds": rent.css("p.property-beds::text").get(),
            }

        # Cap reached for this city: do not request further pages.
        if self._items_per_city.get(location, 0) >= self.max_items_per_city:
            return

        # Follow to the next page of the same city, keeping its label attached.
        next_page = response.css('a.next::attr("href")').get()
        if next_page is not None:
            yield response.follow(
                next_page,
                self.parse,
                cb_kwargs={"location": location},
            )
        

       
        

In [2]:
# Run the spider in-process from the notebook.
# CrawlerProcess is created without project settings; the CSV export is
# configured by the spider's own `custom_settings`.
process = CrawlerProcess()
# Register the spider class to be crawled once the process starts.
process.crawl(RentsSpider)
# Start the crawl.
# NOTE(review): Twisted reactors generally cannot be restarted, so re-running
# this cell in the same kernel is expected to fail — restart the kernel before
# re-running; confirm against the Scrapy docs.
process.start()

2022-05-28 23:50:29 [scrapy.utils.log] INFO: Scrapy 2.6.1 started (bot: scrapybot)
2022-05-28 23:50:29 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 22.2.0, Python 3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 3.4.8, Platform Windows-10-10.0.22000-SP0
2022-05-28 23:50:29 [scrapy.crawler] INFO: Overridden settings:
{}
2022-05-28 23:50:29 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-05-28 23:50:29 [scrapy.extensions.telnet] INFO: Telnet Password: c70ff0dc3ce6599c
2022-05-28 23:50:29 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2022-05-28 23:50:30 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloade