# Web scraping apartments.com for house rents using Scrapy

### Scrapy
#### Advantages: 
fast, clean, large amount of data, scrapy shell

#### Disadvantages: 
works only at the request/response level; does not support browser automation (clicking a button, ...)


In [None]:
import scrapy
from scrapy.crawler import CrawlerProcess

# Override the scrapy.Spider class
class RentsSpider(scrapy.Spider):
    """Scrape student-housing rent listings from apartments.com.

    One independent request chain is started per city in ``start_urls``.
    Each chain walks that city's numbered result pages (``<city url>2/``,
    ``<city url>3/``, ...) and stops at the first page that contains no
    ``p.property-pricing`` element.

    The crawl position (city, page number) travels with each request via
    ``cb_kwargs`` instead of living in shared spider attributes, so a failed
    or dropped request only ends the chain of its own city rather than
    aborting every remaining city.

    Scraped items are dicts with the keys ``location``, ``price`` and
    ``beds`` and are exported to ``rents.csv`` (see ``custom_settings``).
    Because cities are fetched independently, rows of different cities may
    be interleaved in the output; use the ``location`` column to group them.
    """

    # Spider name used by Scrapy to identify this spider
    name = 'rents'

    # First result page of every city to scrape
    start_urls = [
        'https://www.apartments.com/austin-tx/student-housing/',
        'https://www.apartments.com/saint-louis-mo/student-housing/',
        'https://www.apartments.com/minneapolis-mn/student-housing/',
        'https://www.apartments.com/tampa-fl/student-housing/',
        'https://www.apartments.com/new-york-ny/student-housing/'
    ]

    # Export the scraped items to a CSV file, replacing any previous run
    custom_settings = {
        'FEEDS': {
            'rents.csv': {
                'format': 'csv',
                'overwrite': True
            }
        }
    }

    # Location label for each entry of start_urls (same order, same length)
    locations = ["austin-tx",
                 "saint-louis-mo",
                 "minneapolis-mn",
                 "tampa-fl",
                 "new-york-ny"]

    # Legacy attributes kept only for backward compatibility: the crawl
    # position is now carried per request, so these are never mutated.
    index = 0
    page_number = 1

    def start_requests(self):
        """Yield the first-page request of every city.

        Each request carries its own ``base_url``, ``location`` and
        ``page_number`` in ``cb_kwargs`` so that ``parse`` needs no shared
        state to know which city and page it is handling.
        """
        for url, location in zip(self.start_urls, self.locations):
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={
                    "base_url": url,
                    "location": location,
                    "page_number": 1,
                },
            )

    def parse(self, response, base_url=None, location=None, page_number=1):
        """Extract the listings of one result page and follow the next page.

        Args:
            response: The downloaded result page.
            base_url: First-page URL of the city being scraped; page URLs are
                built as ``<base_url><page_number>/``. Falls back to
                ``response.url`` when the callback is invoked without
                ``cb_kwargs``.
            location: Label written to the ``location`` field of every item
                (``None`` when the callback is invoked without ``cb_kwargs``).
            page_number: Number of the page held by ``response``.

        Yields:
            One dict per ``section.placard-content`` element with the keys
            ``location``, ``price`` and ``beds``, followed by a request for
            the next page if this page contained any pricing element.
        """
        if base_url is None:
            base_url = response.url

        for rent in response.css('section.placard-content'):
            yield {
                "location": location,
                "price": rent.css("p.property-pricing::text").get(),
                "beds": rent.css("p.property-beds::text").get(),
            }

        # A page without any pricing element marks the end of this city
        if response.css('p.property-pricing').get() is None:
            return

        next_page_number = page_number + 1
        next_page = f"{base_url}{next_page_number}/"
        yield response.follow(
            next_page,
            self.parse,
            cb_kwargs={
                "base_url": base_url,
                "location": location,
                "page_number": next_page_number,
            },
        )
        

       
        

In [None]:
# Run the spider from within this script/notebook instead of the `scrapy` CLI.
# No settings are passed here, so only the spider's own custom_settings apply
# on top of Scrapy's defaults.
process = CrawlerProcess()
# Schedule the spider; nothing is downloaded until start() is called
process.crawl(RentsSpider)
# NOTE(review): per the Scrapy docs start() blocks until the crawl finishes and
# the underlying Twisted reactor cannot be restarted, so re-running this cell
# presumably requires restarting the kernel first; confirm.
process.start()