In [15]:
import requests                         # for sending HTTP requests

from tqdm.notebook import tqdm, trange  # for progress bars

from scrapy import Selector             # for parsing HTML content
import pandas as pd

In [16]:
# HTTP headers passed to requests.get by fetch_page. The User-Agent value
# is a desktop Chrome identification string (presumably so the site serves
# the same page a browser would get -- confirm).
headers = dict([
    ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'),
])
def fetch_page(url, timeout=30):
  """
  Download a web page and parse it into a Selector.

  Inputs:
    url: address of the page to download
    timeout: seconds to wait for the server before giving up (handed to
      requests.get); pass None to wait indefinitely
  Outputs:
    a Selector built from the raw response body. A response whose status
      is not OK is reported on stdout, but its body is still parsed and
      returned so the caller can inspect it.
  Raises:
    requests.exceptions.RequestException: when the request itself fails
      (connection problem, timeout, malformed URL, ...)
  """
  try:
    response = requests.get(url, timeout=timeout, headers=headers)
  except requests.exceptions.RequestException as e:
    # Without a response there is nothing to parse; report the failure and
    # let the real error propagate instead of tripping over an unbound
    # `response` further down.
    print('Error', e)
    raise

  if not response.ok:
    # Best effort: warn, but still hand back whatever the server sent.
    print("Something went wrong", response.status_code)

  return Selector(text=response.content)

In [17]:
def extract_all_listings(page_sel):
  """
  Collect the listing cards found on a parsed results page.

  Inputs:
    page_sel: a Selector object holding the parsed page
  Outputs:
    the result of running the card XPath query on page_sel: one entry per
      <div> whose class attribute contains "StyledPropertyCardDataWrapper"
  """
  card_query = '//div[contains(@class, "StyledPropertyCardDataWrapper")]'
  return page_sel.xpath(card_query)


In [18]:
def get_listing_info(listing_sel):
  """
  Pull the fields of interest out of a single listing card.

  Inputs:
    listing_sel: a Selector object which contains the content
      of one listing
  Outputs:
    unit_dict: a dictionary with the keys 'address', 'zip_code', 'price',
      'num_bedrooms', 'num_bathrooms' and 'sqft'. All values are the raw
      strings found in the card; a field that is absent from the card is
      None instead of raising, so one incomplete listing does not abort a
      whole scrape.
  """
  address = listing_sel.css("a ::text").get()
  # The postal code is taken as the last five characters of the address.
  zip_code = address[-5:] if address else None

  price_text = listing_sel.css("div span ::text").get()
  # Drop the first character (presumably the leading "$" -- confirm).
  price = price_text[1:] if price_text else None

  # The <b> values are read positionally as beds, baths, sqft; pad with None
  # so cards that show fewer than three values still unpack cleanly.
  bbs = list(listing_sel.css("ul li b ::text").getall())
  bed, bath, sqft = (bbs + [None] * 3)[:3]

  return {'address': address, 'zip_code': zip_code, 'price': price, 'num_bedrooms': bed,
          'num_bathrooms': bath, 'sqft': sqft}

In [26]:
# Download and parse the Boston, MA results page (the "1_p" path segment
# presumably selects page 1 -- confirm).
boston_page_url = 'https://www.zillow.com/boston-ma/1_p/'
sel = fetch_page(boston_page_url)

In [28]:

# Turn every listing card on the fetched page into a dict of fields,
# then tabulate the dicts (one row per listing) and show the table.
objects = extract_all_listings(sel)
info_list = list(map(get_listing_info, objects))
info_df = pd.DataFrame(info_list)
display(info_df)

Unnamed: 0,address,zip_code,price,num_bedrooms,num_bathrooms,sqft
0,"130 Commonwealth Ave, Boston, MA 02116",2116,25990000,6,10,10022
1,"82 Woodley Ave, West Roxbury, MA 02132",2132,599999,3,2,1726
2,"37-37A Spring Park Ave, Jamaica Plain, MA 02130",2130,950000,3,2,1156
3,"118 Blake St, Hyde Park, MA 02136",2136,539000,3,2,1709
4,"71-73 Richfield St, Dorchester, MA 02125",2125,1225000,10,4,4096
5,"124 Theodore Parker Rd, Boston, MA 02132",2132,1425000,4,3,2626
6,"437-439 Chelsea St, Boston, MA 02128",2128,4999000,16,16,8755
7,"49 Melcher St APT 501, Boston, MA 02210",2210,4750000,3,3,2964
8,"150 Staniford St APT 400, Boston, MA 02114",2114,529900,1,1,794
