In [12]:
import requests                         # for sending HTTP requests

from tqdm.notebook import tqdm, trange  # for progress bars
from selenium import webdriver
from scrapy import Selector             # for parsing HTML content
import pandas as pd
import time

In [None]:
# Page 1 of the Zillow search results for Boston, MA.
# NOTE(review): not referenced by any cell visible here (the fetch cell passes
# its own URL literal) -- confirm whether this variable is still needed.
url = 'https://www.zillow.com/boston-ma/1_p/'

In [6]:
# Browser-like request headers sent with every page fetch.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
def fetch_page(url, delay=1, timeout=30):
  """
  Download a page and wrap its body in a Selector.

  Inputs:
    url: address of the page to download
    delay: seconds to sleep before sending the request
    timeout: seconds to wait for the server before giving up;
      pass None to wait indefinitely
  Outputs:
    a Selector built from the response body. If the request itself
    fails, the error is printed and a Selector over empty text is
    returned, so callers see a page with no listings instead of a
    crash. A non-OK HTTP status is reported, but the body that came
    back is still parsed and returned.
  """
  time.sleep(delay)
  try:
    response = requests.get(url, timeout=timeout, headers=headers)
  except requests.RequestException as e:
    # No response object exists on this path, so there is nothing to parse;
    # fall back to an empty document rather than touching `response`.
    print('Error', e)
    return Selector(text='')
  if not response.ok:
    print("Something went wrong", response.status_code)
  return Selector(text=response.content)

In [7]:
def extract_all_listings(page_sel):
  """
  Collect the property cards found on a Zillow results page.

  Inputs:
    page_sel: a Selector object holding the parsed results page
  Outputs:
    the result of querying page_sel for every <div> whose class
    attribute contains "StyledPropertyCardDataWrapper" (one entry
    per matching element)
  """
  card_query = '//div[contains(@class, "StyledPropertyCardDataWrapper")]'
  return page_sel.xpath(card_query)


In [8]:
def get_listing_info(listing_sel):
  """
  Pull the headline fields out of a single listing card.

  Inputs:
    listing_sel: a Selector object which contains the content
      of one listing
  Outputs:
    unit_dict: a dictionary with the keys address, zip_code, price,
      num_bedrooms, num_bathrooms and sqft. Values are the raw text
      found in the card; any field the card does not provide is None
      instead of raising.
  """
  address = listing_sel.css("a ::text").get()
  # .get() yields None when nothing matches, so guard before slicing.
  # The zip code is taken as the last five characters of the address text.
  zip_code = address[-5:] if address is not None else None

  price = listing_sel.css("div span ::text").get()
  if price and price.startswith('$'):
    price = price[1:]  # drop the leading currency symbol

  # Bed / bath / sqft are read positionally from the bold values in the
  # card's list. Pad with None so a card with fewer than three values does
  # not raise IndexError.
  # NOTE(review): when values are missing the positions may not line up
  # with bed/bath/sqft -- confirm against real cards.
  bbs = listing_sel.css("ul li b ::text").getall()
  bed, bath, sqft = (list(bbs) + [None] * 3)[:3]

  return {'address': address, 'zip_code': zip_code, 'price': price, 'num_bedrooms': bed,
          'num_bathrooms': bath, 'sqft': sqft}

In [13]:
# Fetch and parse the Boston, MA search results page
# (fetch_page sleeps for its default delay before requesting).
sel = fetch_page('https://www.zillow.com/boston-ma')

In [14]:

# Turn every listing card on the fetched page into one row of a DataFrame.
objects = extract_all_listings(sel)
info_list = list(map(get_listing_info, objects))
info_df = pd.DataFrame(info_list)
display(info_df)

Unnamed: 0,address,zip_code,price,num_bedrooms,num_bathrooms,sqft
0,"355 Congress St #2, Boston, MA 02210",2210,4499000,4,4,2845
1,"118 Blake St, Hyde Park, MA 02136",2136,539000,3,2,1709
2,"37-37A Spring Park Ave, Jamaica Plain, MA 02130",2130,950000,3,2,1156
3,"49 Melcher St APT 501, Boston, MA 02210",2210,4750000,3,3,2964
4,"1515 Vfw Pkwy TRAILER E21, West Roxbury, MA 02132",2132,174900,2,1,510
5,"391 Hyde Park Ave APT 210, Roslindale, MA 02131",2131,485000,2,2,1166
6,"130 Commonwealth Ave, Boston, MA 02116",2116,25990000,6,10,10022
7,"82 Woodley Ave, West Roxbury, MA 02132",2132,599999,3,2,1726
8,"121 M St APT 3, South Boston, MA 02127",2127,1099000,2,2,1357
