# Collecting Zillow data

## Initial setup

### Import packages

In [18]:
import requests                         # for sending HTTP requestss
from scrapy import Selector             # for parsing HTML content
import pandas as pd
import time
import json
import os

## Creating Functions

### Making a basic page-fetching function

In [19]:

def fetch_page(url, delay=1, timeout=30):
  """
  Download a page and return it wrapped in a Selector.

  Inputs: url - the full url to request
          delay - seconds to sleep before sending the request
          timeout - seconds to wait on the server before giving up
  Outputs: a Selector built from the response body. A response with an error
  status is reported with a printed message, but its body is still parsed
  and returned.
  Raises: requests.exceptions.RequestException if the request itself fails
  (connection problem, timeout, malformed url, ...)
  """
  # add headers to avoid 403 error
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
  }
  # pause before every request
  time.sleep(delay)
  try:
    # a finite timeout keeps one stalled connection from hanging the whole run
    response = requests.get(url, timeout=timeout, headers=headers)
  except requests.exceptions.RequestException as e:
    # there is no response to parse, so report the failure and let the real
    # error propagate instead of failing later on an unbound variable
    print('Error', e)
    raise
  if not response.ok:
    print("Something went wrong", response.status_code)
  # return the selector which contains the html of the page
  return Selector(text=response.content)

### Get zips from zipcode.org

In [20]:
def get_all_zips (sel):
  """
  Returns the list of zip codes found in the given Selector, which is
  expected to hold a zipcode.org city page, i.e. a page of the form
  'https://zipcode.org/city/{state}/{city}'
  """
  #the sixth 'div.HTML_Block' on the page is the container with the zip code links
  zip_container = sel.css('div.HTML_Block')[5]
  #read the href of every link rather than its text, since the format is cleaner
  hrefs = zip_container.css('a::attr(href)').getall()
  #drop the first character (the leading "/") of each href
  zips = []
  for href in hrefs:
    zips.append(href[1:])
  return zips


### Fetch Zillow data

In [21]:
def fetch_by_script(full_page_selector):
  """
  Inputs a Zillow page Selector and returns a DataFrame of the listings information
  by scraping the script tag which contains the JSON data

  Returns an empty DataFrame (after printing a message) when the page has no
  'script#__NEXT_DATA__' element. Raises KeyError if the JSON does not contain
  the expected path down to 'listResults'.
  """
  #find the script which loads in the page with data
  script_text = full_page_selector.css('script#__NEXT_DATA__').get()
  if script_text is None:
    #nothing to parse on this page; an empty frame lets the caller skip it
    print('No __NEXT_DATA__ script found on page')
    return pd.DataFrame()
  #keep everything between the end of the opening tag (first '>') and the start
  #of the closing tag (last '<'), so angle brackets inside the JSON survive
  raw_json = script_text.partition('>')[2].rpartition('<')[0]
  listing_json = json.loads(raw_json)
  #dive into the JSON to get the listing data
  cat_1 = listing_json['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
  return pd.DataFrame(cat_1)

### Get all Zillow pages with the same Zillow parameter

In [22]:
def rec_fetch_all_pages(url, output_dir='../data/raw/zillow'):
  """
  Inputs: url - the relative url of first page of Zillow listing with some input
          output_dir - folder the JSON files are written to (created if missing)
  Outputs: nothing, but saves the data to a json file in the data folder of that page and all subsequent
  pages of the listing which would be accessed by hitting the next arrow

  The pages are walked in a loop rather than by recursion; the function keeps
  its name so existing calls still work.
  """
  #urls already fetched, so a next arrow that points back at an earlier page ends the crawl
  seen = set()
  while True:
    seen.add(url)
    #get the selector of the full url of the page
    sel = fetch_page('https://www.zillow.com' + url)
    #get the data from the page
    wanted_df = fetch_by_script(sel)

    #if df is empty, no listings in that area, so I don't want to save the file
    if wanted_df.empty:
      return

    #check if the url is longer than 16 characters, if so, then it is not the first page,
    #so edit the url to save the file with a more readable name
    #NOTE(review): the 16/17 offsets presumably match the length of the first path
    #segment of the paginated urls -- confirm before reusing for another city
    if len(url) > 16:
      edited_url = url[:17] + '_' + url[17:]
    else:
      edited_url = url

    #exist_ok avoids a separate "does the folder exist" check
    os.makedirs(output_dir, exist_ok=True)
    file_name = 'zillow_' + edited_url.replace('/', '') + '.json'
    wanted_df.to_json(os.path.join(output_dir, file_name), orient='records', lines=True)

    #try to get url for the next page if it exists (second link among the pagination jump items)
    jump_links = sel.xpath('//ul[contains(@class, "PaginationList")]/li[contains(@class, "PaginationJumpItem")]').css('::attr(href)').getall()
    if len(jump_links) < 2:
      #no next arrow on this page, so this was the last one
      return
    next_url = jump_links[1]

    #basically, zillow only lets you get 20 pages, and the next arrow would point
    #to the 20th page even if it should point to 21st
    if not next_url or next_url in seen:
      return
    url = next_url

## Running Functions

In [23]:

#zipcode.org page that lists the zip codes of Boston, MA
zip_url = 'https://zipcode.org/city/MA/BOSTON'
#download that page, then pull the Boston zip list out of it
zip_page_sel = fetch_page(zip_url)
boston_zip_list = get_all_zips(zip_page_sel)


Some zip codes in `boston_zip_list` just pull up the entirety of Boston when I send the query, so I will remove them. I think this is because these zip codes don't cover an actual area; rather, each belongs to a single building, such as a PO box or a company.

In [24]:
#note that I built this list manually, but if recreating, you could run NB01-Data-Census first, and only use the
#zip codes which have some population
values_to_remove = ['02112', '02117', '02123', '02133', '02196', '02205',
          '02212', '02283', '02211', '02217', '02241', '02284',
          '02293', '02295', '02297', '02298', '02206', '02204']

#filter the list in place: a set gives constant-time membership tests and,
#unlike list.remove, this drops every occurrence of an unwanted zip code
unwanted_zips = set(values_to_remove)
boston_zip_list[:] = [zip_code for zip_code in boston_zip_list if zip_code not in unwanted_zips]

In [25]:
#crawl the Zillow listings of every remaining zip code, starting from its first page
for zip_code in boston_zip_list:
  first_page_url = "/ma-" + zip_code
  rec_fetch_all_pages(first_page_url)