# In [61]:
import httpx
import lxml.html
import json
import csv

def complete_link(url):
    """
    Return an absolute URL for a link found on a Zillow page.

    Relative paths (with or without a leading '/') and protocol-relative
    links ('//host/path') are resolved against 'https://www.zillow.com'.
    URLs that already start with 'http://' or 'https://' are returned
    unchanged. An empty or missing URL yields an empty string, so a listing
    without a link does not end up pointing at the Zillow home page.
    """
    from urllib.parse import urljoin

    if not url:
        return ""
    if url.startswith(("http://", "https://")):
        return url
    # The trailing slash makes urljoin resolve relative paths from the site
    # root instead of gluing them directly onto the host name.
    return urljoin("https://www.zillow.com/", url)


def get_zillow_listings(url):
    """
    Scrape one Zillow search-results page and return its listings.

    The page is fetched with browser-like headers, the JSON embedded in the
    ``<script id="__NEXT_DATA__">`` tag is decoded, and one dict is built per
    entry of ``props.pageProps.searchPageState.cat1.searchResults.listResults``.

    :param url: The search-results URL to fetch.
    :return: A list of dicts with the keys ``address``, ``detailUrl``,
        ``statusType``, ``addressZipcode``, ``latitude`` and ``longitude``.
        The list is empty when the listings path is absent from the JSON.
        ``None`` is returned when the request fails, the HTML cannot be
        parsed, the script tag is missing, or its content is not valid JSON.
    """
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "www.zillow.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15"
    }

    with httpx.Client(headers=headers, follow_redirects=True) as client:
        try:
            response = client.get(url)
            response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        except httpx.RequestError as e:
            print(f"An error occurred while requesting the page: {e}")
            return None
        except httpx.HTTPStatusError as e:
            print(f"HTTP error occurred: {e}")
            return None

    try:
        root = lxml.html.fromstring(response.text)
    except lxml.etree.ParserError as e:
        # lxml refuses to build a tree from an empty document.
        print(f"Error parsing HTML: {e}")
        return None

    script_element = root.xpath('//script[@id="__NEXT_DATA__"]')
    if not script_element:
        print("Script tag with id '__NEXT_DATA__' not found.")
        return None

    try:
        json_data = json.loads(script_element[0].text_content())
    except json.JSONDecodeError:
        print("Error decoding JSON")
        return None

    try:
        listings = json_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
    except (KeyError, TypeError):
        # TypeError covers an intermediate node that is null or not a mapping.
        print("Error extracting listings")
        listings = []
    if not isinstance(listings, list):
        print("Error extracting listings")
        listings = []

    listings_data = []
    for listing in listings:
        if not isinstance(listing, dict):
            continue  # Skip malformed entries instead of crashing on .get()

        address = listing.get("address", "")
        # A null detailUrl is treated like a missing one.
        detail_url = complete_link(listing.get("detailUrl") or "")

        if isinstance(address, dict):
            address_zipcode = address.get("zipcode", "")
        else:
            # NOTE(review): assumes the payload may carry the zipcode as a
            # top-level "addressZipcode" field when "address" is a plain
            # string; confirm against a real response.
            address_zipcode = listing.get("addressZipcode", "")

        lat_long = listing.get("latLong")
        if not isinstance(lat_long, dict):
            lat_long = {}

        listings_data.append({
            "address": address,
            "detailUrl": detail_url,
            "statusType": listing.get("statusType", ""),
            "addressZipcode": address_zipcode,
            "latitude": lat_long.get("latitude", ""),
            "longitude": lat_long.get("longitude", ""),
        })

    return listings_data


def save_listings_to_csv(listings_data, filename="listings.csv"):
    """
    Write listing dicts to a CSV file and print a confirmation.

    The file is created (or overwritten) as UTF-8 with a header row followed
    by one row per item of ``listings_data``.
    """
    columns = ["address", "detailUrl", "statusType", "addressZipcode", "latitude", "longitude"]

    # newline="" lets the csv module control line endings itself.
    with open(filename, mode="w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(listings_data)

    print(f"Data saved to {filename}")


# Example usage: scrape the first results page for Chicago rentals and,
# if any listings came back, export them to the default "listings.csv".
url = "https://www.zillow.com/chicago-il/rentals/"
listings = get_zillow_listings(url)

# get_zillow_listings returns None on failure and may return an empty list;
# both are falsy, so nothing is written in either case.
if listings:
    save_listings_to_csv(listings)


# Output: Data saved to listings.csv


# In [62]:
import time
import httpx
import lxml.html
import json


# ---------------------------- Utility Functions ----------------------------

def complete_link(url: str) -> str:
    """
    Return an absolute URL for a link found on a Zillow page.

    Relative paths (with or without a leading '/') and protocol-relative
    links ('//host/path') are resolved against 'https://www.zillow.com'.
    URLs that already start with 'http://' or 'https://' are returned
    unchanged.

    :param url: The URL or path to complete.
    :return: The absolute URL, or an empty string when ``url`` is empty or
        None, so a missing link does not turn into the Zillow home page.
    """
    from urllib.parse import urljoin

    if not url:
        return ""
    if url.startswith(("http://", "https://")):
        return url
    # The trailing slash makes urljoin resolve relative paths from the site
    # root instead of gluing them directly onto the host name.
    return urljoin("https://www.zillow.com/", url)


# -------------------------- Web Scraping Functions ------------------------

def fetch_page(url: str) -> str:
    """
    Download a page with httpx using browser-like request headers.

    Redirects are followed. Transport failures and non-success HTTP status
    codes are reported on stdout rather than raised.

    :param url: The address of the page to download.
    :return: The response body as text, or an empty string on any failure.
    """
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "www.zillow.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15"
    }

    with httpx.Client(headers=browser_headers, follow_redirects=True) as session:
        try:
            reply = session.get(url)
            reply.raise_for_status()  # 4xx/5xx become HTTPStatusError
        except httpx.RequestError as exc:
            print(f"An error occurred while requesting the page: {exc}")
        except httpx.HTTPStatusError as exc:
            print(f"HTTP error occurred: {exc}")
        else:
            return reply.text

    # Only reached when one of the handlers above ran.
    return ""


def parse_script_content(html: str) -> dict:
    """
    Extract and decode the JSON held in the '__NEXT_DATA__' script tag.

    :param html: The HTML content of the page.
    :return: The decoded JSON object. An empty dictionary is returned when
        the HTML cannot be parsed (e.g. it is empty), the script tag is
        absent, its content is not valid JSON, or the JSON is not an object.
    """
    try:
        root = lxml.html.fromstring(html)
    except lxml.etree.ParserError as e:
        # lxml refuses to build a tree from an empty document; honour the
        # "empty dict on failure" contract instead of propagating.
        print(f"Error parsing HTML: {e}")
        return {}

    script_element = root.xpath('//script[@id="__NEXT_DATA__"]')
    if not script_element:
        print("Script tag with id '__NEXT_DATA__' not found.")
        return {}

    try:
        data = json.loads(script_element[0].text_content())
    except json.JSONDecodeError:
        print("Error decoding JSON from script tag")
        return {}

    # Callers index the result by key, so anything but an object is unusable.
    if not isinstance(data, dict):
        print("Unexpected JSON structure in script tag")
        return {}
    return data


def extract_listings(json_data: dict) -> list:
    """
    Extract the list of property listings from the parsed JSON data.

    Reads ``props.pageProps.searchPageState.cat1.searchResults.listResults``.

    :param json_data: The parsed JSON data.
    :return: The list of listings, or an empty list when any part of the
        path is missing, an intermediate node is not a mapping, or the final
        value is not a list.
    """
    try:
        listings = json_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
    except (KeyError, TypeError):
        # TypeError covers an intermediate node that is null or not a mapping.
        print("Error extracting listings from JSON.")
        return []

    # Callers iterate the result, so never hand back None or a scalar.
    if not isinstance(listings, list):
        print("Error extracting listings from JSON.")
        return []
    return listings


# ---------------------------- Pagination Function ----------------------------

def nextpage(json_data):
    """
    Extract the URL of the next results page from the JSON data.

    Reads ``props.pageProps.searchPageState.cat1.searchList.pagination.nextUrl``
    and resolves it against 'https://www.zillow.com'.

    :param json_data: The parsed JSON data.
    :return: The absolute URL of the next page, or None when the path is
        missing, an intermediate node is not a mapping, or ``nextUrl`` is
        empty or null.
    """
    from urllib.parse import urljoin

    try:
        next_page = json_data['props']['pageProps']['searchPageState']['cat1']['searchList']['pagination']['nextUrl']
    except (KeyError, TypeError):
        next_page = None

    # A null/empty nextUrl must end pagination rather than turn into the
    # bogus link "https://www.zillow.comNone".
    if not next_page:
        print("No more pages found.")
        return None

    # urljoin leaves already-absolute URLs alone and prefixes relative ones.
    return urljoin('https://www.zillow.com/', str(next_page))


def paginate(url: str, max_pages: int = 10, delay: float = 5) -> list:
    """
    Walk through consecutive result pages and collect listing URLs.

    Stops when ``max_pages`` pages have been fetched, a page cannot be
    fetched or parsed, no next page is advertised, or the next page has
    already been visited.

    :param url: The starting URL for pagination.
    :param max_pages: The maximum number of pages to fetch.
    :param delay: Seconds to wait between two consecutive page requests.
    :return: A list of all collected listing URLs. Listings without a
        ``detailUrl`` are skipped.
    """
    all_listing_urls = []
    visited = set()

    for current_page in range(1, max_pages + 1):
        # A repeated URL means the site sent us in a circle; stop instead of
        # re-fetching the same page until max_pages is exhausted.
        if not url or url in visited:
            break
        visited.add(url)

        # Throttle only between requests, never after the last one.
        if current_page > 1:
            time.sleep(delay)

        print(f"Fetching page {current_page}...")

        html = fetch_page(url)
        if not html:
            break

        json_data = parse_script_content(html)
        if not json_data:
            break

        for listing in extract_listings(json_data):
            detail_url = listing.get("detailUrl", "")
            # Without a detail URL there is nothing meaningful to collect.
            if detail_url:
                all_listing_urls.append(complete_link(detail_url))

        url = nextpage(json_data)

    return all_listing_urls


# ---------------------------- Main Function ----------------------------

def main(url: str):
    """
    Collect listing URLs across paginated results and print them.

    Prints the total count followed by one URL per line, or a notice when
    nothing was collected.

    :param url: The search-results URL to start from.
    """
    collected = paginate(url)

    if not collected:
        print("No listings found.")
        return

    print(f"Total listings collected: {len(collected)}")
    # The URLs could be written to a CSV or processed further at this point.
    print(*collected, sep="\n")


# ---------------------------- Example Usage ----------------------------

# Crawl the Chicago rental search results (paginate's default cap is
# 10 pages) and print every collected listing URL.
url = "https://www.zillow.com/chicago-il/rentals/"
main(url)


# Output:
# Fetching page 1...
# Fetching page 2...
# Fetching page 3...
# Fetching page 4...
# Fetching page 5...
# Fetching page 6...
# Fetching page 7...
# Fetching page 8...
# Fetching page 9...
# Fetching page 10...
# Total listings collected: 410
# https://www.zillow.com/apartments/chicago-il/3600-w-franklin-blvd/5XdyyY/
# https://www.zillow.com/apartments/chicago-il/arthur-on-aberdeen/CPWz/
# https://www.zillow.com/apartments/chicago-il/3912-28-n.-pine-grove/BqXt/
# https://www.zillow.com/apartments/chicago-il/laflin-place/BK9z86/
# https://www.zillow.com/apartments/chicago-il/2000-w.-summerdale-ave/ChWCCP/
# https://www.zillow.com/apartments/chicago-il/845-w.-fulton-market-st./ChWFNf/
# https://www.zillow.com/apartments/chicago-il/541-w-oakdale/5Xg2b5/
# https://www.zillow.com/apartments/chicago-il/the-lydian/CgH9Yn/
# https://www.zillow.com/apartments/chicago-il/1613-w-belmont-ave-chicago-il/Bcq7/
# https://www.zillow.com/apartments/chicago-il/oakdale-terrace-525-w-oakdale-ave/5Xhfr9/
https://www.zillow.com/apartments/chicago-i