In [23]:
import time
import httpx
import lxml.html
import json
import csv


# ---------------------------- Utility Functions ----------------------------

def complete_link(url: str) -> str:
    """
    Ensures the URL is complete. Absolute URLs are returned unchanged;
    relative ones are resolved against the base URL ('https://www.zillow.com').

    :param url: The URL (absolute, protocol-relative, or relative) to complete.
    :return: The complete URL. An empty input yields the bare base URL.
    """
    base_url = "https://www.zillow.com"

    # Already absolute (any host, http or https): prepending the base would
    # produce a corrupted link such as "https://www.zillow.comhttps://...".
    if url.startswith(("http://", "https://")):
        return url

    # Protocol-relative link ("//host/path"): only the scheme is missing.
    if url.startswith("//"):
        return "https:" + url

    # Relative path without a leading slash: insert the separator so the path
    # is not glued onto the host name.
    if url and not url.startswith("/"):
        url = "/" + url

    return base_url + url


# -------------------------- Web Scraping Functions ------------------------

def fetch_page(url: str) -> str:
    """
    Downloads a webpage with httpx and hands back its body as text.

    The request is sent with a fixed set of browser-like headers and follows
    redirects. ``raise_for_status()`` is called on the response, so an HTTP
    error status surfaces as an exception instead of a return value.

    :param url: The URL to fetch the page from.
    :return: The HTML content of the page.
    """
    browser_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "www.zillow.com",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.6 Safari/605.1.15"
    }

    # A short-lived client per call; the context manager closes it on exit.
    with httpx.Client(headers=browser_headers, follow_redirects=True) as session:
        page = session.get(url)
        # Fail loudly on HTTP error responses rather than returning their body.
        page.raise_for_status()
        body = page.text

    return body


def parse_script_content(html: str) -> dict:
    """
    Parses the HTML content and extracts the script tag with the '__NEXT_DATA__' ID.
    Returns the JSON content inside that script tag.

    Every "no usable data" case (blank document, missing script tag, script
    content that is not valid JSON, or JSON whose top level is not an object)
    is reported with a message and yields an empty dictionary, so callers only
    need a truthiness check.

    :param html: The HTML content of the page.
    :return: A dictionary containing the JSON data, or an empty dictionary if
             it is missing or cannot be decoded.
    """
    # lxml refuses to parse an empty document, so treat blank input the same
    # way as a page without the script tag instead of letting it raise.
    if not html or not html.strip():
        print("Script tag with id '__NEXT_DATA__' not found.")
        return {}

    root = lxml.html.fromstring(html)
    script_element = root.xpath('//script[@id="__NEXT_DATA__"]')

    if not script_element:
        print("Script tag with id '__NEXT_DATA__' not found.")
        return {}

    try:
        data = json.loads(script_element[0].text_content())
    except json.JSONDecodeError as e:
        # A truncated or non-JSON payload should not abort the caller.
        print(f"Could not decode '__NEXT_DATA__' JSON: {e}")
        return {}

    # Honour the declared return type: anything other than a JSON object is
    # unusable for the key lookups performed downstream.
    return data if isinstance(data, dict) else {}


def extract_listings(json_data: dict) -> list:
    """
    Extracts the list of property listings from the parsed JSON data.

    The listings are read from
    props -> pageProps -> searchPageState -> cat1 -> searchResults -> listResults.
    If any level of that path is missing or is not a mapping (or the final
    value is not a list), an empty list is returned instead of raising.

    :param json_data: The parsed JSON data.
    :return: A list of listings (possibly empty).
    """
    path = ("props", "pageProps", "searchPageState", "cat1", "searchResults")

    node = json_data
    for key in path:
        # A missing key or a null/non-dict value means the page has no results block.
        if not isinstance(node, dict):
            return []
        node = node.get(key)

    if not isinstance(node, dict):
        return []

    listings = node.get("listResults")
    return listings if isinstance(listings, list) else []



# ---------------------------- Pagination Function ----------------------------

def nextpage_from_xpath(response_text):
    """
    Extracts the URL for the next page using XPath.

    The "Next" pagination anchor is looked up first by rel="next" combined with
    the pagination button class name, and, if that finds nothing, by rel="next"
    alone. The class name embeds a version-like suffix, so the fallback keeps
    pagination working should that suffix change.

    :param response_text: The HTML response text of the page.
    :return: The next URL to scrape or None if there is no next page (or the
             page could not be parsed).
    """
    try:
        tree = lxml.html.fromstring(response_text)

        # Preferred: the selector tied to the known pagination button class.
        next_page_element = tree.xpath('//a[@rel="next" and contains(@class, "PaginationButton-c11n-8-109-1__sc-1i6hxyy-0")]/@href')

        # Fallback: any anchor explicitly marked as the next page.
        if not next_page_element:
            next_page_element = tree.xpath('//a[@rel="next"]/@href')

        if next_page_element:
            # The href may be relative; make it absolute before returning.
            return complete_link(next_page_element[0])

        print("No more pages found.")
        return None
    except Exception as e:
        # Best-effort: a parsing problem ends pagination rather than crashing.
        print(f"Error extracting next page: {e}")
        return None


def _listing_to_row(listing: dict) -> dict:
    """Flatten one raw search-result listing into the flat row stored per listing."""
    # "latLong" and "detailUrl" can be present with a null value, in which
    # case dict.get's default is not used; `or` covers both missing and null.
    lat_long = listing.get("latLong") or {}
    return {
        "address": listing.get("address", ""),
        "detailUrl": complete_link(listing.get("detailUrl") or ""),
        "statusType": listing.get("statusType", ""),
        "zipcode": listing.get("addressZipcode", ""),
        "latitude": lat_long.get("latitude", ""),
        "longitude": lat_long.get("longitude", ""),
        "price": listing.get("price"),
        "livingarea": listing.get("area"),
        # Same source field as "statusType"; kept as a separate column so the
        # output layout stays unchanged.
        "status": listing.get("statusType"),
        "listingkey": listing.get("id"),
        "bedrooms": listing.get("beds"),
        "bathrooms": listing.get("baths"),
    }


def paginate(url: str, max_pages: int = 20):
    """
    Paginate through multiple pages and collect one row per listing.

    Pagination stops when there is no next page, when the next-page link points
    back at the current page, when ``max_pages`` pages have been fetched, when a
    page yields no parsable data, or when fetching a page fails. In every case
    the rows gathered so far are returned.

    :param url: The starting URL for pagination.
    :param max_pages: The maximum number of pages to paginate through (optional).
    :return: A list of dictionaries, one per collected listing.
    """
    all_listings = []
    current_page = 1

    while url and current_page <= max_pages:
        print(f"Fetching page {current_page}... {url}")

        # Fetch the page. A failed request ends pagination but must not throw
        # away the listings already collected from earlier pages.
        try:
            html = fetch_page(url)
        except httpx.HTTPError as e:
            print(f"Error fetching {url}: {e}")
            break
        if not html:
            break

        # Parse the page content
        json_data = parse_script_content(html)
        if not json_data:
            break

        # Extract rental listings and store one flattened row for each
        all_listings.extend(
            _listing_to_row(listing) for listing in extract_listings(json_data)
        )

        # Move to the next page; a link back to the current page means the
        # last page has been reached.
        next_url = nextpage_from_xpath(html)
        if not next_url or next_url == url:
            break
        url = next_url
        current_page += 1

        # Sleep before fetching the next page to avoid getting blocked (adjust
        # as needed). Only reached when another page is about to be requested.
        time.sleep(2)

    return all_listings

# ---------------------------- CSV Output ----------------------------

def save_listings_to_csv(listings, filename="zillow_listings.csv"):
    """
    Writes the collected listings to a CSV file.

    The file is created (or overwritten) as UTF-8 with a header row followed by
    one row per listing dictionary, using a fixed column order.

    :param listings: An iterable of listing dictionaries keyed by column name.
    :param filename: The path of the CSV file to write.
    """
    # Column order of the output file.
    columns = [
        "address",
        "detailUrl",
        "statusType",
        "zipcode",
        "latitude",
        "longitude",
        "price",
        "livingarea",
        "status",
        "listingkey",
        "bedrooms",
        "bathrooms",
    ]

    # newline="" lets the csv module control line endings itself.
    with open(filename, mode="w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in listings:
            out.writerow(record)

    print(f"✅ Data saved to {filename}")



# ---------------------------- Main Function ----------------------------

def main(zip_codes: list):
    """
    The main function to scrape listings from Zillow for each ZIP code.

    Each ZIP code is turned into a Chicago rentals search URL, paginated, and
    its listings are accumulated. The combined results are written to CSV and
    also returned, so callers can keep working with the data in memory.

    :param zip_codes: A list of ZIP codes to scrape.
    :return: A list with the listing dictionaries collected across all ZIP codes.
    """
    all_results = []
    last_index = len(zip_codes) - 1

    for index, zip_code in enumerate(zip_codes):
        print(f"\n📌 Scraping ZIP Code: {zip_code}\n")
        url = f"https://www.zillow.com/chicago-il-{zip_code}/rentals/"

        listings = paginate(url)
        if listings:
            all_results.extend(listings)
            print(f"✅ {len(listings)} listings found in ZIP {zip_code}\n")
        else:
            print(f"❌ No listings found in ZIP {zip_code}\n")

        # Sleep for a bit between ZIP codes to avoid getting blocked; there is
        # nothing left to throttle after the final ZIP code.
        if index < last_index:
            time.sleep(10)

    # Save all results to CSV
    save_listings_to_csv(all_results)

    return all_results


# ---------------------------- Example Usage ----------------------------

if __name__ == "__main__":
    # ZIP codes to scrape, in ascending order. "60664" is deliberately left
    # out (it was disabled in the original list).
    chicago_zip_codes = [
        "60601", "60602", "60603", "60604", "60605", "60606", "60607", "60608",
        "60609", "60610", "60611", "60612", "60613", "60614", "60615", "60616",
        "60617", "60618", "60619", "60620", "60621", "60622", "60623", "60624",
        "60625", "60626", "60628", "60629", "60630", "60631", "60632", "60633",
        "60634", "60636", "60637", "60638", "60639", "60640", "60641", "60642",
        "60643", "60644", "60645", "60646", "60647", "60649", "60651", "60652",
        "60653", "60654", "60655", "60656", "60657", "60659", "60660", "60661",
        "60666", "60668", "60669", "60670", "60673", "60674", "60675", "60677",
        "60678", "60680", "60681", "60682", "60684", "60685", "60686", "60687",
        "60688", "60689", "60690", "60691", "60693", "60694", "60695", "60696",
        "60697", "60699", "60701",
    ]

    # Run the scrape for every ZIP code above and keep whatever main() returns.
    all_data = main(chicago_zip_codes)



📌 Scraping ZIP Code: 60601

Fetching page 1... https://www.zillow.com/chicago-il-60601/rentals/
No more pages found.
✅ 35 listings found in ZIP 60601


📌 Scraping ZIP Code: 60602

Fetching page 1... https://www.zillow.com/chicago-il-60602/rentals/
No more pages found.
✅ 7 listings found in ZIP 60602


📌 Scraping ZIP Code: 60603

Fetching page 1... https://www.zillow.com/chicago-il-60603/rentals/
No more pages found.
✅ 6 listings found in ZIP 60603


📌 Scraping ZIP Code: 60604

Fetching page 1... https://www.zillow.com/chicago-il-60604/rentals/
No more pages found.
✅ 3 listings found in ZIP 60604


📌 Scraping ZIP Code: 60605

Fetching page 1... https://www.zillow.com/chicago-il-60605/rentals/
Fetching page 2... https://www.zillow.com/chicago-il-60605/rentals/2_p/
Fetching page 3... https://www.zillow.com/chicago-il-60605/rentals/3_p/
✅ 96 listings found in ZIP 60605


📌 Scraping ZIP Code: 60606

Fetching page 1... https://www.zillow.com/chicago-il-60606/rentals/
No more pages found.
✅ 