# In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

import requests
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Results pages to crawl: the landing page first, then numbered pages 2 through 61.
urls = ["https://www.zolo.ca/toronto-real-estate"]
urls.extend(
    "https://www.zolo.ca/toronto-real-estate/page-" + str(page_number)
    for page_number in range(2, 62)
)

# Accumulates the DataFrames that are concatenated at the end of the script.
dataframes = []

# Launch Chrome with an explicit desktop user-agent string.
options = Options()
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
driver = webdriver.Chrome(options=options)

# Scrape every results page in `urls`. For each listing card on a page, open the
# listing's own page and collect its price, address, labelled facts and room
# rows. One DataFrame per results page is appended to `dataframes`.
try:
    for url in urls:
        driver.get(url)

        # Wait until the listing cards are loaded.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//ul[@class='card-listing--values truncate list-unstyled xs-flex-order-1 xs-mb05']"))
            )
            print("Page fully loaded.")
        except Exception as e:
            # Best effort: report the problem and parse whatever HTML is available.
            print("Error waiting for page to load:", e)

        # Snapshot the HTML now: the driver navigates away to each listing below.
        main_page_source = driver.page_source
        soup = BeautifulSoup(main_page_source, 'html.parser')

        # Listing cards on the results page are <article> tags.
        listings = soup.find_all('article')

        highlight_data = []

        # Loop through each listing on the current results page.
        for listing in listings:
            # Anchors with an href attribute are candidates for the listing's link.
            links = listing.find_all("a", href=True)

            # Keep only hrefs pointing at the site itself; other anchors are not listing links.
            valid_links = [link['href'] for link in links if 'https://www.zolo.ca/' in link['href']]

            if not valid_links:
                print("No valid href found for this 'listing' --> skip")
                continue

            # Use the first matching link as the listing's detail page.
            sub_url = valid_links[0]
            driver.get(sub_url)

            try:
                # Wait for a <dd class="column-value"> element to be present.
                WebDriverWait(driver, 40).until(
                    EC.presence_of_element_located((By.XPATH, "//dd[@class='column-value']"))
                )
            except Exception:
                # Not a bare `except:` so that Ctrl-C / SystemExit still stop the run.
                # The page is parsed anyway; missing fields simply end up as None.
                print("Element not found within the given time.")

            sub_page_source = driver.page_source
            sub_soup = BeautifulSoup(sub_page_source, 'html.parser')

            # Labelled facts for this listing, gathered from two page layouts below.
            highlight_info = {}

            # Facts stored as <dt class="column-label"> / <dd class="column-value"> pairs.
            high_level_labels = sub_soup.find_all("dt", class_="column-label")
            for label in high_level_labels:
                label_text = label.text.strip()

                value_cell = label.find_next("dd", class_="column-value")
                if value_cell is None:
                    print(f"Skipping label {label_text} due to missing data.")
                    continue

                # The value is the text of the <span class="priv"> inside the <dd>, when present.
                value = value_cell.find("span", class_="priv")
                value_text = value.text.strip() if value is not None else None

                if label_text and value_text:
                    highlight_info[label_text] = value_text
                else:
                    # Record the label with a missing value.
                    highlight_info[label_text] = None

            # Facts stored as <div class="column-label"> / <div class="column-value"> pairs.
            # These are merged into the same dict (the <dt>/<dd> facts above are kept);
            # on a duplicate label the <div> value wins.
            labels = sub_soup.find_all("div", class_="column-label")
            values = sub_soup.find_all("div", class_="column-value")
            for label, value in zip(labels, values):
                highlight_info[label.text.strip()] = value.text.strip()

            # Price and address; None when the element is absent.
            price_tag = sub_soup.find('div', class_="xs-text-2 heavy xs-inline xs-mr1")
            price = price_tag.text.strip() if price_tag is not None else None
            address_tag = sub_soup.find('h1', class_="address xs-text-4 sm-text-3 truncate heavy")
            address = address_tag.text.strip() if address_tag is not None else None

            # Room table: one entry per <tr> that has at least two <span class="priv"> cells.
            room_names = []
            room_dimensions = []
            room_properties = []

            rooms = sub_soup.find_all("section", class_="listing-rooms tables md-mb5")
            print(rooms)

            for room_section in rooms:
                for row in room_section.find_all("tr"):
                    print(row)

                    room_data = row.find_all("span", class_="priv")

                    if len(room_data) < 2:
                        print("Not enough data in row.")
                        continue

                    room_name = room_data[0].text.strip()  # First span: room name
                    room_dimension = room_data[1].text.strip()  # Second span: room dimension
                    # A third span, when present, holds the room's properties.
                    room_property = room_data[2].text.strip() if len(room_data) > 2 else None

                    print(f"Room Name: {room_name}")
                    print(f"Room Dimension: {room_dimension}")
                    print(f"Room Properties: {room_property}")

                    room_names.append(room_name)
                    room_dimensions.append(room_dimension)
                    room_properties.append(room_property)

            listing_info = {
                'price': price,
                'address': address,
                "rooms": room_names,
                "room dimensions": room_dimensions,
                "room_properties": room_properties
            }

            # Merge the labelled facts into the listing record.
            listing_info.update(highlight_info)

            highlight_data.append(listing_info)

        # One DataFrame per results page, built from its listing records.
        df = pd.DataFrame(highlight_data)
        dataframes.append(df)
finally:
    # Always close the browser, even if scraping fails part-way through.
    driver.quit()

# Stack the per-page frames into a single table with a fresh 0..n-1 index.
# Columns missing from some frames are kept and filled with NaN.
final_df = pd.concat(dataframes, ignore_index=True).reset_index(drop=True)

# Write the combined table to disk without the index column.
final_df.to_csv("new_zolo_listings.csv", index=False)