In [2]:
from bs4 import BeautifulSoup
from pathlib import Path
# The saved page is looked up one directory above the current working directory.
current_dir = Path.cwd().parent
file_path = current_dir.joinpath("output.html")  # Adjust the path if needed

# Read the whole document into memory as UTF-8 text.
with file_path.open(mode="r", encoding="utf-8") as file:
    html_content = file.read()

# Build the parse tree using the "html.parser" backend.
soup = BeautifulSoup(html_content, features="html.parser")

# Each listing is an <article> element carrying the "pc-listing-card" class.
listings = soup.find_all(name="article", class_="pc-listing-card")

# Collects one dict per listing; filled in by the extraction loop.
data = []

def _clean_text(tag):
    """Return the whitespace-stripped text of *tag*, or None if *tag* is None."""
    return tag.text.strip() if tag is not None else None


def _detail_after_dash(card, keyword):
    """Return the value that follows the first "-" in a keyword paragraph.

    Looks inside *card* for the first <p> whose string contains *keyword*
    (case-insensitive) and returns the stripped segment between the first
    and second "-" of its text.

    Returns None when no such paragraph exists, or when the paragraph
    contains no "-" at all (indexing the split result directly would raise
    IndexError in that case).
    """
    tag = card.find("p", string=lambda text: text and keyword in text.lower())
    if tag is None:
        return None
    parts = tag.text.split("-")
    return parts[1].strip() if len(parts) > 1 else None


for listing in listings:
    # The anchor may be absent, or present without an href attribute;
    # .get() yields None for the latter instead of raising KeyError.
    link_tag = listing.find("a", class_="photo-wrapper")
    link = link_tag.get("href") if link_tag is not None else None

    # Append one record per listing; any field that cannot be found is None.
    data.append({
        "link": link,
        "strikethrough_price": _clean_text(
            listing.find("span", class_="line-through")
        ),
        "sold_price": _clean_text(listing.find("span", class_="special")),
        "property_type": _clean_text(listing.find("p", class_="type")),
        "address": _clean_text(listing.find("h3", class_="address")),
        "bedrooms": _detail_after_dash(listing, "bedroom"),
        "bathrooms": _detail_after_dash(listing, "bathroom"),
        "garages": _detail_after_dash(listing, "garage"),
    })

# Display the extracted data and persist it as CSV.
import pandas as pd

# Fixing the column list keeps the schema stable: with zero listings the
# frame (and the CSV header row) still carries every column, instead of
# producing a header-less file that downstream readers cannot parse.
LISTING_COLUMNS = [
    "link",
    "strikethrough_price",
    "sold_price",
    "property_type",
    "address",
    "bedrooms",
    "bathrooms",
    "garages",
]

df = pd.DataFrame(data, columns=LISTING_COLUMNS)
print(df)

# index=False omits the synthetic row index from the file.
df.to_csv("extracted_houses_paginated.csv", index=False)



                                                 link strikethrough_price  \
0   /on/etobicoke-real-estate/508-40-richview-rd/h...            $729,888   
1   /on/toronto-real-estate/2611-8-eglinton-ave-e/...            $659,800   
2   /on/markham-real-estate/25-ryler-way/home/9w8o...          $1,256,900   
3   /on/toronto-real-estate/ph03-159-wellesley-st-...            $549,990   
4   /on/richmond-hill-real-estate/204-330-red-mapl...            $600,000   
5   /on/vaughan-real-estate/113-solway-ave/home/Lz...          $1,338,888   
6   /on/etobicoke-real-estate/405-35-fontenay-crt/...            $590,888   
7   /on/toronto-real-estate/12-51-florence-st/home...            $699,000   
8   /on/markham-real-estate/266-16-elgin-st/home/g...            $779,000   
9   /on/toronto-real-estate/241-lansdowne-ave/home...          $2,795,000   
10  /on/vaughan-real-estate/146-laurelhurst-cres/h...          $1,099,900   
11  /on/toronto-real-estate/53-wilson-park-rd/home...          $2,199,000   