In [172]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import time

In [173]:
# Base URL of the Royal LePage listing of real estate properties in Toronto;
# the scraping loop appends the page number directly to this string.
website = 'https://www.royallepage.ca/en/on/toronto/properties/'

In [174]:
# Request headers carrying a desktop-browser User-Agent string, so the requests
# look somewhat like browser traffic and are less likely to be blocked by the site.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    ),
}

In [175]:
# Scrape the paginated listing pages and collect one record (dict) per property.

# Start from an empty list every time this cell runs. `rows` must exist before
# the first append (otherwise a fresh kernel raises NameError), and resetting it
# here keeps a re-run from appending to results left over from an earlier run.
rows = []

# The postal code is read from the alt text of an <img> inside the listing card.
# The pattern is loop-invariant, so it is compiled once up front.
postal_code_pattern = r'Toronto, ON ([A-Z]\d[A-Z] \d[A-Z]\d)'
postal_code_re = re.compile(postal_code_pattern)

# Iterate through the website pages (page numbers 1 through 19)
for page_number in range(1, 20):
    url = f"{website}{page_number}"
    # A timeout keeps a single stalled request from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error status instead of parsing an error page
    # and silently recording zero listings for this page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    listings = soup.find_all('li', {'class': 'card-group__item'})
    # Iterate through the individual real estate properties
    for listing in listings:
        currency_span = listing.find('span', {'class': 'currency'})
        price_span = listing.find('span', {'class': 'price'})
        # Check if a price is listed; a card without one should not abort the whole scrape
        if currency_span and price_span:
            # Drop the first character of the currency text, then append the price text
            price = currency_span.text[1:] + price_span.text
        else:
            price = 'Not listed'
        address_tag = listing.find('address', {'class': 'address-1'})
        # Check if an address is listed
        if address_tag:
            address = address_tag.text.strip()
        else:
            address = 'Not listed'
        bedrooms_span = listing.find('span', {'class': 'beds'})
        # Check if bedrooms are listed
        if bedrooms_span:
            bedrooms = int(bedrooms_span.text.strip())
        else:
            bedrooms = 'Not listed'
        bathrooms_span = listing.find('span', {'class': 'baths'})
        # Check if bathrooms are listed
        if bathrooms_span:
            bathrooms = float(bathrooms_span.text.strip())
        else:
            bathrooms = 'Not listed'
        postal_code_span = listing.find('img', alt=postal_code_re)
        # Check if postal codes are listed
        if postal_code_span:
            postal_code = postal_code_re.search(postal_code_span['alt']).group(1)
        else:
            postal_code = 'Not listed'
        row = {'price': price, 'address': address, 'bedrooms': bedrooms, 'bathrooms': bathrooms, 'postal_code': postal_code}
        rows.append(row)
    # Add a delay between requests so as to not overwhelm the website
    time.sleep(1)

In [183]:
# Create the dataframe

# Naming the columns explicitly keeps the schema (and the exported CSV header)
# intact even when the scrape produced no rows. With rows present the result is
# unchanged, because every record carries exactly these keys in this order.
df = pd.DataFrame(rows, columns=['price', 'address', 'bedrooms', 'bathrooms', 'postal_code'])

In [182]:
# Leave the frame as the cell's last expression so Jupyter renders it as a rich
# table instead of plain text; by default pandas still truncates long frames to
# their first and last rows.
df

          price                     address    bedrooms   bathrooms  \
0    $2,599,900              599 Spadina Rd           4  Not listed   
1    $1,999,999               61 Cameron St           6  Not listed   
2    $1,750,000         #106 -4750 Yonge St  Not listed  Not listed   
3    $1,688,000              140 Bogert Ave           3  Not listed   
4    $1,520,000       150 Harlandale Avenue           4  Not listed   
..          ...                         ...         ...         ...   
271   $$735,000            53 Templeton Crt  Not listed  Not listed   
272   $$729,000           #703 -87 Peter St  Not listed  Not listed   
273   $$709,000           #7 -38 Gibson Ave  Not listed  Not listed   
274   $$699,000          45 English Ivy Way  Not listed  Not listed   
275   $$699,000  #1201 -2015 Sheppard Ave E  Not listed  Not listed   

    postal_code  
0       M5P 2X1  
1       M5T 2H1  
2    Not listed  
3       M2N 1K8  
4       M2N 1P4  
..          ...  
271     M1E 2C3  
272

In [184]:
# Write the scraped listings to a CSV file in the working directory,
# leaving the dataframe index out of the file.
df.to_csv(path_or_buf='toronto_real_estate.csv', index=False)