# WebScraping Dataset

The following code is used to scrape live listing data from a well-known house rental website in the UK. For privacy reasons the URL is masked.

In [29]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import urllib.parse
import pandas as pd
import numpy as np
import time

Function to get the address

In [2]:
def get_address(new_soup):
    """Return the address shown as the title of a property page.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find`` method).

    Returns
    -------
    str or None
        The text of the ``<h1>`` title tag, or ``None`` when the page has no
        such tag (for example an error page or a changed layout).
    """
    # The address is the page title, so it is in the <h1> tag with this class.
    title_tag = new_soup.find("h1", attrs={'class': "_2uQQ3SV0eMHL1P6t5ZDo2q"})

    # find() yields None when nothing matches; report "missing" instead of
    # raising AttributeError on `.text`, so one odd page cannot abort a whole run.
    if title_tag is None:
        return None

    return title_tag.text

Function to get the Furnish Type

In [3]:
def get_furnish_type(new_soup):
    """Return the furnishing status listed on a property page.

    Looks for the ``<dt>`` whose text contains 'Furnish type' and reads the
    text of the ``<dd>`` that follows it.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find`` method).

    Returns
    -------
    str
        The stripped ``<dd>`` text, or ``"No information"`` when either the
        label or its value is missing.
    """
    # The predicate can be handed None (a tag with no single string child),
    # so guard before using `in` to avoid a TypeError.
    label_tag = new_soup.find(
        "dt",
        string=lambda text: text is not None and "Furnish type" in text,
    )
    if label_tag is None:
        return "No information"

    # The value lives in the next <dd> after the label.
    value_tag = label_tag.find_next("dd")
    if value_tag is None:
        return "No information"

    return value_tag.get_text(strip=True)

Function to get the apartment type

In [4]:
def get_apartment_type(new_soup):
    """Return the property type listed on a property page.

    Scans every ``<dd>`` carrying the details class and picks the one whose
    preceding sibling ``<dt>`` reads exactly 'PROPERTY TYPE'.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find_all`` method).

    Returns
    -------
    str or None
        The stripped text of the value ``<p>`` tag, or ``None`` when no
        matching entry with a value is found.
    """
    for dd in new_soup.find_all('dd', class_='_3ZGPwl2N1mHAJH3cbltyWn'):

        # A <dd> without a preceding <dt> sibling cannot be the entry we want;
        # skip it rather than raising AttributeError on `None.text`.
        label_tag = dd.find_previous_sibling('dt')
        if label_tag is None or label_tag.text.strip() != 'PROPERTY TYPE':
            continue

        # The value itself sits in a <p> tag with this class.
        p_tag = dd.find('p', class_='_1hV1kqpVceE9m-QrX_hWDN')
        if p_tag is not None:
            return p_tag.text.strip()

    return None

Function to get the number of bedrooms

In [5]:
def get_bedrooms(new_soup):
    """Return the number of bedrooms listed on a property page.

    Scans every ``<dd>`` carrying the details class and picks the one whose
    preceding sibling ``<dt>`` reads exactly 'BEDROOMS'.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find_all`` method).

    Returns
    -------
    str or None
        The stripped text of the value ``<p>`` tag (returned as text, not
        converted to a number), or ``None`` when no matching entry with a
        value is found.
    """
    details = new_soup.find_all('dd', class_='_3ZGPwl2N1mHAJH3cbltyWn')

    for dd in details:
        dt_tag = dd.find_previous_sibling('dt')

        # No <dt> sibling means this <dd> is not a labelled detail; skipping it
        # avoids an AttributeError on `None.text`.
        if dt_tag is None:
            continue

        if dt_tag.text.strip() == 'BEDROOMS':
            # The count is held in a <p> tag with this class.
            p_tag = dd.find('p', class_='_1hV1kqpVceE9m-QrX_hWDN')
            if p_tag is not None:
                return p_tag.text.strip()

    return None

Function to get the number of bathrooms

In [6]:
def get_bathroom(new_soup):
    """Return the number of bathrooms listed on a property page.

    Scans every ``<dd>`` carrying the details class and picks the one whose
    preceding sibling ``<dt>`` reads exactly 'BATHROOMS'.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find_all`` method).

    Returns
    -------
    str or None
        The stripped text of the value ``<p>`` tag (returned as text, not
        converted to a number), or ``None`` when no matching entry with a
        value is found.
    """
    for dd in new_soup.find_all('dd', class_='_3ZGPwl2N1mHAJH3cbltyWn'):
        heading = dd.find_previous_sibling('dt')

        # Guard against a <dd> that has no <dt> before it: previously this
        # raised AttributeError and stopped the whole scrape.
        is_bathroom_entry = heading is not None and heading.text.strip() == 'BATHROOMS'
        if not is_bathroom_entry:
            continue

        # The count is held in a <p> tag with this class.
        p_tag = dd.find('p', class_='_1hV1kqpVceE9m-QrX_hWDN')
        if p_tag is not None:
            return p_tag.text.strip()

    return None

Function to get the Rent

In [7]:
def get_price(new_soup):
    """Return the rent text shown on a property page.

    Parameters
    ----------
    new_soup : parsed property page (a BeautifulSoup object, or anything
        exposing a compatible ``find`` method).

    Returns
    -------
    str or None
        The stripped text of the first ``<span>`` inside the first price
        ``<div>``, exactly as displayed (no numeric conversion), or ``None``
        when either tag is missing.
    """
    # Only the first matching <div> is used, so a single find() is enough;
    # the earlier find_all() result was never read.
    price_div = new_soup.find('div', class_='_1gfnqJ3Vtd1z40MlC0MzXu')
    if price_div is None:
        return None

    # The rent is in the <span> inside that <div>.
    price_span = price_div.find('span')
    if price_span is None:
        return None

    return price_span.text.strip()

Function to check if the current page is the last page

In [13]:
def is_last_page(soup):
    """Tell whether the parsed results page is the final one.

    The page counts as last when it contains a 'Next' pagination button that
    carries the ``disabled`` attribute.

    Parameters
    ----------
    soup : parsed results page exposing a ``find`` method.

    Returns
    -------
    bool
        ``True`` if such a disabled button is found, ``False`` otherwise.
    """
    # Attribute filter: the pagination 'Next' button, with `disabled` present.
    disabled_next_attrs = {"data-test": "pagination-next", "disabled": True}
    match = soup.find("button", disabled_next_attrs)

    return True if match else False

Function to wait for the 'Next' button on the page and return it if it is clickable

In [20]:
def get_next_button(driver):
    """Wait for the pagination 'Next' button and return it once clickable.

    Parameters
    ----------
    driver : Selenium WebDriver currently showing a results page.

    Returns
    -------
    The button element, already scrolled into view, or ``None`` if a
    ``TimeoutException`` is raised (the wait below is limited to 10 seconds).
    """
    next_locator = (By.CSS_SELECTOR, "button[data-test='pagination-next']")

    try:
        # Block for up to 10 seconds until the button can be clicked.
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(next_locator)
        )
        # Bring the button into the viewport before handing it back.
        driver.execute_script("arguments[0].scrollIntoView(true);", button)
    except TimeoutException:
        # The button did not become clickable in time.
        return None

    return button

In [26]:
def get_driver():
    """Create and return a headless Microsoft Edge WebDriver.

    The driver binary path is obtained from ``EdgeChromiumDriverManager().install()``.

    Returns
    -------
    A ``webdriver.Edge`` instance; the caller is responsible for calling
    ``quit()`` on it when finished.
    """
    options = webdriver.EdgeOptions()
    # NOTE(review): presumably a leftover Selenium 3 flag; confirm it is still needed.
    options.use_chromium = True
    options.add_argument("--headless")  # Run the browser without a visible window

    driver_path = EdgeChromiumDriverManager().install()
    return webdriver.Edge(service=Service(driver_path), options=options)

Function to check the property cards

In [27]:
def check_property_cards(soup):
    """Report whether the parsed page contains any property cards.

    Parameters
    ----------
    soup : parsed results page exposing a ``find_all`` method.

    Returns
    -------
    bool
        ``True`` when at least one ``<div>`` with the 'propertySearch' class
        is present, ``False`` otherwise.
    """
    # Count the divs carrying the 'propertySearch' class.
    card_count = len(soup.find_all("div", class_="propertySearch"))

    return card_count > 0

## Main Function

The field-extraction functions above are called in turn for every property page found in the search results.

The dataframe will then be saved into a csv file

In [30]:
if __name__ == '__main__':
    # Scrape every search-results page, visit each listed property, collect
    # its fields, then write the cleaned table to london_property.csv.

    # Reuse the notebook's driver factory instead of repeating its setup here.
    driver = get_driver()

    base_url = 'https://www.xxxxxxxxxxxxxxx.co.uk/property-to-rent/find.html'
    params = {
        'locationIdentifier': 'REGION^87490',
        'index': 0,  # offset of the first listing on the requested results page
        'propertyTypes': '',
        'includeLetAgreed': 'false',
        'mustHave': '',
        'dontShow': '',
        'furnishTypes': '',
        'keywords': ''
    }

    # One list per output column; all lists grow together, one entry per visited property page.
    d = {"address":[], "furnish_type":[], "apartment_type":[], "bedrooms":[], "bathrooms":[], "rent":[]}

    try:
        while True:
            # Requesting the results page for the current index
            driver.get(f"{base_url}?{urllib.parse.urlencode(params)}")
            time.sleep(2)  # Allow the page to load completely
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # A single lookup serves both the "any results?" check and the iteration.
            # NOTE(review): a card may expose several anchors with this class, which
            # would visit the same property more than once; confirm whether to dedupe.
            links = soup.find_all("a", attrs={'class': "propertyCard-link"})

            # No property links means we have run past the last results page.
            if not links:
                break

            for link in links:
                href = link.get('href')

                # An anchor without a target would otherwise yield a bogus URL.
                if not href:
                    continue

                property_url = f"https://www.xxxxxxxxx.co.uk{href}"

                driver.get(property_url)
                time.sleep(2)  # Wait for the page to load
                new_soup = BeautifulSoup(driver.page_source, 'html.parser')

                d['address'].append(get_address(new_soup))
                d['furnish_type'].append(get_furnish_type(new_soup))
                d['apartment_type'].append(get_apartment_type(new_soup))
                d['bedrooms'].append(get_bedrooms(new_soup))
                d['bathrooms'].append(get_bathroom(new_soup))
                d['rent'].append(get_price(new_soup))

            # Incrementing the index for the next page by the number of listings per page
            params['index'] += 24
    finally:
        # Always release the browser process, even when a page fails to load or parse.
        driver.quit()

    # Creating a DataFrame and saving it to CSV
    london_property_df = pd.DataFrame.from_dict(d)

    # Assign the results back instead of using inplace=True on a column selection:
    # a chained in-place call may act on a temporary copy and leave the frame unchanged.
    london_property_df['address'] = london_property_df['address'].replace('', np.nan)
    london_property_df = london_property_df.dropna(subset=['address'])

    london_property_df.to_csv("london_property.csv", header=True, index=False)

In [31]:
london_property_df.shape

(2100, 6)