### Web scraping for [Bayut Website](https://www.bayut.sa/)

In [3]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [23]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


## Using BeautifulSoup

In [5]:
# Fetch the Riyadh for-sale listing page with plain HTTP and report the status code.
# NOTE(review): the gad_source/gclid query parameters look like ad-tracking values
# copied from a browser session — confirm whether they can be dropped.
url = 'https://www.bayut.sa/en/for-sale/properties/riyadh/?gad_source=1&gclid=Cj0KCQiA8q--BhDiARIsAP9tKI1q83-uzx0h4q3l-T7KX0_wS3wKC2_GUuulMmtFuNkHIOSLTmYrcJ0aAjYAEALw_wcB'

# requests has no default timeout; without one a stalled connection blocks the cell forever.
REQUEST_TIMEOUT_SECONDS = 30
page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
print(page.status_code)

200


In [8]:
# Confirm the decoded response body is a string; only its type is printed,
# not the (very long) HTML itself.
html_text = page.text
print(type(html_text))

<class 'str'>


In [10]:
# Parse the raw response bytes with Python's built-in HTML parser and
# print the type of the resulting object.
soup = BeautifulSoup(markup=page.content, features='html.parser')
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [12]:
# Extract the text inside the document's <title>...</title> element and display it.
page_title = soup.title.get_text()
page_title

'Properties for Sale in Riyadh | Bayut KSA'

## Scraping Using Selenium

In [27]:

# ================
#  CONFIGURATION
# ================
START_PAGE = 2  # NOTE(review): page 1 is never requested by this loop — confirm that is intended
END_PAGE = 702  # or any max page you want
BASE_URL = "https://www.bayut.sa/en/for-sale/properties/ksa/page-{page_num}/"

COOKIE_WAIT_SECONDS = 5    # how long to wait for the consent banner on the first page loaded
LISTING_WAIT_SECONDS = 10  # how long to wait for listing cards on every page
MISSING_VALUE = "N/A"      # placeholder stored when a field cannot be read from a card

COOKIE_BUTTON_ID = "onetrust-accept-btn-handler"
LISTING_SELECTOR = "li[aria-label='Listing']"

# Output column -> CSS selector, evaluated relative to one listing card.
# NOTE(review): the Area/Location selectors use hashed class names that look
# build-generated; they may stop matching after a site redeploy — verify.
FIELD_SELECTORS = {
    "Price": "span[aria-label='Price']",
    "Type": "span[aria-label='Type']",
    "Beds": "span[aria-label='Beds']",
    "Baths": "span[aria-label='Baths']",
    "Area": "h4.cfac7e1b._85ddb82f",
    "Location": "h3._4402bd70",
}


def _field_text(listing, css_selector, default=MISSING_VALUE):
    """Return the stripped text of the first element under `listing` matching `css_selector`.

    Returns `default` when the element is missing or the card has gone stale.
    Only those two Selenium errors are swallowed, so a kernel interrupt or a
    dead browser session still surfaces instead of silently producing
    placeholder rows.
    """
    try:
        return listing.find_element(By.CSS_SELECTOR, css_selector).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        return default


# --------------------------------
#  1) Setup Selenium WebDriver
# --------------------------------
chrome_options = Options()
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# --------------------------------
#  2) Prepare Data Storage
# --------------------------------
all_data = []
cookie_banner_checked = False

# --------------------------------
#  3) Loop Through Pages
# --------------------------------
try:
    for page_num in range(START_PAGE, END_PAGE + 1):
        url = BASE_URL.format(page_num=page_num)
        print(f"Scraping page {page_num} of {END_PAGE}: {url}")

        driver.get(url)

        # 3a) Accept cookies if there's a banner.
        # WebDriverWait requires an explicit timeout. The banner is only looked
        # for on the first page loaded so that later pages do not each pay the
        # full wait when no banner is present.
        if not cookie_banner_checked:
            cookie_banner_checked = True
            try:
                accept_btn = WebDriverWait(driver, COOKIE_WAIT_SECONDS).until(
                    EC.element_to_be_clickable((By.ID, COOKIE_BUTTON_ID))
                )
                accept_btn.click()
            except WebDriverException:
                # Best effort: no banner appeared in time (TimeoutException is a
                # WebDriverException) or the click was rejected by the browser.
                pass

        # 3b) Wait for listings to appear
        try:
            WebDriverWait(driver, LISTING_WAIT_SECONDS).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, LISTING_SELECTOR))
            )
        except TimeoutException:
            print(f"No listings found on page {page_num}. Skipping...")
            continue

        # 3c) Get all listings
        listings = driver.find_elements(By.CSS_SELECTOR, LISTING_SELECTOR)
        print(f"Found {len(listings)} listings on page {page_num}")

        for prop in listings:
            # One row per card: the page number followed by each field in column order.
            all_data.append(
                [page_num] + [_field_text(prop, selector) for selector in FIELD_SELECTORS.values()]
            )
finally:
    # --------------------------------
    #  4) Close the Browser
    # --------------------------------
    # Runs even when the loop is interrupted or fails, so Chrome is not left running.
    driver.quit()

# --------------------------------
#  5) Convert to Pandas DataFrame
# --------------------------------
df1 = pd.DataFrame(all_data, columns=["Page", *FIELD_SELECTORS])

print(df1.head())
print(f"Total records scraped: {len(df1)}")

# Save to CSV
df1.to_csv("bayut_properties.csv", index=False)

Scraping page 2 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-2/
Found 25 listings on page 2
Scraping page 3 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-3/
Found 25 listings on page 3
Scraping page 4 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-4/
Found 25 listings on page 4
Scraping page 5 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-5/
Found 25 listings on page 5
Scraping page 6 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-6/
Found 25 listings on page 6
Scraping page 7 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-7/
Found 25 listings on page 7
Scraping page 8 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-8/
Found 25 listings on page 8
Scraping page 9 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-9/
Found 25 listings on page 9
Scraping page 10 of 702: https://www.bayut.sa/en/for-sale/properties/ksa/page-10/
Found 25 listings on page 10
Scraping page 11 of 702: 

In [29]:
# Display the scraped listings table (one row per listing card).
df1

Unnamed: 0,Page,Price,Type,Beds,Baths,Area,Location
0,2,680000,Apartment,6,4,230 Sq. M.,"Al Rayaan, North Jeddah, Jeddah"
1,2,450000,Apartment,4,4,900 Sq. M.,"Al Safa, North Jeddah, Jeddah"
2,2,450000,Apartment,4,3,136 Sq. M.,"Bryman, North Jeddah, Jeddah"
3,2,580000,Apartment,4,3,134 Sq. M.,"Al Manar, North Jeddah, Jeddah"
4,2,670000,Apartment,5,3,162 Sq. M.,"Al Nuzhah, North Jeddah, Jeddah"
...,...,...,...,...,...,...,...
17499,701,1200000,Apartment,4,4,262 Sq. M.,"Al Marwah, North Jeddah, Jeddah"
17500,702,520000,Apartment,3,2,176 Sq. M.,"Al Maealaa, Ahad Rafidah"
17501,702,630000,Apartment,6,4,236 Sq. M.,"Al jameen, Khamis Mushait"
17502,702,680000,Villa,5,4,250 Sq. M.,"Al Muruj, Unayzah"


In [33]:
# Count rows that exactly repeat an earlier row across all columns.
df1.duplicated(subset=None, keep="first").sum()

1493

In [35]:
# Count missing values per column.
# The scraper stores the string "N/A" when a field cannot be read, so real nulls
# alone would always report 0; count the placeholder as missing too.
(df1.isna() | df1.isin(["N/A"])).sum()

Page        0
Price       0
Type        0
Beds        0
Baths       0
Area        0
Location    0
dtype: int64

In [37]:
# Summarise the scraped table: column dtypes, non-null counts and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17504 entries, 0 to 17503
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Page      17504 non-null  int64 
 1   Price     17504 non-null  object
 2   Type      17504 non-null  object
 3   Beds      17504 non-null  object
 4   Baths     17504 non-null  object
 5   Area      17504 non-null  object
 6   Location  17504 non-null  object
dtypes: int64(1), object(6)
memory usage: 957.4+ KB
