In [50]:
import os
import pandas as pd
from pathlib import Path

from tqdm import tqdm

ROOT_DIR = Path(os.getcwd()).parent
DATA_DIR = ROOT_DIR / 'data'

In [3]:
from src import utils

In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

In [5]:
driver = webdriver.Chrome()
driver.get("https://www.brickz.my/transactions/residential/kuala-lumpur/")

Find element by CSS selector and click. A NoSuchElementException will be thrown if the pagination is at the end.

In [6]:
try:
    driver.find_element(
        By.CSS_SELECTOR, 
        "#post-467083 > div:nth-child(3) > div.ptd_list_table_title.table > div.ptd_table_toolbar > div > a.next.page-numbers"
    ).click()
except:
    print("At the end of the pagination.")

For the table of projects/townships, the CSS elements are structured as follows:
- project_name: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > a"`
- location: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > span"`
- url_link: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > a"`
- tenure: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(2) > span"`
- median_psf: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(3) > span"`
- median_price: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(4) > span"`
- filed_transactions: `"#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(5) > a"`

Notice that the number in `td:nth-child(X)` increments for column, therefore the `tr:nth-child(X)` is the row increment. Additionally, each page shows a maximum of 10 projects/townships. Using this information, we can use a loop to increment through the 10 rows of the table. Also we wrap this within a try-except block so that it catches the exception if there are less than 10 rows in the final page.

In [14]:
project_name = driver.find_element(By.CSS_SELECTOR, "#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > a").text
location = driver.find_element(By.CSS_SELECTOR, "#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > span").text
url_link = driver.find_element(By.CSS_SELECTOR, "#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(1) > a").get_attribute("href")
tenure = driver.find_element(By.CSS_SELECTOR, "#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(2) > span").text
median_psf = driver.find_element(By.CSS_SELECTOR, '#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(3) > span').text
median_price = driver.find_element(By.CSS_SELECTOR, '#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(4) > span').text
filed_transactions = driver.find_element(By.CSS_SELECTOR, '#ptd_list_table > tbody > tr:nth-child(1) > td:nth-child(5) > a').text

In [21]:
projects = []
for i in range(10):
    project_name = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > a").text
    location = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > span").text
    url_link = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > a").get_attribute("href")
    tenure = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(2) > span").text
    median_psf = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(3) > span').text
    median_price = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(4) > span').text
    filed_transactions = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(5) > a').text
    
    projects.append([project_name, location, url_link, tenure, median_psf, median_price, filed_transactions])

projects[:2]

[['OVERSEAS UNION GARDEN',
  'OLD KLANG ROAD, KUALA LUMPUR',
  'https://www.brickz.my/transactions/residential/kuala-lumpur/old-klang-road/overseas-union-garden/landed/',
  'FREEHOLD',
  '519',
  '900,000',
  '35 Transactions'],
 ['SRI PENARA',
  'CHERAS, KUALA LUMPUR',
  'https://www.brickz.my/transactions/residential/kuala-lumpur/cheras/sri-penara/non-landed/',
  'LEASEHOLD',
  '395',
  '255,000',
  '32 Transactions']]

In [22]:
column_names = ['project_name', 'location', 'url_link', 'tenure', 'median_psf', 'median_price', 'filed_transactions']

df = pd.DataFrame(projects, columns=column_names)
df

Unnamed: 0,project_name,location,url_link,tenure,median_psf,median_price,filed_transactions
0,OVERSEAS UNION GARDEN,"OLD KLANG ROAD, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,519,900000,35 Transactions
1,SRI PENARA,"CHERAS, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,395,255000,32 Transactions
2,TAMAN SRI SINAR,"SEGAMBUT, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,659,418000,31 Transactions
3,TAMAN MIDAH,"CHERAS, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,484,805000,30 Transactions
4,DAMANSARA HEIGHTS (BUKIT DAMANSARA),"DAMANSARA HEIGHTS, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,684,3525000,30 Transactions
5,WANGSA MAJU SEKSYEN 1,"WANGSA MAJU, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,386,205000,30 Transactions
6,ENDAH REGAL,"SRI PETALING, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,368,420000,29 Transactions
7,RESIDENSI 22 MONT KIARA,"MONT KIARA, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,957,2055000,28 Transactions
8,ANGKASA,"CHERAS, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,410,380000,25 Transactions
9,LAKE FIELD - MEADOWS & GLADES,"SUNGAI BESI, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,687,1300000,25 Transactions


Now let's combine the page navigation click and table scraping together:
```
Start selenium

Go to https://www.brickz.my/transactions/residential/kuala-lumpur/

Create an empty list

For each page:
    For each row in the table:
        Scrape information from each column using CSS selector
        Append to the empty list
    Click next page

Stop selenium

Make the list into a dataframe with column names
```

In [35]:
%%time
driver = webdriver.Chrome()
driver.get("https://www.brickz.my/transactions/residential/kuala-lumpur/?range=1909+May-")

projects = []
next_page = True

while next_page:
    for i in range(10):
        try:
            project_name = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > a").text
            location = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > span").text
            url_link = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(1) > a").get_attribute("href")
            tenure = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(2) > span").text
            median_psf = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(3) > span').text
            median_price = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(4) > span').text
            filed_transactions = driver.find_element(By.CSS_SELECTOR, f'#ptd_list_table > tbody > tr:nth-child({i+1}) > td:nth-child(5) > a').text
            
            projects.append([project_name, location, url_link, tenure, median_psf, median_price, filed_transactions])
        except:
            print("Less than 10 rows available.")
    
    try:
        driver.find_element(
            By.CSS_SELECTOR, 
            "#post-467083 > div:nth-child(3) > div.ptd_list_table_title.table > div.ptd_table_toolbar > div > a.next.page-numbers"
        ).click()
    except:
        next_page = False
        print("At the end of the pagination.")
driver.close()

column_names = ['project_name', 'location', 'url_link', 'tenure', 'median_psf', 'median_price', 'filed_transactions']
df = pd.DataFrame(projects, columns=column_names)

Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
Less than 10 rows available.
At the end of the pagination.
CPU times: total: 49 s
Wall time: 1h 12min 55s


A total time of 72m55.4s or 1h12m55s was used to scrape the list of projects/townships table.

In [38]:
df['filed_transactions'] = df['filed_transactions'].str.replace(' Transaction*', '', regex=True)
df

Unnamed: 0,project_name,location,url_link,tenure,median_psf,median_price,filed_transactions
0,BANDAR BARU SRI PETALING,"SRI PETALING, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,201,332500,3952
1,TAMAN TUN DR ISMAIL,"TAMAN TUN DR ISMAIL, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,"FREEHOLD, LEASEHOLD",334,820000,2849
2,TAMAN MELATI,"SETAPAK, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,78,52500,2633
3,DAMANSARA HEIGHTS (BUKIT DAMANSARA),"DAMANSARA HEIGHTS, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,"FREEHOLD, LEASEHOLD",409,2380000,2214
4,BANDAR BARU WANGSA MAJU,"WANGSA MAJU, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,145,80000,1833
...,...,...,...,...,...,...,...
1615,TAMAN GOODWOOD,"KUCHAI LAMA, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,125,220000,15
1616,TAMAN NAM FONG,"OLD KLANG ROAD, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,146,250000,15
1617,RESIDENSI PANTAI SENTRAL 2,"PANTAI, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,LEASEHOLD,923,1247220,15
1618,LAMAN BAYU,"SEGAMBUT, KUALA LUMPUR",https://www.brickz.my/transactions/residential...,FREEHOLD,675,3080000,15


In [45]:
df['median_psf'] = df['median_psf'].str.replace(',', '', regex=True).astype(int)
df['median_price'] = df['median_price'].str.replace(',', '', regex=True).astype(float)
df['filed_transactions'] = df['filed_transactions'].str.replace(',', '', regex=True).astype(int)

df.to_csv(DATA_DIR / 'townships.tsv', index=False, sep='\t')
df.to_excel(DATA_DIR / 'townships.xlsx', index=False)

There are 333,387 transactions filed in Brickz.my for Kuala Lumpur. In Brickz.my website, it was reported that there are a total of 333,494 transactions from May 1909 to Jun 2023.

In [46]:
df['filed_transactions'].sum()

333387

Next, we need to scrape the data for each township. We utilise the same approach as the previous dataset.

Let's find out the CSS selectors for each column. For the table of transactions, the CSS elements are structured as follows:
- spa_date: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(1)"`
- address: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(2)"`
- building_type: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(3)"`
- tenure: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(4)"`
- floors: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(5)"`
- rooms: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(6)"`
- land_area: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td.ptd_numeric.ptd_multiSize"`
- built_up: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(8)"`
- price_psf: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(9)"`
- price: `"#ptd_list_detail_table > tbody > tr:nth-child(1) > td:nth-child(10)"`

For this table:
- Notice that the number in `td:nth-child(X)` increments for column, therefore the `tr:nth-child(X)` is the row increment
- Additionally, each page shows a maximum of 20 transactions

In [55]:
driver = webdriver.Chrome()

transactions = []

for project_name, url in tqdm(zip(df['project_name'], df['url_link']), total=len(df['url_link'])):
    
    driver.get(url)

    next_page = True

    while next_page:
        for i in range(20):
            try:
                spa_date = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(1)").text
                address = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(2)").text
                building_type = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(3)").text
                tenure = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(4)").text
                floors = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(5)").text
                rooms = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(6)").text
                land_area = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td.ptd_numeric.ptd_multiSize").text
                built_up = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(8)").text
                price_psf = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(9)").text
                price = driver.find_element(By.CSS_SELECTOR, f"#ptd_list_detail_table > tbody > tr:nth-child({i+1}) > td:nth-child(10)").text
                
                transactions.append([project_name, spa_date, address, building_type, tenure, floors, rooms, land_area, built_up, price_psf, price])
            except:
                pass
                # print("Less than 20 rows available.")
        
        try:
            driver.find_element(
                By.CSS_SELECTOR, 
                "#post-467083 > div:nth-child(3) > div.ptd_list_table_title.table > div.ptd_table_toolbar > div > a.next.page-numbers"
            ).click()
        except:
            next_page = False
            # print("At the end of the pagination.")
    
driver.close()

column_names = ['project_name', 'spa_date', 'address', 'building_type', 'tenure', 'floors', 'rooms', 'land_area', 'built_up', 'price_psf', 'price']
df2 = pd.DataFrame(transactions, columns=column_names)

100%|██████████| 1620/1620 [19:19:17<00:00, 42.94s/it]  


It took 19h19m17s total and 42.94s/iteration to scrape through 1620 projects. However, we only ended up with 300+ projects which could suggest that the scraper did skip some URLs which took too long to reach.

In [56]:
df2

Unnamed: 0,project_name,spa_date,address,building_type,tenure,floors,rooms,land_area,built_up,price_psf,price
0,BANDAR BARU SRI PETALING,09/06/2023,"✕✕✕, JALAN PIKRAMA",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,1,,"2,196 ft²",,342,750000
1,BANDAR BARU SRI PETALING,01/06/2023,"✕✕. ✕✕, JALAN PERLAK 3",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,398,300000
2,BANDAR BARU SRI PETALING,29/05/2023,"✕✕ ✕, JALAN 12/149L",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2½,,"3,197 ft²",,188,600000
3,BANDAR BARU SRI PETALING,25/05/2023,"✕✕. ✕✕✕, JALAN PASAI",TERRACE HOUSE - INTERMEDIATE,LEASEHOLD,2,,753 ft²,,531,400000
4,BANDAR BARU SRI PETALING,22/05/2023,"✕✕, JALAN SRI PETALING 5",SEMI-D,LEASEHOLD,2½,,"4,801 ft²",,250,1200000
...,...,...,...,...,...,...,...,...,...,...,...
76771,LAMAN BAYU,30/10/2012,"✕✕, JALAN SERI BAYU LAMAN BAYU",BUNGALOW,FREEHOLD,3½,8,"4,673 ft²","4,297 ft²",698,3260000
76772,LAMAN BAYU,25/10/2012,"✕✕, 22/38A",BUNGALOW,FREEHOLD,3½,8,"4,692 ft²","4,297 ft²",682,3200000
76773,LAMAN BAYU,15/08/2012,"✕✕, LAMAN BAYU",BUNGALOW,FREEHOLD,3,5,"4,514 ft²","3,050 ft²",695,3138880
76774,LAMAN BAYU,18/04/2011,"✕, JALAN SERI BAYU",BUNGALOW,FREEHOLD,3,8,"4,514 ft²","3,050 ft²",640,2888880


In [57]:
df2.to_csv(DATA_DIR / 'transactions.tsv', index=False, sep='\t')
df2.to_excel(DATA_DIR / 'transactions_KL.xlsx', index=False)

In [58]:
df2['project_name'].value_counts()

project_name
BANDAR BARU SRI PETALING               3952
TAMAN TUN DR ISMAIL                    2849
DAMANSARA HEIGHTS (BUKIT DAMANSARA)    2214
KEPONG BARU                            1951
TAMAN BUKIT MALURI                     1718
                                       ... 
TAMAN BUKIT TIARA                        15
JALAN TIONG NAM                          15
TAMAN GOODWOOD                           15
TAMAN NAM FONG                           15
LAMAN BAYU                               15
Name: count, Length: 371, dtype: int64