Author: Muhammad Rafi Haidar

Kontak: 18221134@std.stei.itb.ac.id

Program untuk melakukan scraping data mengenai properti di raywhite.co.id

In [14]:
# Library yang digunakan
from bs4 import BeautifulSoup
from lxml import etree
import pandas as pd
import requests
import re
from tqdm import tqdm

# XPath expressions used on the listing detail page.
# They are assembled from shared prefixes so the common parts of the page
# structure are written only once; the resulting strings are unchanged.
_DETAIL_ROOT = '//*[@id="detail-sale"]/div/div'
_HEADER_BOX = _DETAIL_ROOT + '/div[1]/div/div'
_INFO_BOX = _HEADER_BOX + '/div[1]'
_AGENT_BOX = _HEADER_BOX + '/div[2]/div'
_SPEC_TABLE = _DETAIL_ROOT + '/div[2]/div[3]/table/tbody'

# Property information
TITLE_XPATH = _INFO_BOX + '/h1'
LOCATION_XPATH = _INFO_BOX + '/p[2]'
PRICE_XPATH = _INFO_BOX + '/div[1]/div/label[2]'
# Property specification (SPEC_XPATH takes a row index and a column index)
LISTING_ID_XPATH = _SPEC_TABLE + '/tr[2]/td[3]'
LIVE_ID_XPATH = _SPEC_TABLE + '/tr[1]/td[3]'
SPEC_XPATH = _SPEC_TABLE + '/tr[{}]/td[{}]'
# Property agent information
REALTOR_XPATH = _AGENT_BOX + '/h5/a'
REALTOR_OFFICE_XPATH = _AGENT_BOX + '/div[1]/a'
PHONE_XPATH = _AGENT_BOX + '/div[2]/a[1]'

In [8]:
# Fungsi dan Prosedur

# isConvertibleInt(string) -> bool
# Checks whether a value can be converted to an integer.
def isConvertibleInt(string):
    """Return True if ``int(string)`` succeeds, False otherwise.

    Args:
        string: The value to test, normally a str.

    Returns:
        bool: True when ``int(string)`` raises neither ValueError nor
        TypeError; False otherwise.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # ValueError: text that is not an integer literal (e.g. "", "1,000").
        # TypeError: a value int() cannot accept at all (e.g. None).
        return False
    return True

# extract_specification(dom, spec_xpath=None) -> list
# Scrapes the property-specification table of a listing page.
def extract_specification(dom, spec_xpath=None):
    """Read the specification table and return its known fields as a list.

    Rows 1..8 of the table are scanned. For each row the label is read from
    column 2 and the value from column 3 of ``spec_xpath``. Scanning stops at
    the first row whose label cell does not exist.

    Args:
        dom: Object exposing ``xpath(query)`` that returns a list of nodes
            with a ``text`` attribute (e.g. the result of ``etree.HTML``).
        spec_xpath: XPath template with two ``{}`` placeholders (row, column).
            Defaults to the module-level ``SPEC_XPATH``.

    Returns:
        list: ``[listing_id, live_id, building_size, land_size, certificate,
        bedroom, bathroom, carport]``. Fields that are missing or empty keep
        their defaults (``None`` for certificate, ``0`` for the rest).
        ``building_size`` and ``land_size`` are always ints (``0`` when the
        scraped text is not an integer); the other values are returned as
        the scraped strings.
    """
    if spec_xpath is None:
        spec_xpath = SPEC_XPATH

    # Row label -> position in the returned list.
    field_index = {
        'Listing ID': 0,
        'Live ID': 1,
        'Building Size': 2,
        'Land Size': 3,
        'Certificate': 4,
        'Bedroom': 5,
        'Bathroom': 6,
        'Carport': 7,
    }
    size_fields = (2, 3)

    retval = [0, 0, 0, 0, None, 0, 0, 0]

    for row in range(1, 9):
        label_cells = dom.xpath(spec_xpath.format(row, 2))
        if not label_cells:
            # Ran past the last row. Break (not return) so the size
            # conversion below still runs for short tables.
            break

        value_cells = dom.xpath(spec_xpath.format(row, 3))
        if not value_cells:
            continue

        # ``text`` is None for an empty cell; treat that as an empty string.
        label = (label_cells[0].text or "").strip()
        index = field_index.get(label)
        if index is None:
            continue

        value = (value_cells[0].text or "").strip().replace(": ", "")
        if label == 'Live ID':
            # This cell's text starts with a colon followed by line breaks
            # and tabs; drop them regardless of how many there are.
            value = value.lstrip(":").strip()
        elif index in size_fields:
            value = value.replace(" m", "")

        if value != "":
            retval[index] = value

    for index in size_fields:
        if retval[index] != 0:
            try:
                retval[index] = int(retval[index])
            except ValueError:
                # Non-numeric size text: fall back to the default.
                retval[index] = 0

    return retval

In [11]:
# Main program
# URL does not need to be changed
# TYPE is the index (into property_type) of the property type to scrape
# PAGE depends on how many entries should be scraped

# ------ IMPORTANT ------
# - 1 PAGE = 39 ENTRIES -
# - 1 CSV  = 15 PAGES   -
# -----------------------

URL = 'https://www.raywhite.co.id/jual?tipe={}&order=newest&limit=39&page={}'
TYPE = 0
OUTPUT_CSV = 4

ENTRIES_PER_PAGE = 39  # must agree with the `limit=` query parameter in URL
PAGES_PER_CSV = 15
ENTRIES_PER_CSV = ENTRIES_PER_PAGE * PAGES_PER_CSV  # 585 rows per CSV file
PAGE = PAGES_PER_CSV * OUTPUT_CSV

USD_TO_IDR = 15068  # fixed multiplier used to derive value_idr from value_usd
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs the run
LISTING_URL_PREFIX = 'https://www.raywhite.co.id/properti/'
NEGO_PATTERN = re.compile(r"nego", re.IGNORECASE)  # compiled once, reused per listing

# Property types that can be scraped (land is not available yet)
property_type = ['Apartment', 'Commercial', 'Factory', 'House', 'Office', 'Shophouse', 'Villa', 'Warehouse']

# Entries on raywhite.co.id as of 01/07/2023:
# [0] Apartment = 8524 - 14 csv
# [1] Commercial = 9925 - 19 csv
# [2] Factory = 647 - 1 csv
# [3] House = 80798 - 138 csv
# [4] Office = 704 - 1 csv
# [5] Shophouse = 3856 - 6 csv
# [6] Villa = 1288 - 2 csv
# [7] Warehouse = 2824 - 4 csv

# Rows collected for the CSV file currently being filled
house_list = []

# Counters: rows in the current CSV batch, and the number of the next CSV file
count2 = 0
last = 1

# Defined up front so the final print works even when nothing was scraped
df = pd.DataFrame()

# Main loop: one iteration per result page
for i in range(1, PAGE+1):
    count1 = 0
    print('Halaman ke-' + str(i) + '\nEntry yang sudah di-scrape: ', end='')

    try:
        response = requests.get(URL.format(property_type[TYPE], i), timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        # Keep only the links that point at a listing detail page
        unfiltered_urls = soup.find_all('a', href=True)
        filtered_urls = [a['href'] for a in unfiltered_urls if a['href'].startswith(LISTING_URL_PREFIX)]

        for house in filtered_urls:
            count1 += 1
            print(str(count1) + ' ', end='')

            # A failed detail page is skipped instead of aborting the whole run
            try:
                house_response = requests.get(house, timeout=REQUEST_TIMEOUT)
            except requests.RequestException:
                house_response = None
            if house_response is None or house_response.status_code != 200:
                print(f'\nKoneksi Gagal: {house}')
                continue

            house_soup = BeautifulSoup(house_response.content, 'html.parser')
            xml_parsed = etree.HTML(str(house_soup))

            # Title ("" when the node or its text is missing)
            title_nodes = xml_parsed.xpath(TITLE_XPATH)
            title = (title_nodes[0].text or "").strip() if title_nodes else ""

            # Price: value of the second <input> in the price button group
            price_card = house_soup.find('div', class_="btn-group btn-group-sm")
            price_inputs = price_card.find_all('input') if price_card is not None else []
            value_usd = price_inputs[1].get('value', '') if len(price_inputs) > 1 else ''
            value_usd = value_usd.replace(",", "")
            value_usd = int(value_usd) if isConvertibleInt(value_usd) else 0

            # Location: "<city>, <province>"; split on the last ", " so an
            # unexpected number of parts no longer raises on unpacking
            location_nodes = xml_parsed.xpath(LOCATION_XPATH)
            location = (location_nodes[0].text or "").strip() if location_nodes else ""
            city, separator, province = location.rpartition(", ")
            if not separator:
                city, province = location, ""

            # Agent
            realtor_nodes = xml_parsed.xpath(REALTOR_XPATH)
            realtor = (realtor_nodes[0].text or "").strip() if realtor_nodes else ""

            # Agent office
            office_nodes = xml_parsed.xpath(REALTOR_OFFICE_XPATH)
            office = (office_nodes[0].text or "").strip() if office_nodes else ""

            # Negotiation status: True when the price paragraph mentions "nego"
            nego_card = house_soup.find('p', class_="h3 mb-3")
            negotiable = nego_card is not None and bool(NEGO_PATTERN.search(nego_card.text))

            # Property specification
            specification = extract_specification(xml_parsed)

            # Agent contact: text after the first ':' of the link's href,
            # without a leading '+', cut at the first '/'; 0 when unavailable
            phone = 0
            phone_element = xml_parsed.xpath(PHONE_XPATH)
            if phone_element:
                href_parts = (phone_element[0].get('href') or "").split(':')
                if len(href_parts) > 1:
                    phone = href_parts[1].lstrip('+').split("/")[0]
                if phone == "":
                    phone = 0

            house_item = {
                'listing_id': specification[0],
                'type': property_type[TYPE].lower(),
                'title': title,
                'province': province,
                'city': city,
                'value_usd': value_usd,
                'value_idr': value_usd * USD_TO_IDR,
                'negotiable': negotiable,
                'live_id': specification[1],
                'building_size': specification[2],
                'land_size': specification[3],
                'certificate': specification[4],
                'bedroom': specification[5],
                'bathroom': specification[6],
                'carport': specification[7],
                'realtor': realtor,
                'realtor_office': office,
                'contact': phone
            }
            house_list.append(house_item)

            count2 += 1

            # Flush a full batch to its own CSV file and start a new batch
            if count2 % ENTRIES_PER_CSV == 0:
                df = pd.DataFrame(house_list)
                csv_filename = f'raywhite_{property_type[TYPE].lower()}_{last}.csv'
                df.to_csv(csv_filename, index=False)

                count2 = 0
                last += 1

                print(f"\nList telah diubah menjadi DataFrame dan disimpan sebagai {csv_filename}")

                house_list = []

    else:
        print('Koneksi Gagal')

    print('')

# Save whatever is left over from the last, partially filled batch
if len(house_list) > 0:
    df = pd.DataFrame(house_list)
    csv_filename = f'raywhite_{property_type[TYPE].lower()}_{last}.csv'
    df.to_csv(csv_filename, index=False)
    print(f"\nList terakhir telah diubah menjadi DataFrame dan disimpan sebagai {csv_filename}")

print(df)


Halaman ke-1
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-2
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-3
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-4
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-5
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-6
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Halaman ke-7
Entry yang sudah di-scrape: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23

In [18]:
# Main program
# URL does not need to be changed
# TYPE is the index (into property_type) of the property type to scrape
# PAGE depends on how many entries should be scraped

# ------ IMPORTANT ------
# - 1 PAGE = 39 ENTRIES -
# - 1 CSV  = 15 PAGES   -
# -----------------------

URL = 'https://www.raywhite.co.id/jual?tipe={}&order=newest&limit=39&page={}'
TYPE = 7
OUTPUT_CSV = 4

ENTRIES_PER_PAGE = 39  # must agree with the `limit=` query parameter in URL
PAGES_PER_CSV = 15
ENTRIES_PER_CSV = ENTRIES_PER_PAGE * PAGES_PER_CSV  # 585 rows per CSV file
PAGE = PAGES_PER_CSV * OUTPUT_CSV

USD_TO_IDR = 15068  # fixed multiplier used to derive value_idr from value_usd
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request hangs the run
LISTING_URL_PREFIX = 'https://www.raywhite.co.id/properti/'
NEGO_PATTERN = re.compile(r"nego", re.IGNORECASE)  # compiled once, reused per listing

# Property types that can be scraped (land is not available yet)
property_type = ['Apartment', 'Commercial', 'Factory', 'House', 'Office', 'Shophouse', 'Villa', 'Warehouse']

# Entries on raywhite.co.id as of 01/07/2023:
# [0] Apartment = 8524 - 14 csv
# [1] Commercial = 9925 - 19 csv
# [2] Factory = 647 - 1 csv
# [3] House = 80798 - 138 csv
# [4] Office = 704 - 1 csv
# [5] Shophouse = 3856 - 6 csv
# [6] Villa = 1288 - 2 csv
# [7] Warehouse = 2824 - 4 csv

# Rows collected for the CSV file currently being filled
house_list = []

# Counters: rows in the current CSV batch, and the number of the next CSV file
entry_counter = 0
document_counter = 1

# Defined up front so the final print works even when nothing was scraped
df = pd.DataFrame()

# Progress bar; the `with` block closes it even if the loop raises.
# Messages inside the loop go through tqdm.write so they do not break the bar.
with tqdm(total=PAGE * ENTRIES_PER_PAGE, ncols=80) as pbar:
    for i in range(1, PAGE+1):
        try:
            response = requests.get(URL.format(property_type[TYPE], i), timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            response = None

        if response is not None and response.status_code == 200:
            html_content = response.text
            soup = BeautifulSoup(html_content, 'html.parser')

            # Keep only the links that point at a listing detail page
            unfiltered_urls = soup.find_all('a', href=True)
            filtered_urls = [a['href'] for a in unfiltered_urls if a['href'].startswith(LISTING_URL_PREFIX)]

            for house in filtered_urls:
                # A failed detail page is skipped instead of aborting the whole run
                try:
                    house_response = requests.get(house, timeout=REQUEST_TIMEOUT)
                except requests.RequestException:
                    house_response = None
                if house_response is None or house_response.status_code != 200:
                    tqdm.write(f'Koneksi Gagal: {house}')
                    continue

                house_soup = BeautifulSoup(house_response.content, 'html.parser')
                xml_parsed = etree.HTML(str(house_soup))

                # Title ("" when the node or its text is missing)
                title_nodes = xml_parsed.xpath(TITLE_XPATH)
                title = (title_nodes[0].text or "").strip() if title_nodes else ""

                # Price: value of the second <input> in the price button group
                price_card = house_soup.find('div', class_="btn-group btn-group-sm")
                price_inputs = price_card.find_all('input') if price_card is not None else []
                value_usd = price_inputs[1].get('value', '') if len(price_inputs) > 1 else ''
                value_usd = value_usd.replace(",", "")
                value_usd = int(value_usd) if isConvertibleInt(value_usd) else 0

                # Location: "<city>, <province>"; split on the last ", " so an
                # unexpected number of parts no longer raises on unpacking
                location_nodes = xml_parsed.xpath(LOCATION_XPATH)
                location = (location_nodes[0].text or "").strip() if location_nodes else ""
                city, separator, province = location.rpartition(", ")
                if not separator:
                    city, province = location, ""

                # Agent
                realtor_nodes = xml_parsed.xpath(REALTOR_XPATH)
                realtor = (realtor_nodes[0].text or "").strip() if realtor_nodes else ""

                # Agent office
                office_nodes = xml_parsed.xpath(REALTOR_OFFICE_XPATH)
                office = (office_nodes[0].text or "").strip() if office_nodes else ""

                # Negotiation status: True when the price paragraph mentions "nego"
                nego_card = house_soup.find('p', class_="h3 mb-3")
                negotiable = nego_card is not None and bool(NEGO_PATTERN.search(nego_card.text))

                # Property specification
                specification = extract_specification(xml_parsed)

                # Agent contact: text after the first ':' of the link's href,
                # without a leading '+', cut at the first '/'; 0 when unavailable
                phone = 0
                phone_element = xml_parsed.xpath(PHONE_XPATH)
                if phone_element:
                    href_parts = (phone_element[0].get('href') or "").split(':')
                    if len(href_parts) > 1:
                        phone = href_parts[1].lstrip('+').split("/")[0]
                    if phone == "":
                        phone = 0

                house_item = {
                    'listing_id': specification[0],
                    'type': property_type[TYPE].lower(),
                    'title': title,
                    'province': province,
                    'city': city,
                    'value_usd': value_usd,
                    'value_idr': value_usd * USD_TO_IDR,
                    'negotiable': negotiable,
                    'live_id': specification[1],
                    'building_size': specification[2],
                    'land_size': specification[3],
                    'certificate': specification[4],
                    'bedroom': specification[5],
                    'bathroom': specification[6],
                    'carport': specification[7],
                    'realtor': realtor,
                    'realtor_office': office,
                    'contact': phone
                }

                house_list.append(house_item)

                entry_counter += 1

                pbar.update(1)

                # Flush a full batch to its own CSV file and start a new batch
                if entry_counter % ENTRIES_PER_CSV == 0:
                    df = pd.DataFrame(house_list)
                    csv_filename = f'raywhite_{property_type[TYPE].lower()}_{document_counter}.csv'
                    df.to_csv(csv_filename, index=False)

                    tqdm.write(f"\nList terkini telah diubah menjadi DataFrame dan disimpan sebagai {csv_filename}\n")

                    entry_counter = 0
                    document_counter += 1

                    house_list = []

        else:
            tqdm.write('Koneksi Gagal')

# Save whatever is left over from the last, partially filled batch
if len(house_list) > 0:
    df = pd.DataFrame(house_list)
    csv_filename = f'raywhite_{property_type[TYPE].lower()}_{document_counter}.csv'
    df.to_csv(csv_filename, index=False)
    print(f"\nList terakhir telah diubah menjadi DataFrame dan disimpan sebagai {csv_filename}\n")

print(df)

 14%|█████▍                                  | 318/2340 [00:49<05:17,  6.38it/s]
100%|█████████████████████████████████████████| 585/585 [01:19<00:00,  7.36it/s]


List terkini telah diubah menjadi DataFrame dan disimpan sebagai raywhite_warehouse_1.csv
    listing_id       type                                              title  \
0       395302  warehouse  DIJUAL RUKO RAYA RUNGKUT ALANG ALANG SURABAYA ...   
1       395299  warehouse        Tanah Bangunan Komersial Menteng Jarang Ada   
2       395239  warehouse                 Apartemen Commercial Jakarta Barat   
3       395113  warehouse  Gudang\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t...   
4       395103  warehouse                   DIJUAL EX PABRIK KAYU DI  GRESIK   
..         ...        ...                                                ...   
580     367089  warehouse              Ruko Trace, Ruko Ramai Depan Al Azhar   
581     366947  warehouse      Gudang Cocok Untuk Workshop Di Cikalong Wetan   
582     366948  warehouse      Gudang Cocok Untuk Workshop Di Cikalong Wetan   
583     366961  warehouse          DiSewakan Gedung Pusat kota Kota Sidoarjo   
584     366865  warehouse    


