Author: Muhammad Rafi Haidar

Kontak: 18221134@std.stei.itb.ac.id

Program untuk melakukan scraping data properti yang dijual di di raywhite.co.id

In [1]:
# Library yang digunakan
from bs4 import BeautifulSoup
from lxml import etree
import json
import simplejson
import os
import pandas as pd
import requests
import re
from tqdm import tqdm

pd.io.json._json.loads = lambda s, *a, **kw: json.loads(s)
pd.io.json._json.loads = lambda s, *a, **kw: simplejson.loads(s)
pd.io.json._json.loads = lambda s, *a, **kw: pd.json_normalize(simplejson.loads(s))

# URL menuju laman listing
URL = 'https://www.raywhite.co.id/jual?tipe={}&order=newest&limit=39&page={}'

# XPATH yang dipakai di laman tujuan
TITLE_XPATH = '//*[@id="detail-sale"]/div/div/div[1]/div/div/div[1]/h1'
LOCATION_XPATH = '//*[@id="detail-sale"]/div/div/div[1]/div/div/div[1]/p[2]'
SPEC_XPATH = '//*[@id="detail-sale"]/div/div/div[2]/div[3]/table/tbody/tr[{}]/td[{}]'
REALTOR_XPATH = '//*[@id="detail-sale"]/div/div/div[1]/div/div/div[2]/div/h5/a'
REALTOR_OFFICE_XPATH = '//*[@id="detail-sale"]/div/div/div[1]/div/div/div[2]/div/div[1]/a'
PHONE_XPATH = '//*[@id="detail-sale"]/div/div/div[1]/div/div/div[2]/div/div[2]/a[1]'

In [2]:
# Fungsi untuk scraping

# extract_title(lxml.etree._Element tree) -> String
# Melakukan scraping judul properti
def extract_title(tree):
    # Mengambil elemen dengan XPATH tertentu di element tree
    title_element = tree.xpath(TITLE_XPATH)[0]
    title = title_element.text.strip().replace('\r\n\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t', ' ') if title_element is not None else ""

    return title

# extract_value_usd(Beautifulsoup Soup) -> Int
# Melakukan scraping harga properti
def extract_value_usd(soup):
    price_card = soup.find('div', class_="btn-group btn-group-sm")
    value_usd = price_card.find_all('input')[1]['value']
    value_usd = value_usd.replace(",", "")

    try:
        value_usd = int(value_usd)
    except ValueError:
        value_usd = 0
        
    return value_usd

# extract_location(lxml.etree._Element tree) -> Tuple of (String, String)
# Melakukan scraping lokasi properti
def extract_location(tree):
    # Mengambil elemen dengan XPATH tertentu di element tree
    location_element = tree.xpath(LOCATION_XPATH)[0]
    location = location_element.text.strip() if location_element is not None else ""
    city, province = location.split(", ")

    return (city, province)


# extract_realtor_id(Beautifulsoup Soup) -> Int
# Melakukan scraping ID agen yang menjual properti
def extract_realtor_id(soup):
    url_list = soup.find_all('a', href=True)
    agent_url = [a['href'] for a in url_list if a['href'].startswith('https://www.raywhite.co.id/agent/')]
    try:
        id_str = agent_url[0].split('/')[-2]
        return int(id_str)
    except (IndexError, ValueError):
        return 0
    

# extract_realtor(lxml.etree._Element tree) -> String
# Melakukan scraping nama agen yang menjual properti
def extract_realtor(tree):
    # Mengambil elemen dengan XPATH tertentu di element tree
    realtor_element = tree.xpath(REALTOR_XPATH)[0]
    
    return realtor_element.text.strip() if realtor_element is not None else ""

# extract_office(lxml.etree._Element tree) -> String
# Melakukan scraping kantor agen yang menjual properti
def extract_office(tree):
    # Mengambil elemen dengan XPATH tertentu di element tree
    office_element = tree.xpath(REALTOR_OFFICE_XPATH)[0]

    return office_element.text.strip() if office_element is not None else ""

# extract_negotiable(Beautifulsoup Soup) -> Bool
# Melakukan scraping status negosiasi harga properti
def extract_negotiable(soup):
    value_element = soup.find('p', class_="h3 mb-3").text.strip()
    pattern = re.compile(r"nego", re.IGNORECASE)
    match = re.search(pattern, value_element)

    return bool(match)

# extract_phone(lxml.etree._Element tree) -> Int
# Melakukan scraping nomor kontak agen yang menjual properti
def extract_phone(tree):
    # Mengambil elemen dengan XPATH tertentu di element tree
    phone_element = tree.xpath(PHONE_XPATH)

    if phone_element:
        href = phone_element[0].get('href')
        phone = href.split(':')[1]

        # Hanya mengambil nomor pertama
        if "/" in phone:
            phone = phone.split("/")[0]

        # Nomor kosong tidak akan diambil
        if (phone == "") or (len(phone) == 0):
            phone = None
            
    else:
        phone = None
    
    # Formatting nomor telepon
    if (phone != None) and (phone[0] != '+'):
        phone = '+' + phone

    return phone

# extract_specification(lxml.etree._Element tree) -> List of any
# Melakukan scraping spesifikasi properti
def extract_specification(tree):
    retval = [None, None, None, None, None, None, None, None]

    for i in range(1, 9):
        try:
            # Mengambil elemen dengan XPATH tertentu di element tree
            label_element = tree.xpath(SPEC_XPATH.format(i, 2))[0]
            label = label_element.text.strip() if label_element is not None else ""
        except IndexError:
            return retval
        
        # Mengambil elemen dengan XPATH tertentu di element tree
        value_element = tree.xpath(SPEC_XPATH.format(i, 3))[0]
        value = value_element.text.strip().replace(": ", "") if value_element is not None else ""
        
        if label == 'Listing ID':
            retval[0] = value
            # Ubah ke integer
            if (retval[0] != None) and (retval[0] != ''):
                retval[0] = int(retval[0])
        elif label == 'Live ID':
            retval[1] = value.replace(":\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", "")
        elif label == 'Building Size':
            retval[2] = value.replace(" m", "")
            # Ubah ke integer
            if (retval[2] != None) and (retval[2] != ''):
                retval[2] = int(retval[2])
        elif label == 'Land Size':
            retval[3] = value.replace(" m", "")
            # Ubah ke integer
            if (retval[3] != None) and (retval[3] != ''):
                retval[3] = int(retval[3])
        elif label == 'Certificate':
            retval[4] = value
        elif label == 'Bedroom':
            retval[5] = value
            # Ubah ke integer
            if (retval[5] != None) and (retval[5] != ''):
                retval[5] = int(retval[5])
        elif label == 'Bathroom':
            retval[6] = value
            # Ubah ke integer
            if (retval[6] != None) and (retval[6] != ''):
                retval[6] = int(retval[6])
        elif label == 'Carport':
            retval[7] = value
            # Ubah ke integer
            if (retval[7] != None) and (retval[7] != ''):
                retval[7] = int(retval[7])

    return retval

# extract(lxml.etree._Element tree, Beautifulsoup soup, String type) -> Dictionary
# Melakukan scraping properti
def extract_property(tree, soup, propertyType):
    title = extract_title(tree)
    location = extract_location(tree)
    value_usd = extract_value_usd(soup)
    realtor_id = extract_realtor_id(soup)
    realtor = extract_realtor(tree)
    office = extract_office(tree) 
    negotiable = extract_negotiable(soup)
    phone = extract_phone(tree)
    specification = extract_specification(tree)

    return {
                'listing_id': specification[0], #0
                'live_id': specification[1], #1
                'type': propertyType.lower(), #2
                'title': title, #3
                'province': location[1], #4
                'city': location[0], #5
                'value_usd': value_usd, #6
                'value_idr': value_usd * 15068, #7
                'negotiable': negotiable, #8
                'building_size': specification[2], #9
                'land_size': specification[3], #10
                'certificate': specification[4], #11
                'bedroom': specification[5], #12
                'bathroom': specification[6], #13
                'carport': specification[7], #14
                'realtor_id': realtor_id, #15
                'realtor': realtor, #16
                'realtor_phone': phone, #17
                'realtor_office': office #18
            }

In [4]:
# Fungsi lainnya

# save_to_csv(pd.DataFrame dataframe, String propertyType, Int documentCounter)
# Membuat berkas CSV untuk mencatat kemajuan proses scraping per properti
def save_to_csv(dataframe, propertyType, documentCounter):
    output_dir = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\csv_backup'
    csv_filename = os.path.join(output_dir, f'raywhite_{propertyType.lower()}_{documentCounter}.csv')
    dataframe.to_csv(csv_filename, index=False)

# save_to_json(pd.DataFrame dataframe, String propertyType, Int documentCounter)
# Membuat berkas JSON untuk mencatat kemajuan proses scraping per properti
def save_to_json(dataframe, propertyType, documentCounter):
    output_dir = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data'
    json_filename = os.path.join(output_dir, f'raywhite_{propertyType.lower()}_{documentCounter}.json')
    dataframe.to_json(json_filename, orient='records')

# Fungsi untuk menghilangkan outlier berdasarkan harga
def clean_value_outlier(group):
    Q1 = group['value_usd'].quantile(0.25)
    Q3 = group['value_usd'].quantile(0.75)
    IQR = Q3 - Q1
    low = Q1 - (1.5 * IQR)
    up = Q3 + (1.5 * IQR)
    return group[(group['value_usd'] >= low) & (group['value_usd'] <= up)]

# filter(Dictionary entry) -> Boolea
# Melakukan proses filtering untuk memastikan entry bersifat valid
def filter(entry):
    conditions = [
        entry['listing_id'] != None and entry['listing_id'] != '', # Listing ID harus ada
        entry['title'] != '' and entry['title'] != None, # Judul properti harus ada dan bukan empty string
        entry['province'] != '' and entry['province'] != None, # Provinsi dari lokasi properti harus ada dan bukan empty string
        entry['city'] != '' and entry['city'] != None, # Kota dari lokasi properti harus ada dan bukan empty string
        entry['value_usd'] > 0, # Harga properti harus lebih dari 0 USD
        entry['negotiable'] != None, # Status negosiasi properti harus jelas 
        entry['certificate'] != None, # Sertifikat properti harus ada
        entry['realtor_id'] != 0, # ID agen properti harus ada
        entry['realtor'] != '' and entry['realtor'] != None, # Nama agen properti harus ada dan bukan empty string
        entry['realtor_phone'] != None, # Agen properti harus mempunyai nomor telepon
        entry['realtor_office'] != '' and entry['realtor_office'] != None # Kantor agen properti harus ada dan bukan empty string
    ]

    return all(conditions)

In [40]:
# Program Utama

# -------- PENTING --------
# -  1 PAGE  = 39 ENTRY   -
# -  1 JSON  = 15 PAGE    -
# -  1 JSON  = 585 ENTRY  -
# -------------------------

# JSON cadangan akan dibuat setiap 585 entry (15 main page) telah di-parse pada tipe properti tertentu untuk redundancy dan memastikan kemajuan tetap tercatat walaupun terjadi kegagalan
# JSON yang dimaksud di bawah adalah JSON cadangan, JSON gabungan akan tetap dibuat saat program berakhir
# CSV backup dan gabungan akan dibuat untuk redundancy
PAGE_PER_JSON = 15

# Entries on raywhite.co.id as of 01/07/2023:
# [0] Apartment = 8524 - 14 JSON can be extracted
# [1] Commercial = 9925 - 19 JSON can be extracted
# [2] Factory = 647 - 1 JSON can be extracted
# [3] House = 80798 - 138 JSON can be extracted
# [4] Office = 704 - 1 JSON can be extracted
# [5] Shophouse = 3856 - 6 JSON can be extracted
# [6] Villa = 1288 - 2 JSON can be extracted
# [7] Warehouse = 2824 - 4 JSON can be extracted

# Jumlah JSON cadangan yang ingin di scrape dari setiap tipe properti
APPARTMENT = 2
COMMERCIAL = 2
FACTORY = 1
HOUSE = 20
OFFICE = 1
SHOPHOUSE = 1
VILLA = 1
WAREHOUSE = 1

# Tipe properti yang ingin di-scrape (tanah belum tersedia saat ini)
PROPERTY_TYPE = ('Apartment', 'Commercial', 'Factory', 'House', 'Office', 'Shophouse', 'Villa', 'Warehouse')
SCRAPING_TARGET = (APPARTMENT, COMMERCIAL, FACTORY, HOUSE, OFFICE, SHOPHOUSE, VILLA, WAREHOUSE)

# Untuk ethical scraping
# Nama menggunakan alias untuk alasan keamanan dan kerahasiaan
# Alamat email yang tercantum bukan alamat email pribadi ataupun alamat email instansi untuk alasan keamanan dan kerahasiaan
# Pemilik website tetap dapat menggunakan alamat email tersebut untuk menghubungi penulis
HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.5735.199 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.9',
    'Name': 'Campus Fox',
    'Email': 'rubahkampus@protonmail.com'
}

# List Akhir
final_list = []

# Counter Akhir Untuk Dokumentasi
main_page_counter = 0

for propertyType in range(1, len(PROPERTY_TYPE)):
    print(f'Melakukan scraping pada tipe properti {PROPERTY_TYPE[propertyType]}')

    pageParsed = SCRAPING_TARGET[propertyType] * PAGE_PER_JSON

    # List Sementara
    temp_list = []

    # Counter Sementara
    entry_counter = 0
    document_counter = 1

    # Progress Bar Agar Kemajuan Mudah Dilihat Mata
    pbar = tqdm(total=pageParsed*39, ncols=80)

    for page in range(1, pageParsed+1):
        main_page_counter += 1
        main_response = requests.get(URL.format(PROPERTY_TYPE[propertyType], page), headers=HEADER)
        
        if main_response.status_code == 200:
            main_content = main_response.text
            main_soup = BeautifulSoup(main_content, 'html.parser')

            url_list = main_soup.find_all('a', href=True)
            property_url = [a['href'] for a in url_list if a['href'].startswith('https://www.raywhite.co.id/properti/')]

            for property in property_url:
                property_response = requests.get(property, headers=HEADER)
                if property_response.status_code == 200:
                    property_content = property_response.content

                    property_soup = BeautifulSoup(property_content, 'html.parser')
                    xml_parsed = etree.HTML(str(property_soup))
                    
                    property_item = extract_property(xml_parsed, property_soup, PROPERTY_TYPE[propertyType])

                    # Cek validitas entry
                    if filter(property_item):
                        temp_list.append(property_item)
                        final_list.append(property_item)

                    entry_counter += 1
                    
                    pbar.update(1)

                    if entry_counter % 585 == 0:  
                        df = pd.DataFrame(temp_list).drop_duplicates()
                        save_to_json(df, PROPERTY_TYPE[propertyType], document_counter)
                        save_to_csv(df, PROPERTY_TYPE[propertyType], document_counter) # Untuk redundancy
                        print(f'Kemajuan scraping pada tipe properti {PROPERTY_TYPE[propertyType]} telah diubah menjadi dataframe cadangan dan disimpan di raywhite_{PROPERTY_TYPE[propertyType]}_{document_counter}.[json|csv]')

                        entry_counter = 0
                        document_counter += 1
                        temp_list = []
                
                else:
                    print('Koneksi Gagal')

        else:
            print('Koneksi Gagal')

    pbar.close()
    print('---------------------------------------------------------------------------------------------------')

if len(temp_list) > 0:
    df = pd.DataFrame(temp_list).drop_duplicates()
    save_to_json(df, PROPERTY_TYPE[propertyType], document_counter)
    save_to_csv(df, PROPERTY_TYPE[propertyType], document_counter) # Untuk redundancy

# Pembuatan JSON dan CSV gabungan
df_final = pd.DataFrame(final_list).drop_duplicates()

output_dir_json = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data'
output_dir_csv = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\csv_backup'

csv_filename = os.path.join(output_dir_csv, f'raywhite_merged.csv')
json_filename = os.path.join(output_dir_json, f'raywhite_merged.json')

df_final.to_csv(csv_filename, index=False)
df_final.to_json(json_filename, orient='records')

print('Proses scraping telah selesai!\n')

print(f'Dokumentasi:')
print('Banyak entry yang di-scrape: ', main_page_counter * 39)
print('Banyak entry valid valid di dataframe akhir: ', len(df_final) - 1)
print('Persentase entry yang valid: '+str(((len(df_final) - 1) / (main_page_counter * 39)) * 100)+'%')

print(f"\nHasil scraping telah diubah menjadi dataframe dan disimpan di:\n{csv_filename}\n{json_filename}\n")

print('Dataframe gabungan:\n')
print(df_final)

Melakukan scraping pada tipe properti Office


100%|█████████████████████████████████████████| 585/585 [01:21<00:00,  7.20it/s]


Kemajuan scraping pada tipe properti Office telah diubah menjadi dataframe cadangan dan disimpan di raywhite_Office_1.[json|csv]
---------------------------------------------------------------------------------------------------
Melakukan scraping pada tipe properti Shophouse


100%|█████████████████████████████████████████| 585/585 [01:18<00:00,  7.49it/s]


Kemajuan scraping pada tipe properti Shophouse telah diubah menjadi dataframe cadangan dan disimpan di raywhite_Shophouse_1.[json|csv]
---------------------------------------------------------------------------------------------------
Melakukan scraping pada tipe properti Villa


100%|█████████████████████████████████████████| 585/585 [01:20<00:00,  7.25it/s]


Kemajuan scraping pada tipe properti Villa telah diubah menjadi dataframe cadangan dan disimpan di raywhite_Villa_1.[json|csv]
---------------------------------------------------------------------------------------------------
Melakukan scraping pada tipe properti Warehouse


100%|█████████████████████████████████████████| 585/585 [01:16<00:00,  7.60it/s]

Kemajuan scraping pada tipe properti Warehouse telah diubah menjadi dataframe cadangan dan disimpan di raywhite_Warehouse_1.[json|csv]
---------------------------------------------------------------------------------------------------
Proses scraping telah selesai!

Dokumentasi:
Banyak entry yang di-scrape:  2340
Banyak entry valid valid di dataframe akhir:  1466
Persentase entry yang valid: 62.64957264957265%

Hasil scraping telah diubah menjadi dataframe dan disimpan di:
C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\src\csv\raywhite_merged.csv
C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\raywhite_merged.json

Dataframe gabungan:

      listing_id    live_id       type  \
0         395945  L23385527     office   
1         395834  L23378878     office   
2         395686  L23358645     office   
3         395093  L23269397     office   
4         394993  L23257306 




In [None]:
# Me-reset progress bar
pbar.close()

In [6]:
# Melakukan merge dari berkas JSON cadangan untuk membuat berkas JSON dan CSV (untuk redundancy) gabungan
# Dibuat apabila terjadi kegagalan koneksi di sesi scraping
# Disarankan untuk dilakukan untuk menghilangkan outlier [14/07/2023]

json_dir = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data'  
csv_dir = r'C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\csv_backup'
df_list = []

json_counter = 0

for file in os.listdir(json_dir):
    if file.endswith('.json'):
        json_counter += 1
        json_path = os.path.join(json_dir, file)
        with open(json_path, 'r') as f:
            json_data = f.read()

        df_temp = pd.read_json(json_data)
        df_list.append(df_temp)

df_merged = pd.concat(df_list, ignore_index=True).drop_duplicates()

df_final = df_merged.groupby('type').apply(clean_value_outlier).reset_index(drop=True)

json_filename = os.path.join(json_dir, f'raywhite_merged.json')
csv_filename = os.path.join(csv_dir, f'raywhite_merged.csv')

df_final.to_json(json_filename, orient='records')
df_final.to_csv(csv_filename, index=False)

print('Proses merging JSON telah selesai!\n')

print(f'Dokumentasi:')
print('Banyak JSON cadangan yang dihasilkan: ', json_counter)
print('Banyak entry valid valid di dataframe akhir: ', len(df_final) - 1)
print('Persentase entry yang valid: '+str(((len(df_final) - 1) / (json_counter * 585)) * 100)+'%')

print(f"\nHasil scraping telah diubah menjadi dataframe dan disimpan di:\n{csv_filename}\n{json_filename}\n")

print('Dataframe gabungan:\n')
print(df_final)

Proses merging JSON telah selesai!

Dokumentasi:
Banyak JSON cadangan yang dihasilkan:  29
Banyak entry valid valid di dataframe akhir:  9966
Persentase entry yang valid: 58.744473916887706%

Hasil scraping telah diubah menjadi dataframe dan disimpan di:
C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\csv_backup\raywhite_merged.csv
C:\Users\Haidar\OneDrive - Institut Teknologi Bandung\Desktop\github\Seleksi-2023-Tugas-1\Data Scraping\data\raywhite_merged.json

Dataframe gabungan:

      listing_id    live_id       type  \
0         396531  L23431461  apartment   
1         396517  L23431285  apartment   
2         396507  L23431217  apartment   
3         396506  L23431201  apartment   
4         396482  L23430786  apartment   
...          ...        ...        ...   
9962      368235  L20447744  warehouse   
9963      368268  L20449144  warehouse   
9964      368138  L20439529  warehouse   
9965      367981  L20420512  ware