<div align="center">
  <h1>Data Scraping</h1>
  <h2>Data Hunian Rumah dan Apartemen di Jakarta Pusat</h2>
</div>

- Install dan import library

In [1]:
# If the packages below are not installed yet, un-comment and run this cell to install them.
# %pip install --upgrade pip --quiet
# %pip install selenium beautifulsoup4 requests pandas ipython --quiet
# (`datetime` is part of the Python standard library and must not be installed with pip.)

# Note: if a package is still missing, run this command in a new cell:
# %pip install <package name>

In [2]:
from bs4                                import BeautifulSoup
from selenium                           import webdriver
from selenium.webdriver.chrome.options  import Options
from selenium.webdriver.support.ui      import WebDriverWait
from selenium.webdriver.support         import expected_conditions as EC
from selenium.webdriver.common.by       import By
from IPython.display                    import clear_output
from datetime                           import datetime
import requests, math, json, re, time, os, random
import pandas as pd

- Konfigurasi: header HTTP untuk scraping, pola regex tautan yang diambil dari halaman web, dan lokasi penyimpanan data

In [3]:
# Browser identity sent with every request and also passed to the Selenium driver.
__USER_AGENT__ = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
# Root of the site being scraped; listing-index URLs are built by appending to it.
__BASE_URL__ = 'https://www.rumah123.com/'
# Contact address sent in the `From` request header.
__EMAIL__ = '13522116@mahasiswa.itb.ac.id'
# Target number of records for one run (split between 'jual' and 'sewa' listings).
__TOTAL_ITEM__ = 200
# Seconds to pause after a request that did not return content.
__TIME_SLEEP__ = 5

# HTTP headers used for every `requests` call.
__HEADERS__ = {
    'User-Agent': __USER_AGENT__,
    'From': __EMAIL__
}

# Regular expressions matched against anchor `href` values.
""" PATTERN """
__PATTERN_PROPERTY__    = r'^/properti/jakarta-pusat/.*'
__PATTERN_AGENT__       = r'^/agen-properti/.*'
__PATTERN_PAGINATION__  = r'.*page=\d+'

# Output folder: `<CURRENT_DIR or cwd>/../data`, resolved to an absolute path.
""" PATH STORING """
__CURRENT_DIRECTORY__   = os.getenv('CURRENT_DIR', os.getcwd())
__PATH_FOLDER__         = os.path.abspath(os.path.join(__CURRENT_DIRECTORY__, '..', 'data'))

- Request ke halaman web, lalu parsing HTML dengan BeautifulSoup dan menyaring tautan dengan regex untuk mendapatkan konten yang diinginkan

In [4]:
# Fetch the raw body of a URL.
def get_content(url, headers, timeout=30):
    """Download `url` and return the response body as bytes.

    Args:
        url: Address to request.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on a stalled connection.

    Returns:
        The response content when the server answers with status 200,
        otherwise None (non-200 status, timeout, or any other network error).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Callers already treat None as "fetch failed, back off and move on",
        # so a transport error must not abort the whole scraping run.
        return None

    if response.status_code == 200:
        return response.content
    return None

# Collect the links whose href matches a pattern.
def get_parse_links(page_content, pattern):
    """Return the distinct absolute URLs of anchors whose `href` matches `pattern`.

    The page is parsed with BeautifulSoup; each matching `href` is prefixed
    with the site origin. Duplicates are removed, so the order of the
    returned list is not defined.
    """
    href_filter = re.compile(pattern)
    anchors = BeautifulSoup(page_content, "html.parser").find_all("a", href=href_filter)

    absolute_links = set()
    for anchor in anchors:
        absolute_links.add(f"https://www.rumah123.com{anchor['href']}")
    return list(absolute_links)

# Determine the number of result pages.
def get_max_page(page_content):
    """Return the largest `page=<n>` number among the pagination links.

    Returns 0 when the page contains no pagination link.
    """
    highest = 0
    for link in get_parse_links(page_content, __PATTERN_PAGINATION__):
        found = re.search(r'page=(\d+)', link)
        if found:
            highest = max(highest, int(found.group(1)))
    return highest

- Mengambil informasi untuk hunian (rumah dan apartemen)

In [5]:
# Getters
def get_id_iklan(url):
    """Return the last path segment of `url`, ignoring trailing slashes."""
    trimmed = url.rstrip('/')
    return trimmed.rsplit('/', 1)[-1]

def get_prop_value(soup, label):
    """Return the text of the `<p>` that follows the `<p>` whose text is `label`.

    Returns None when no `<p>` carries exactly that label or when the label
    has no following `<p>` sibling.
    """
    def _is_label(tag):
        return tag.name == 'p' and tag.get_text(strip=True) == label

    label_tag = soup.find(_is_label)
    if not label_tag:
        return None

    value_tag = label_tag.find_next_sibling('p')
    if not value_tag:
        return None
    return value_tag.get_text(strip=True)

# Extract the attributes of a single listing (house or apartment).
def get_property(url, tipe_iklan, content):
    """Parse one listing page into a flat record.

    Args:
        url: Listing URL; its last path segment becomes `id_iklan`.
        tipe_iklan: Listing kind, `'jual'` or `'sewa'`.
        content: HTML of the listing page.

    Returns:
        A dict with the listing attributes, or None when the page lacks the
        location, price or "updated" element, or when the price / updated
        text is not in the expected format. Attributes that do not apply to
        the property type (or to the listing kind) are set to -1.
    """
    soup = BeautifulSoup(content, 'html.parser')

    tipe_properti = get_prop_value(soup, 'Tipe Properti')
    is_rumah = tipe_properti == 'Rumah'
    is_apartemen = tipe_properti == 'Apartemen'

    luas_bangunan = get_prop_value(soup, 'Luas Bangunan')
    kamar_tidur = get_prop_value(soup, 'Kamar Tidur')
    kamar_mandi = get_prop_value(soup, 'Kamar Mandi')
    sertifikat = get_prop_value(soup, 'Sertifikat')
    # House-only, apartment-only and rent-only attributes use -1 as "not applicable".
    luas_tanah = get_prop_value(soup, 'Luas Tanah') if is_rumah else -1
    carport = get_prop_value(soup, 'Carport') if is_rumah else -1
    kondisi_properti = get_prop_value(soup, 'Kondisi Properti') if is_apartemen else -1
    kondisi_perabotan = get_prop_value(soup, 'Kondisi Perabotan') if is_apartemen else -1
    periode_kepemilikan = get_prop_value(soup, 'Periode Sewa') if tipe_iklan == 'sewa' else -1

    lokasi_tag = soup.find('p', class_='text-sm text-gray-400')
    if not lokasi_tag:
        return None
    # Keep only the part before the first comma.
    lokasi = lokasi_tag.get_text(strip=True).split(',')[0].strip()

    harga_tag = soup.find('p', class_='text-sm text-primary mb-1 font-semibold')
    if not harga_tag:
        return None
    harga_text = harga_tag.get_text(strip=True)
    if harga_text:
        harga_match = re.search(r'Rp\s*(.*)', harga_text)
        if not harga_match:
            # Price text without an "Rp" amount: skip the listing instead of
            # crashing on `None.group(1)`.
            return None
        harga = harga_match.group(1).strip()
    else:
        harga = None

    diperbarui_tag = soup.find('p', class_='text-3xs text-gray-400 mb-4')
    if not diperbarui_tag:
        return None
    diperbarui_match = re.search(
        r'Diperbarui\s*(\d{1,2} \w+ \d{4})',
        diperbarui_tag.get_text(strip=True),
    )
    if not diperbarui_match:
        # "Updated" text without a `<day> <month> <year>` date: skip the listing.
        return None
    diperbarui = diperbarui_match.group(1)

    # Facility names are read from the <span> inside each facility element;
    # elements without a <span> are ignored.
    fasilitas = set()
    for elem in soup.find_all('p', class_='w-1/2 flex items-center gap-4'):
        span = elem.find('span')
        if span:
            fasilitas.add(span.get_text(strip=True))
    taman = 'Ya' if 'Taman' in fasilitas else 'Tidak'

    id_iklan = get_id_iklan(url)

    return {
        'id_iklan'              : id_iklan,
        'tipe_properti'         : tipe_properti,
        'luas_bangunan'         : luas_bangunan,
        'kamar_tidur'           : kamar_tidur,
        'kamar_mandi'           : kamar_mandi,
        'lokasi'                : lokasi,
        'sertifikat'            : sertifikat,
        'tipe_iklan'            : tipe_iklan,
        'periode_kepemilikan'   : periode_kepemilikan,
        'harga'                 : harga,
        'diperbarui'            : diperbarui,
        'luas_tanah'            : luas_tanah,
        'carport'               : carport,
        'taman'                 : taman,
        'kondisi_properti'      : kondisi_properti,
        'kondisi_perabotan'     : kondisi_perabotan
    }

- Mengambil informasi untuk agen dan perusahaannya

In [6]:
# Initialise the Selenium driver.
def init_driver(user_agent, url):
    """Start a headless Chrome session and navigate it to `url`.

    Args:
        user_agent: User-Agent string the browser should present.
        url: Page to open.

    Returns:
        The Chrome WebDriver, already pointing at `url`. The caller is
        responsible for calling `quit()` on it.

    Raises:
        Whatever `driver.get` raises; in that case the browser is shut down
        first so no Chrome process is left behind.
    """
    options = Options()
    for argument in (
        '--headless',
        "--no-sandbox",
        '--user-agent=' + user_agent,
        '--window-size=1920,1080',
        '--disable-browser-side-navigation',
    ):
        options.add_argument(argument)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except BaseException:
        # The caller never receives the driver on failure, so it cannot
        # clean up; quit here (also on interrupt) and propagate the error.
        driver.quit()
        raise
    return driver

# Extract the attributes of an agent and the agent's company.
def get_agen(url, user_agent):
    """Open an agent page in the browser and read the agent record from its JSON script tag.

    Waits up to 5 seconds for a `<script type="application/json">` element,
    parses its content and picks the agent fields out of
    `props.pageProps.page.user`.

    Returns:
        A dict with the agent and company fields, or None (after printing a
        notice) when the element or any expected key is missing. The browser
        is always closed before returning.
    """
    driver = init_driver(user_agent, url)

    try:
        locator = (By.XPATH, "//script[@type='application/json']")
        script_tag = WebDriverWait(driver, 5).until(EC.presence_of_element_located(locator))

        payload = json.loads(script_tag.get_attribute('innerHTML'))
        user_info = payload['props']['pageProps']['page']['user']
        info = user_info['info']
        performance = user_info['performance']

        return {
            'id_agen'           : user_info['id'],
            'nama_agen'         : info['fullName'],
            'nomor_telepon'     : info['phone'],
            'terjual'           : performance['sold'],
            'tersewa'           : performance['rented'],
            'nama_perusahaan'   : user_info['company']['name'],
            'alamat'            : user_info['about']['addresses'][0]
        }

    except Exception:
        # Best effort: one unreadable agent page must not stop the run.
        print(f'Error occurred while scraping\nURL: {url}\nCannot find element\nContinuing scraping...\n')
        return None

    finally:
        driver.quit()

- Display untuk status dan hasil scraping

In [7]:
# ANSI escape sequences used to colour the progress output.
__RED__     = '\033[91m'
__GREEN__   = '\033[92m'
__RESET__   = '\033[0m'

def display_log(count, total):
    """Replace the cell output with a one-line progress message.

    Shows `count` (red) out of `total` (green); the previous output of the
    cell is cleared once the new message is ready to be drawn.
    """
    progress = f'({__RED__}{count}{__RESET__}/{__GREEN__}{total}{__RESET__})'
    clear_output(wait=True)
    print(f'==> scraping in process: {progress}\n')

def display_result(count, total, timestamp):
    """Print the end-of-run summary.

    Prints a three-line banner (red, plain, green), a completion notice, the
    number of records obtained out of `total`, and the start `timestamp`.
    """
    bar = '............................'
    lines = [
        f'{__RED__}{bar}{__RESET__}',
        bar,
        f'{__GREEN__}{bar}{__RESET__}',
        '==== scraping completed ====',
        '',
        f'-> data obtained\t\t: ({__GREEN__}{count}{__RESET__}/{__GREEN__}{total}{__RESET__})',
        f'-> timestamp (started at)\t: {timestamp}',
        '',
    ]
    print('\n'.join(lines))

- Proses scraping

In [8]:
def scraping(url, headers, total_item, time_sleep):
    """Scrape listings together with their agents until `total_item` records are collected.

    The target is split between sale (`'jual'`, rounded up) and rent
    (`'sewa'`) listings. For each kind the index pages are walked from a
    random start page; every unseen listing is fetched, parsed, and combined
    with its agent record.

    Args:
        url: Base URL of the site (with trailing slash).
        headers: HTTP headers for the `requests` calls.
        total_item: Total number of records wanted.
        time_sleep: Seconds to pause after a fetch that returned nothing.

    Returns:
        A list of dicts (listing fields + agent fields + `timestamp`). It may
        hold fewer than `total_item` entries when the pages run out.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    count_jual = math.ceil(total_item / 2)
    count_sewa = total_item - count_jual
    visited_urls = set()
    all_data = []

    display_log(len(all_data), total_item)
    for tipe in ('jual', 'sewa'):
        count_limit = count_jual if tipe == 'jual' else count_sewa
        count_data = 0
        start_url = f"{url}{tipe}/jakarta-pusat/residensial/"
        page = get_content(start_url, headers)
        if not page:
            continue

        # The index page loaded, so at least one page exists even when no
        # pagination link was found.
        max_page = max(get_max_page(page), 1)
        # Start somewhere before the last 21 pages (presumably to leave enough
        # pages to reach the quota -- TODO confirm). With 21 pages or fewer the
        # range would be empty and `randint` would raise, so start at page 1.
        count_page = random.randint(1, max(1, max_page - 21))

        while count_data < count_limit and count_page <= max_page:
            complete_url = f"{start_url}?page={count_page}"
            count_page += 1

            page = get_content(complete_url, headers)
            if not page:
                time.sleep(time_sleep)
                continue

            for item in get_parse_links(page, __PATTERN_PROPERTY__):
                if item in visited_urls:
                    continue
                visited_urls.add(item)

                content = get_content(item, headers)
                if not content:
                    time.sleep(time_sleep)
                    continue

                data_property = get_property(item, tipe, content)
                if not data_property:
                    continue

                # Keep only agent links with exactly six slashes and use the
                # first one; listings without such a link are skipped.
                agent_links = [
                    link for link in get_parse_links(content, __PATTERN_AGENT__)
                    if link.count('/') == 6
                ]
                if not agent_links:
                    continue

                data_agen = get_agen(agent_links[0], __USER_AGENT__)
                if not data_agen:
                    continue

                data = {**data_property, **data_agen}
                data['timestamp'] = timestamp
                all_data.append(data)
                count_data += 1
                display_log(len(all_data), total_item)

                if count_data >= count_limit:
                    break

    # Report against the requested total, not the notebook-level default.
    display_result(len(all_data), total_item, timestamp)

    return all_data

- Menggabungkan data yang baru diperoleh dari scraping dengan data sebelumnya dan mengambil yang terbaru

In [9]:
def merge_existing(data):
    """Merge freshly scraped records into `data_raw.json`, keeping the newest per listing.

    New records are placed before the stored ones, so when the same
    `id_iklan` appears in both, the new record wins. The result is written
    back to `<__PATH_FOLDER__>/data_raw.json` as an indented JSON array.

    Args:
        data: List of record dicts from the current run (may be empty).
    """
    # The data folder may not exist yet on a fresh checkout; `to_json` does
    # not create missing directories.
    os.makedirs(__PATH_FOLDER__, exist_ok=True)
    path_file = os.path.join(__PATH_FOLDER__, 'data_raw.json')

    old_df = pd.read_json(path_file) if os.path.exists(path_file) else pd.DataFrame()
    new_df = pd.DataFrame(data)

    latest_df = pd.concat([new_df, old_df], ignore_index=True)
    # With no new and no stored records there is no `id_iklan` column to
    # deduplicate on.
    if 'id_iklan' in latest_df.columns:
        latest_df = latest_df.drop_duplicates(subset='id_iklan', keep='first')

    latest_df.to_json(path_file, orient='records', lines=False, indent=4, date_format='iso')

- Fungsi utama: menjalankan scraping lalu menggabungkan hasilnya dengan data yang sudah tersimpan

In [10]:
def main_scraping():
    """Run one scraping pass with the notebook-level settings and merge the result into the stored data."""
    settings = {
        'url': __BASE_URL__,
        'headers': __HEADERS__,
        'total_item': __TOTAL_ITEM__,
        'time_sleep': __TIME_SLEEP__,
    }
    merge_existing(scraping(**settings))

In [11]:
# Notebook entry point: run the scraping pass and merge its result into the stored data.
main_scraping()

==> scraping in process: ([91m200[0m/[92m200[0m)

[91m............................[0m
............................
[92m............................[0m
==== scraping completed ====

-> data obtained		: ([92m200[0m/[92m200[0m)
-> timestamp (started at)	: 2024-08-02 07:31:11

