In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    JavascriptException,
    WebDriverException,
    StaleElementReferenceException,
)
from fake_useragent import UserAgent
import requests
from bs4 import BeautifulSoup
import logging
import random
import pandas as pd
import time
from tqdm.notebook import tqdm

In [2]:
# District slugs to scrape in this run; each one is appended to the booking
# URL by the scraping cell. Uncomment a slug to include it.
districts = [
    # "jakarta",
    # "jakarta-barat",
    # "jakarta-pusat",
    # "jakarta-selatan",
    "jakarta-timur",
    # "jakarta-utara",
]

# Labels typed into the site's gender-type filter, one scraping pass per label.
genders = [
    "Semua",
    "Khusus Putra",
    "Khusus Putri",
    "Putra dan Campur",
    "Putri dan Campur",
]

# Lower bound of every price window: 0, 100000, ..., 14900000 (150 windows).
# The scraping cell uses `start + 100000` as the matching upper bound.
price_starts = range(0, 15_000_000, 100_000)

# Keys copied from each listing's detail object into the result rows;
# keys missing from a given listing are simply skipped by the scraper.
var = [
    # identity, price and location
    "_id",
    "price_monthly",
    "latitude",
    "longitude",
    "gender",
    "area_city_keyword",
    "area_subdistrict",
    "status",
    "size",
    # facility / rule fields
    "fac_room",
    "fac_share",
    "fac_bath",
    "fac_near",
    "fac_park",
    "kos_rule",
    "fac_price",
    # owner, building and flag fields
    "owner_user_id",
    "building_year",
    "is_singgahsini",
    "is_apik",
    "is_elite",
    "number_success_owner_trx",
    "number_success_kos_trx",
]

In [3]:
# Function to get a random user agent
def get_random_user_agent():
    """Return a randomly picked User-Agent string.

    A new ``UserAgent`` pool (constructed with ``min_version=120.0``) is
    built on every call and its ``random`` attribute is returned.
    """
    return UserAgent(min_version=120.0).random

def get_free_proxies(countries=('SG', 'VN', 'TH', 'ID'), timeout=10):
    """Scrape free-proxy-list.net and return the proxies of selected countries.

    Parameters
    ----------
    countries : collection of str, optional
        Values accepted for the third table column (the country code).
        Defaults to the set that was previously hard-coded.
    timeout : float, optional
        Seconds to wait for the HTTP response. Without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        ``"ip:port"`` strings in table order. Empty when the page contains
        no proxy table or no row matches ``countries``.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-2xx HTTP status.
    """
    url = 'https://free-proxy-list.net/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table')
    if table is None or table.tbody is None:
        # Layout changed or the page was blocked: nothing to extract.
        return []

    proxies = []
    for row in table.tbody.find_all('tr'):
        columns = row.find_all('td')
        # Skip malformed rows that lack the ip / port / country cells.
        if len(columns) < 3:
            continue
        if columns[2].text.strip() in countries:
            ip = columns[0].text.strip()
            port = columns[1].text.strip()
            proxies.append(f'{ip}:{port}')
    return proxies

# Function to get a random proxy
def get_random_proxy(proxies):
    """Return one uniformly chosen entry of ``proxies``.

    Parameters
    ----------
    proxies : sequence of str
        Candidate proxies; the sequence is not modified.

    Raises
    ------
    ValueError
        If ``proxies`` is empty. The exception type matches what the former
        ``random.randint`` based lookup raised, but the message now states
        the actual problem.
    """
    if not proxies:
        raise ValueError("no proxies available to choose from")
    return random.choice(proxies)

def setup_driver(proxies):
    """Start a headless Chrome session with a random user agent.

    A proxy is drawn from ``proxies`` and returned next to the driver, but
    it is NOT handed to Chrome: the ``--proxy-server`` argument is disabled
    below, so the browser connects directly.

    Returns
    -------
    tuple
        ``(driver, proxy)`` - the Chrome WebDriver and the drawn proxy string.
    """
    # Keep this order: user agent first, proxy second.
    user_agent = get_random_user_agent()
    proxy = get_random_proxy(proxies)

    options = Options()
    for argument in (
        "--headless=new",
        "--window-size=1280,720",
        f"user-agent={user_agent}",
        # f"--proxy-server=http://{proxy}",  # disabled: proxy is not applied
    ):
        options.add_argument(argument)

    return webdriver.Chrome(options=options), proxy

In [4]:
url = "https://mamikos.com/booking/"
proxies = get_free_proxies()
results = []

# Set up logging
logging.basicConfig(filename='error.log', level=logging.ERROR)

# Tunables that used to be inlined as magic numbers.
PAGE_TIMEOUT = 4           # seconds to wait for the filter, cards and pagination
CARDS_PER_PAGE = 20        # at most this many cards are opened per results page
PRICE_STEP = 100000        # width of one price window; matches the step of price_starts
RATE_LIMIT_BACKOFF = 30    # seconds to sleep before refreshing a detail page
DETAIL_MAX_ATTEMPTS = 10   # reads of a detail page before the listing is skipped
EMPTY_DETAIL_PAUSE = 1     # seconds between reads that returned nothing

# NOTE: the 'Line NN - ...' log texts are legacy labels. They are kept
# unchanged so new error.log entries stay comparable with older runs; the
# numbers do not refer to lines of this cell.


def _open_listing_page(listing_url, proxies):
    """Open ``listing_url`` in a fresh browser and wait for the filter form.

    On a WebDriver failure the browser is closed, the drawn proxy is removed
    from ``proxies`` and a new attempt is made, so the number of attempts is
    bounded by the size of the pool. Errors raised by ``setup_driver`` itself
    (browser cannot start, pool exhausted) propagate to the caller.
    """
    while True:
        driver, proxy = setup_driver(proxies)
        try:
            driver.get(listing_url)
            # Wait until the filter appears
            WebDriverWait(driver, PAGE_TIMEOUT).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div[data-v-424a8076]")
                )
            )
            return driver
        except WebDriverException:
            # TimeoutException is a subclass of WebDriverException.
            logging.error('Proxy Failed')
            driver.quit()
            if proxy in proxies:
                proxies.remove(proxy)
        except BaseException:
            # Do not leak the browser on Ctrl-C or unexpected errors.
            driver.quit()
            raise


def _apply_filters(driver, gender, price_start):
    """Fill the gender and price filters for one price window and submit."""
    driver.find_element(By.ID, "filterType").send_keys(gender)

    bounds = (
        ("filterPriceMin", price_start),
        ("filterPriceMax", price_start + PRICE_STEP),
    )
    for element_id, amount in bounds:
        field = driver.find_element(By.ID, element_id)
        # Select the current content so the new amount replaces it.
        field.send_keys(Keys.CONTROL + "a")
        field.send_keys(str(amount))

    driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]').click()


def _read_detail(driver):
    """Return the page's ``detail`` object, or ``None`` after too many tries.

    A JavascriptException is handled as before (log, sleep, refresh). The
    number of attempts is now bounded, and an empty result is retried after
    a short pause instead of in a tight loop.
    """
    for _ in range(DETAIL_MAX_ATTEMPTS):
        try:
            detail = driver.execute_script("return detail")
        except JavascriptException:
            logging.error('Line 91 - Too Many Requests')
            time.sleep(RATE_LIMIT_BACKOFF)
            driver.refresh()
            continue
        if detail:
            return detail
        time.sleep(EMPTY_DETAIL_PAUSE)
    return None


def _scrape_listing_page(driver, results):
    """Open the cards of the current results page and append their details.

    Stops at the first card that cannot be located or clicked, which is
    also how the end of a page with fewer than CARDS_PER_PAGE cards shows up.
    """
    for i in range(CARDS_PER_PAGE):
        # Look the card up again on every pass to avoid stale references.
        try:
            WebDriverWait(driver, PAGE_TIMEOUT).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "div.listing-room-card")
                )
            )
            room_card = driver.find_element(By.CSS_SELECTOR, "div.listing-room-card")
            card = room_card.find_elements(By.CSS_SELECTOR, "div.kost-rc__inner")[i]
            driver.execute_script("arguments[0].click();", card)
        except StaleElementReferenceException:
            logging.error('Line 64 - Stale Element')
            break
        except IndexError:
            logging.error('Line 66 - Index Error')
            break
        except TimeoutException:
            logging.error('Line 68 - Timeout')
            break
        except Exception:
            logging.error('Line 70 - Other Error')
            break

        # Switch to the main page to avoid StaleElementReferenceException
        driver.switch_to.window(driver.window_handles[0])

        # Sleep to avoid too many requests
        time.sleep(0.95 + random.random())

        handles = driver.window_handles
        if len(handles) < 2:
            # The click opened no tab; previously this raised IndexError
            # and aborted the whole run.
            logging.error('Detail tab did not open')
            continue

        # Switch to the detail page
        driver.switch_to.window(handles[1])
        try:
            detail = _read_detail(driver)
            if detail:
                # Store the selected elements
                record = {key: detail[key] for key in var if key in detail}
                record["url"] = driver.current_url
                results.append(record)
            else:
                logging.error('Detail object unavailable - listing skipped')
        finally:
            # Always close the detail tab and return to the listing tab so
            # the next card finds the windows in the expected order.
            driver.close()
            driver.switch_to.window(driver.window_handles[0])


def _go_to_next_page(driver):
    """Advance the results to the next page.

    Returns ``True`` after clicking the next-page link and ``False`` when
    the active page is the last one or the pagination bar is missing or not
    ready. The missing-bar case used to fall through and re-scrape the same
    page in an endless loop.
    """
    try:
        # Find the pagination bar
        pagination = driver.find_element(By.CSS_SELECTOR, "ul.pagination")

        # Wait until the bar is loaded
        WebDriverWait(driver, PAGE_TIMEOUT).until(
            lambda d: pagination.find_element(By.CSS_SELECTOR, "li.active")
        )

        active_page = pagination.find_element(By.CSS_SELECTOR, "li.active").text
        second_last_page = pagination.find_element(
            By.CSS_SELECTOR, "li:nth-last-child(2)"
        ).text
        if active_page == second_last_page:
            return False

        pagination.find_element(By.CSS_SELECTOR, "li:last-child a").click()
        return True
    except NoSuchElementException:
        logging.error('Line 124 - Pagination Failed')
        return False
    except TimeoutException:
        logging.error('Line 126 - Timeout')
        return False


# `driver` is bound before the try so the cleanup below never hits a NameError.
driver = None
try:
    for district in tqdm(districts):
        for gender in tqdm(genders):
            driver = _open_listing_page(url + district, proxies)

            for price_start in tqdm(price_starts):
                try:
                    _apply_filters(driver, gender, price_start)
                    while True:
                        _scrape_listing_page(driver, results)
                        if not _go_to_next_page(driver):
                            break
                except TimeoutException:
                    logging.error('Line 129 - Timeout')
                except (NoSuchElementException, StaleElementReferenceException) as e:
                    # Page-state problems only cost this price window;
                    # session-level failures still propagate and stop the run.
                    logging.error(
                        f'Price window {price_start} skipped - {type(e).__name__}'
                    )

            # Start the next gender with a fresh browser
            driver.quit()
            driver = None
finally:
    if driver is not None:
        driver.quit()
    result = pd.DataFrame(results)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

In [5]:
# One row per scraped detail page; the same listing can be collected more
# than once, so keep only the first row seen for each `_id`.
mamikos_raw = pd.DataFrame(results)
mamikos = mamikos_raw.drop_duplicates(subset="_id", keep="first")

# Number of distinct listings collected in this run.
mamikos.shape[0]

3605

In [6]:
# `index` is a boolean flag in DataFrame.to_csv: the former `index="_id"` was
# merely truthy and wrote the positional index as an unnamed first column
# (read back as "Unnamed: 0"). `_id` is already a regular column, so the
# index is not written at all.
mamikos.to_csv("mamikos_jakarta-timur.csv", index=False)

In [7]:
# Load every saved scrape. The names on the left and the files on the right
# are listed in the same order. Only "mamikos_jakarta-timur.csv" is written
# by this notebook as shown; the other files must already exist on disk.
(
    jakarta_41_150,
    jakarta,
    jakarta_semua_putra_putri,
    pusat,
    barat_semua,
    barat_bukan_semua,
    selatan,
    timur,
    utara,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "mamikos_41-150.csv",
        "mamikos_jakarta.csv",
        "mamikos_jakarta_semua_putra_putri.csv",
        "mamikos_jakarta-pusat.csv",
        "mamikos_jakarta-barat_semua.csv",
        "mamikos_jakarta-barat_bukan-semua.csv",
        "mamikos_jakarta-selatan.csv",
        "mamikos_jakarta-timur.csv",
        "mamikos_jakarta-utara.csv",
    )
)

In [21]:
# Stack all saved scrapes and keep the first occurrence of every listing.
# - ignore_index=True gives the combined frame unique row labels instead of
#   repeating each file's 0..n labels.
# - errors="ignore" keeps the cleanup working when no input carries the
#   stray "Unnamed: 0" column (CSV files written without their index).
mamikos = (
    pd.concat(
        [
            jakarta_41_150,
            jakarta,
            jakarta_semua_putra_putri,
            pusat,
            barat_semua,
            barat_bukan_semua,
            selatan,
            timur,
            utara,
        ],
        ignore_index=True,
    )
    .drop_duplicates(subset=["_id"])
    .drop(columns=["Unnamed: 0"], errors="ignore")
)