In [4]:
# %pip freeze > requirements.txt
# %pip install -r requirements.txt

In [5]:
import json
import os
import re
import sqlite3
import time
import urllib.parse
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# logging.basicConfig(filename='error.log', level=logging.ERROR)

In [6]:
def remove_spaces(input_string):
    """Return ``input_string`` with every ``" "`` character removed.

    Only the plain space character is dropped; tabs, newlines and any other
    whitespace are kept as they are.
    """
    # Splitting on a single space and re-joining drops exactly the spaces.
    return "".join(input_string.split(" "))

def create_search_link(query: str, lang, geo_coordinates, zoom):
    """Assemble a Google Maps search URL for ``query``.

    Args:
        query: Search phrase; encoded with ``urllib.parse.quote_plus``.
        lang: Value for the ``hl`` query parameter, or ``None`` to leave
            ``hl`` out of the URL.
        geo_coordinates: Coordinate text placed after ``/@`` (its spaces are
            removed first), or ``None`` for no location segment. Any
            non-``None`` value adds the segment, so an empty string together
            with ``zoom=18`` produces ``/@,18z``.
        zoom: Zoom level appended to the location segment as ``,{zoom}z``,
            or ``None`` to leave it out.

    Returns:
        The URL as a string.

    Raises:
        ValueError: If ``zoom`` is given while ``geo_coordinates`` is ``None``.
    """
    if zoom is not None and geo_coordinates is None:
        raise ValueError("geo_coordinates must be provided along with zoom")

    pieces = ['https://www.google.com/maps/search/', urllib.parse.quote_plus(query)]

    # Insertion order is the order of the parameters in the final URL.
    query_params = {'authuser': '0'}
    if lang is not None:
        query_params['hl'] = lang
    query_params['entry'] = 'ttu'

    if geo_coordinates is not None:
        compact_coordinates = remove_spaces(geo_coordinates)
        if zoom is None:
            pieces.append(f'/@{compact_coordinates}')
        else:
            pieces.append(f'/@{compact_coordinates},{zoom}z')

    pieces.append(f'?{urllib.parse.urlencode(query_params)}')
    return ''.join(pieces)

In [7]:
def db_check(query_type, db_path="./backend/data.db"):
    """Create the result table and the ``randomized_pos`` table if missing.

    Args:
        query_type: Name of the result table for the current search type.
            It is converted with ``str()`` and quoted as an SQL identifier,
            so names containing spaces or punctuation are accepted.
        db_path: SQLite database file. Defaults to the path used by the rest
            of the notebook.

    Raises:
        sqlite3.Error: If the database cannot be opened or a statement fails.
    """
    tables = {
        str(query_type): '"ID" INTEGER PRIMARY KEY NOT NULL, "NAMA" TEXT, "KOORDINAT" TEXT, "JML_RATING" INTEGER, "ALAMAT" TEXT, "TAG_GOOGLE" TEXT, "KELURAHAN" TEXT, "KECAMATAN" TEXT, "KOTA" TEXT, "PROVINSI" TEXT, "TIPE" TEXT, "IDCARI" INTEGER, "DATA_UPDATE" DATETIME',
        "randomized_pos": '"ID" INTEGER PRIMARY KEY NOT NULL, "PROPINSI" TEXT, "KOTA" TEXT, "KECAMATAN" TEXT, "KELURAHAN" TEXT, "KODEPOS" TEXT, "DATA_UPDATE"'
    }

    connection = sqlite3.connect(db_path)
    try:
        # `with connection` only commits / rolls back; closing is done in `finally`.
        with connection:
            for table, schema in tables.items():
                # Double any embedded quote so the name is always a single identifier.
                quoted_table = '"' + table.replace('"', '""') + '"'
                connection.execute(f'CREATE TABLE IF NOT EXISTS {quoted_table} ({schema})')
    finally:
        connection.close()

In [8]:
# Earlier variant, kept commented out: it built a frame holding only the
# unique values of the KOTA column.
# df_pos = pd.read_csv('../scrape_kode_pos_indonesia/output/kode_pos.csv')
# df_pos = df_pos.fillna('-')
# df_cari = pd.DataFrame(df_pos['KOTA'].unique(), columns=['KOTA'])

# Load the postal-code CSV with every column read as a string.
df_cari = pd.read_csv('../scrape_kode_pos_indonesia/output/kode_pos.csv', dtype=str)
# Shuffle all rows and renumber the index. No random_state is passed, so the
# order differs on every run.
df_cari = df_cari.sample(frac=1).reset_index(drop=True) # randomized order

# Check the database; if randomized_pos is empty, fill it with the shuffled rows.
def random_pos_check(df=None, db_path='./backend/data.db'):
    """Populate ``randomized_pos`` once, from the first five columns of a frame.

    Nothing is written when the table already holds at least one row.

    Args:
        df: DataFrame whose columns 0-4 are stored, in that order, as
            PROPINSI, KOTA, KECAMATAN, KELURAHAN and KODEPOS. When ``None``
            the module-level ``df_cari`` is used.
        db_path: SQLite database file. Defaults to the path used by the rest
            of the notebook.

    Raises:
        sqlite3.Error: If ``randomized_pos`` does not exist or an insert
            fails (for example because ``df`` has fewer than five columns).
    """
    source = df_cari if df is None else df
    insert_sql = 'INSERT INTO randomized_pos (PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?)'

    connection = sqlite3.connect(db_path)
    try:
        # `with connection` commits on success and rolls back on error;
        # the connection itself is closed in `finally`.
        with connection:
            count = connection.execute('SELECT COUNT(*) FROM randomized_pos').fetchone()[0]
            if count != 0:
                return

            # Walk the rows once as plain tuples instead of doing five
            # positional lookups per row; each row still gets its own timestamp.
            rows = (
                (*values, datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
                for values in source.iloc[:, :5].itertuples(index=False, name=None)
            )
            connection.executemany(insert_sql, rows)
    finally:
        connection.close()

In [9]:
def clean_table_name(jenis, filter_wilayah=''):
    """Derive the result-table name for a search type.

    Spaces in ``jenis`` become underscores. When ``filter_wilayah`` is truthy
    (for example a non-empty list of conditions) ``_filtered`` is appended.
    """
    base_name = jenis.replace(' ', '_')
    if filter_wilayah:
        return base_name + '_filtered'
    return base_name

In [10]:
def create_new_df_cari(jenis_table, filter_wilayah='', db_path='backend/data.db'):
    """Return the ``randomized_pos`` rows that still have to be searched.

    The resume point is the ``IDCARI`` of the row with the highest ``ID`` in
    ``jenis_table``. It falls back to 0 when that table is missing, empty,
    holds NULL there, or cannot be read.

    Args:
        jenis_table: Name of the result table for the current search type.
        filter_wilayah: Extra SQL conditions, each AND-ed onto the query.
            May be a list of fragments or a single fragment string; an empty
            value adds no filter.
            NOTE(review): the fragments are inserted into the SQL text
            verbatim, so they must come from trusted code only.
        db_path: SQLite database file. Defaults to the path used by the rest
            of the notebook.

    Returns:
        DataFrame with columns PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS
        and IDCARI (the ``ID`` of the ``randomized_pos`` row).
    """
    # A bare string is one condition, not a sequence of characters.
    if isinstance(filter_wilayah, str):
        conditions = [filter_wilayah] if filter_wilayah.strip() else []
    else:
        conditions = list(filter_wilayah) if filter_wilayah else []

    connection = sqlite3.connect(db_path)
    try:
        try:
            quoted_table = '"' + str(jenis_table).replace('"', '""') + '"'
            row = connection.execute(
                f'SELECT IDCARI FROM {quoted_table} ORDER BY ID DESC LIMIT 1'
            ).fetchone()
            last_cari = row[0] if row is not None and row[0] is not None else 0
        except sqlite3.Error:
            # Best effort, as before: an unreadable result table means "start from 0".
            last_cari = 0

        query = 'SELECT PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS, ID AS IDCARI FROM randomized_pos WHERE IDCARI > ?'
        for condition in conditions:
            # Parentheses keep an OR inside a fragment from escaping the IDCARI bound.
            query += f' AND ({condition})'

        return pd.read_sql_query(query, connection, params=(last_cari,))
    finally:
        connection.close()

In [9]:
def map_scraper(jenis, jenis_table, df_cari):
    """Scrape Google Maps search pages through an HTTP proxy into SQLite.

    For every row of ``df_cari`` a search URL is built from ``jenis`` and the
    row's kelurahan / kecamatan / kota / provinsi. The page is downloaded with
    ``requests`` through the proxy, the ``window.APP_INITIALIZATION_STATE``
    script is decoded, and each listing found is inserted into ``jenis_table``
    in ``backend/data.db``.

    Args:
        jenis: Search phrase (business type); also stored in the TIPE column.
        jenis_table: Name of the result table to insert into.
        df_cari: DataFrame read by position: columns 0-3 are provinsi, kota,
            kecamatan and kelurahan, column 5 is the search id (IDCARI).

    Returns:
        None. Progress and errors are reported with ``print``. The function
        stops early when one URL cannot be fetched after 61 attempts.
    """
    request_timeout = 30  # seconds; without it a stalled proxy blocks forever

    # proxyscrape.com
    # SECURITY NOTE(review): the literals below are credentials committed to
    # the notebook. They are kept only as a fallback so existing runs keep
    # working; set PROXYSCRAPE_USERNAME / PROXYSCRAPE_PASSWORD, then remove
    # the literals and rotate the account.
    username = os.environ.get("PROXYSCRAPE_USERNAME", "dl0kskmfsl8ssvi")
    password = os.environ.get("PROXYSCRAPE_PASSWORD", "x2z4c0y1fqnvm15")
    proxy = "172.65.64.100:6060"
    proxy_auth = "{}:{}@{}".format(username, password, proxy)
    # Built once under its own name, so `proxy` keeps holding the host string.
    proxies = {"https": "http://{}".format(proxy_auth)}

    quoted_table = '"' + str(jenis_table).replace('"', '""') + '"'
    insert_sql = f'INSERT INTO {quoted_table} (NAMA, KOORDINAT, JML_RATING, ALAMAT, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'

    for i in range(0, len(df_cari)):
        total_time = time.time()
        row = df_cari.iloc[i]
        provinsi = row.iloc[0]
        kota = row.iloc[1]
        kecamatan = row.iloc[2]
        kelurahan = row.iloc[3]
        idcari = int(row.iloc[5])
        cari = f'{jenis} in {kelurahan}, {kecamatan}, {kota}, {provinsi}'
        url_cari = create_search_link(cari, None, '', 18)

        retry_count = 0
        while retry_count <= 60:
            try:
                response = requests.get(url_cari, proxies=proxies, timeout=request_timeout)
                if response.status_code == 200:
                    break
            except Exception as e:
                print(e)
                print('Proxy gagal, mencoba proxy lain')

            retry_count += 1

        if retry_count > 60:
            print('Seluruh proxy gagal')
            break

        soup_cari = BeautifulSoup(response.text, 'html.parser')

        # Start the counter for every query, so a page without the expected
        # script reports 0 rows instead of a NameError or a stale count.
        a = 1
        for script in soup_cari.find_all('script'):
            if 'window.APP_INITIALIZATION_STATE' not in str(script):
                continue

            # The script text is sliced down to the JSON payload; the search
            # results are a second JSON document nested inside it as a string.
            data = str(script).split('=', 3)[3]
            data2 = data.rsplit(';', 10)[0]
            json_data = json.loads(data2)
            usaha = json_data[3][2][5:]
            json_usaha = json.loads(usaha)

            # NOTE(review): listings are read from index 1 onward — presumably
            # index 0 is not a listing; confirm against a live payload.
            a = 1
            while True:
                try:
                    listing = json_usaha[0][1][a][14]
                    nama = listing[11]
                    koordinat = ', '.join(list(map(str, listing[9][-2:])))
                    alamat = ', '.join(listing[2])
                    try:
                        rating = listing[4][3][1]
                        index_of_space = rating.find(" ")
                        rating_int = int(rating[:index_of_space])
                    except Exception:
                        rating_int = 0
                    updatetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                    try:
                        connection = sqlite3.connect('backend/data.db')
                        try:
                            # `with connection` commits; closing is explicit.
                            with connection:
                                params = (nama, koordinat, rating_int, alamat, kelurahan, kecamatan, kota, provinsi, jenis, idcari, updatetime)
                                connection.execute(insert_sql, params)
                        finally:
                            connection.close()

                    except Exception as e:
                        print(f'Error occurred: {str(e)} on kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {provinsi} index {a}')

                    a += 1

                except Exception:
                    # Any lookup failure (normally running past the last
                    # listing) ends this page.
                    break

        print(f'{jenis} di kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {provinsi} selesai diinput sebanyak {a-1} data')
        print(f'Total waktu query {time.time() - total_time}')

    print(f'Scrape {jenis} selesai')

In [10]:
def get_driver():
    """Create a headless Chrome WebDriver configured with the scraping proxy.

    The chromedriver binary is obtained through ``ChromeDriverManager``. If
    that attempt raises, the bundled binary under ``driver/`` is used instead.

    Returns:
        A ``selenium.webdriver.Chrome`` instance.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    # Chrome expects "width,height" for this switch.
    chrome_options.add_argument("--window-size=1920,1080")

    # proxyscrape.com
    # SECURITY NOTE(review): the literals below are credentials committed to
    # the notebook. They are kept only as a fallback so existing runs keep
    # working; set PROXYSCRAPE_USERNAME / PROXYSCRAPE_PASSWORD, then remove
    # the literals and rotate the account.
    username = os.environ.get("PROXYSCRAPE_USERNAME", "dl0kskmfsl8ssvi")
    password = os.environ.get("PROXYSCRAPE_PASSWORD", "x2z4c0y1fqnvm15")
    proxy = "rp.proxyscrape.com:6060"
    proxy_auth = "{}:{}@{}".format(username, password, proxy)

    prox = Proxy()
    prox.proxy_type = ProxyType.MANUAL
    prox.ssl_proxy = "http://{}".format(proxy_auth)
    # Work on a copy: DesiredCapabilities.CHROME is a shared class-level dict.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()
    prox.add_to_capabilities(capabilities)
    # chrome_options.add_argument(f'--proxy-server={proxy_auth}')

    # NOTE(review): `desired_capabilities` is a legacy keyword that newer
    # Selenium 4 releases may no longer accept — confirm against the pinned
    # Selenium version before upgrading.
    try:
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options, desired_capabilities=capabilities)
    except Exception as e:
        # Report why the managed driver was not used instead of hiding it.
        print(f'ChromeDriverManager gagal ({e}), memakai chromedriver lokal')
        driver = webdriver.Chrome(service=Service('driver/124.0.6367.207/chromedriver-win32/chromedriver.exe'), options=chrome_options, desired_capabilities=capabilities)

    return driver

In [11]:
def map_scraper_with_scrolls(jenis, jenis_table, filter_wilayah, driver):
    """Scrape Google Maps result lists with Selenium, resuming after failures.

    Each cycle asks ``create_new_df_cari`` for the searches that are still
    open, loads every search URL in ``driver``, scrolls the result feed until
    the end-of-list text appears, and inserts each parsed listing into
    ``jenis_table`` in ``backend/data.db``.

    Failure handling:
      * If the page title does not contain "Google Maps" within 10 seconds,
        the proxy is treated as failed: the driver is quit and a new one is
        created with ``get_driver()``, up to 61 times.
      * Any other exception ends the current cycle and a new cycle is
        started. After more than 60 such errors in a row the scrape is
        aborted.

    Args:
        jenis: Search phrase (business type); also stored in the TIPE column.
        jenis_table: Name of the result table to insert into.
        filter_wilayah: Passed through to ``create_new_df_cari``.
        driver: Selenium WebDriver to start with. It, and any replacement
            created here, is quit before the function returns normally.

    Returns:
        None. Progress and the final status are reported with ``print``.
    """
    max_proxy_failures = 61
    max_consecutive_errors = 60
    scroll_timeout_seconds = 120

    proxy_count = 0
    error_count = 0
    query_count = 0
    cek_proxy = ''
    driver_open = True
    finished = False

    quoted_table = '"' + str(jenis_table).replace('"', '""') + '"'
    insert_sql = f'INSERT INTO {quoted_table} (NAMA, KOORDINAT, JML_RATING, ALAMAT, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'

    while proxy_count < max_proxy_failures:
        if not driver_open:
            # The previous driver was quit after a proxy failure; keep the new one.
            driver = get_driver()
            driver_open = True
            print('Proxy baru')

        try:
            df_cari = create_new_df_cari(jenis_table, filter_wilayah)
            print(f'Ekspektasi jumlah query di cycle ini: {len(df_cari)}')
            cek_proxy = ''

            for i in range(0, len(df_cari)):
                total_time = time.time()
                row = df_cari.iloc[i]
                provinsi = row.iloc[0]
                kota = row.iloc[1]
                kecamatan = row.iloc[2]
                kelurahan = row.iloc[3]
                idcari = int(row.iloc[5])
                cari = f'{jenis} in {kelurahan}, {kecamatan}, {kota}, {provinsi}'
                url_cari = create_search_link(cari, None, '', 18)

                driver.get(url_cari)

                try:
                    WebDriverWait(driver, 10).until(EC.title_contains("Google Maps"))
                    cek_proxy = ''
                except Exception:
                    cek_proxy = 'Proxy gagal'
                    break

                try:
                    divSideBar = driver.find_element(By.CSS_SELECTOR, "div[role='feed']")
                except Exception:
                    # No result feed on the page: count the query as empty.
                    query_count += 1
                    error_count = 0
                    print(f'Query {query_count}/{len(df_cari)} kosong kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {provinsi}')
                    print(f'Total waktu {time.time() - total_time}')
                    continue

                # Scroll until the end-of-list text shows up, but never longer
                # than the timeout, so a page that never shows it cannot hang the run.
                scroll_deadline = time.time() + scroll_timeout_seconds
                while True:
                    divSideBar.send_keys(Keys.PAGE_DOWN)
                    div_html = driver.find_element(By.TAG_NAME, "html").get_attribute('outerHTML')

                    if "You've reached the end of the list." in div_html or 'Anda telah mencapai akhir daftar.' in div_html:
                        break
                    if time.time() > scroll_deadline:
                        print(f'Scroll timeout setelah {scroll_timeout_seconds} detik, data yang sudah termuat tetap diproses')
                        break

                soup_cari = BeautifulSoup(driver.page_source, 'html.parser')
                targets = soup_cari.find("div", {'role': 'feed'}).find_all('div', {'class': False})[:-1]
                targets_no_ad = [div for div in targets if div.find('div', {'jsaction': True})]

                # NOTE(review): parsing starts at index 1, so targets_no_ad[0]
                # is never stored — confirm that the first element is not a listing.
                a = 1
                while True:
                    try:
                        entry = targets_no_ad[a]
                        nama = entry.find_all("div", {'class': True})[0].find('a')['aria-label']

                        try:
                            jml_rating = int(entry.find_all("div")[17].find_all("span")[4].text.strip()[1:-1].replace(',', ''))
                        except Exception:
                            jml_rating = 0

                        alamat = entry.find_all('span', {'aria-hidden': '', 'aria-label': '', 'class': ''})[3].text.strip()

                        try:
                            tag_google = [span for span in entry.find_all('span', {'aria-label': '', 'aria-hidden': '', 'class': ''}) if not span.find('span')][0].text.strip()
                        except Exception:
                            tag_google = ''

                        coordinate = re.search(r'!3d(-?\d+\.\d+)!4d(-?\d+\.\d+)', entry.find_all("div")[0].find("a")['href'])
                        longlat = f'{coordinate.group(1)}, {coordinate.group(2)}'
                        updatetime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                        try:
                            connection = sqlite3.connect('backend/data.db')
                            try:
                                # `with connection` commits; closing is explicit.
                                with connection:
                                    params = (nama, longlat, jml_rating, alamat, tag_google, kelurahan, kecamatan, kota, provinsi, jenis, idcari, updatetime)
                                    connection.execute(insert_sql, params)
                            finally:
                                connection.close()
                        except Exception as e:
                            print(f'Error occurred: {str(e)} on kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {provinsi} index {a}')

                        a += 1

                    except Exception:
                        # Any lookup failure (normally running past the last
                        # entry) ends this page.
                        break

                query_count += 1
                error_count = 0
                print(f'Query {query_count}/{len(df_cari)} {jenis} di kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {provinsi} selesai diinput sebanyak {a-1} data')
                print(f'Total waktu {time.time() - total_time}')

        except Exception as e:
            # Start a fresh cycle as before, but report the error and give up
            # when it keeps happening instead of looping forever.
            error_count += 1
            print(f'Error occurred: {str(e)}')
            if error_count > max_consecutive_errors:
                break
            continue

        if cek_proxy == 'Proxy gagal':
            # Drop this driver and go around the loop for a new one.
            proxy_count += 1
            driver.quit()
            driver_open = False
            continue

        # The cycle ran through every open search: nothing is left to do.
        finished = True
        break

    if driver_open:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()

    if finished:
        status = f'Scrape {jenis} selesai'
    elif proxy_count >= max_proxy_failures:
        status = 'Seluruh proxy gagal'
    else:
        status = f'Scrape {jenis} dihentikan: terlalu banyak error berturut-turut'
    print(status)

In [12]:
# TODO: create a database of search types ("jenis")

In [13]:
# TODO: loop the whole pipeline over the entries of the "jenis" database

# Search phrase for this run, plus the region filter; each filter entry is
# AND-ed onto the randomized_pos query by create_new_df_cari.
jenis = 'company registry'
filter_wilayah = ['PROPINSI = "JAWA TENGAH"',]

# Result-table name derived from the search phrase ("_filtered" is appended
# because a filter is set).
jenis_table = clean_table_name(jenis, filter_wilayah)

# Make sure both tables exist, then fill randomized_pos if it is still empty.
db_check(jenis_table)
random_pos_check()

driver = get_driver() # the first driver is created outside the function so it can still be closed after a manual interrupt

# CHOOSE ONE
# NOTE(review): map_scraper reads column 5 of the frame it is given — confirm
# that the module-level df_cari has that column before enabling this line.
# map_scraper(jenis, jenis_table, df_cari)
map_scraper_with_scrolls(jenis, jenis_table, filter_wilayah, driver)

Ekspektasi jumlah query di cycle ini: 6511
Query 1/6511 company registry di kelurahan SAMBIDUWUR kecamatan TANON kota KAB. SRAGEN provinsi JAWA TENGAH selesai diinput sebanyak 1 data
Total waktu 2.2153120040893555
Query 2/6511 kosong kelurahan SUTOPATI kecamatan KAJORAN kota KAB. MAGELANG provinsi JAWA TENGAH
Total waktu 1.409064769744873


KeyboardInterrupt: 

In [None]:
# TODO: make the scraper an async function so several scrapers can run at once
    # TODO: track progress with a dedicated iteration-marker column instead of the search id (IDCARI)

In [27]:
# Close the browser window held by the module-level `driver`
# (presumably run by hand after interrupting the scrape — confirm).
driver.close()