In [127]:
# %python -m venv env
# %env\Scripts\activate

In [128]:
# %pip freeze > requirements.txt
# %pip install -r requirements.txt

In [129]:
from bs4 import BeautifulSoup
from datetime import datetime

from webdriver_manager.chrome import ChromeDriverManager

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import json
import pandas as pd
import pymysql
import re
import requests
import sqlite3
import time
import urllib.parse

# logging.basicConfig(filename='error.log', level=logging.ERROR)

In [130]:
def clean_table_name(jenis, filter_wilayah=''):
    """Build the result-table name for a business type and optional region filter.

    Spaces are removed from ``jenis``; each non-empty region level is appended
    in the order PROPINSI, KOTA, KECAMATAN, KELURAHAN, with its spaces removed,
    lower-cased, and prefixed by an underscore.

    Args:
        jenis: Business type / search keyword (e.g. ``'company registry'``).
        filter_wilayah: Mapping with the optional keys ``'PROPINSI'``,
            ``'KOTA'``, ``'KECAMATAN'`` and ``'KELURAHAN'``. A falsy value
            (the default ``''``, ``None`` or ``{}``) means "no region filter".

    Returns:
        The table name as a string.
    """
    jenis_table = jenis.replace(' ', '')

    if not filter_wilayah:
        # The default '' cannot be indexed with a string key; without this
        # guard every call relying on the default raised TypeError.
        return jenis_table

    for level in ('PROPINSI', 'KOTA', 'KECAMATAN', 'KELURAHAN'):
        # Missing or empty levels are simply skipped.
        region = (filter_wilayah.get(level) or '').replace(' ', '').lower()
        if region:
            jenis_table += f'_{region}'

    return jenis_table

In [131]:
def db_check(database_type, table_name):
    """Create the result table and the ``randomized_pos`` table if they are missing.

    Args:
        database_type: ``'sqlite'`` (file ``./backend/data.db``) or ``'mariadb'``
            (credentials read from ``authentication/mariadb`` as the
            comma-separated fields host, port, user, password, database).
            Matching is case-insensitive; any other value only prints a message.
        table_name: Name of the result table. It is interpolated into the DDL
            statement because identifiers cannot be bound as parameters, so it
            must come from trusted code.
    """
    backend = database_type.lower()

    if backend == 'sqlite':
        tables = {
            f"{table_name}": '"ID" INTEGER PRIMARY KEY NOT NULL, "NAMA" TEXT, "KOORDINAT" TEXT, "ALAMAT" TEXT, "RATING" REAL, "JML_RATING" INTEGER, "TAG_GOOGLE" TEXT, "KELURAHAN" TEXT, "KECAMATAN" TEXT, "KOTA" TEXT, "PROVINSI" TEXT, "TIPE" TEXT, "IDCARI" INTEGER, "DATA_UPDATE" DATETIME',
            # DATA_UPDATE is declared DATETIME to match the MariaDB schema and
            # the result table; existing databases are left untouched.
            "randomized_pos": '"ID" INTEGER PRIMARY KEY NOT NULL, "PROPINSI" TEXT, "KOTA" TEXT, "KECAMATAN" TEXT, "KELURAHAN" TEXT, "KODEPOS" TEXT, "DATA_UPDATE" DATETIME'
        }
        connection = sqlite3.connect("./backend/data.db")
        try:
            # `with connection` only commits or rolls back the transaction;
            # the connection itself has to be closed explicitly.
            with connection:
                for table, schema in tables.items():
                    connection.execute(f'CREATE TABLE IF NOT EXISTS {table} ({schema})')
        finally:
            connection.close()

    elif backend == 'mariadb':
        with open('authentication/mariadb', 'r') as credentials_file:
            # strip() drops a trailing newline that would otherwise end up in
            # the last field (the database name).
            host, port, user, password, database = [
                field.replace(' ', '').strip()
                for field in credentials_file.read().split(',')
            ]
        connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

        try:
            tables = {
                f'{table_name}': 'ID INT AUTO_INCREMENT PRIMARY KEY, NAMA TEXT, KOORDINAT TEXT, ALAMAT TEXT, RATING FLOAT, JML_RATING INT, TAG_GOOGLE TEXT, KELURAHAN TEXT, KECAMATAN TEXT, KOTA TEXT, PROVINSI TEXT, TIPE TEXT, IDCARI INT, DATA_UPDATE DATETIME',
                'randomized_pos': 'ID INT AUTO_INCREMENT PRIMARY KEY, PROPINSI TEXT, KOTA TEXT, KECAMATAN TEXT, KELURAHAN TEXT, KODEPOS TEXT, DATA_UPDATE DATETIME'
            }
            with connection.cursor() as cursor:
                for table, schema in tables.items():
                    cursor.execute(f'CREATE TABLE IF NOT EXISTS {table} ({schema})')
            connection.commit()
        finally:
            connection.close()

    else:
        print('Database tidak dikenal')

In [132]:
# df_pos = pd.read_csv('../scrape_kode_pos_indonesia/output/kode_pos.csv')
# df_pos = df_pos.fillna('-')
# df_cari = pd.DataFrame(df_pos['KOTA'].unique(), columns=['KOTA'])

# Check the database: if randomized_pos is empty, fill it with the shuffled postal-code rows
def random_pos_check(database_type):
    """Seed ``randomized_pos`` with shuffled postal-code rows when it is empty.

    The postal-code CSV is only read when the table actually needs seeding, so
    repeated runs against a populated database do no CSV work at all.

    Args:
        database_type: ``'sqlite'`` (file ``./backend/data.db``) or ``'mariadb'``
            (credentials read from ``authentication/mariadb``). Matching is
            case-insensitive; any other value only prints a message.
    """
    def load_shuffled_rows():
        """Read the postal-code CSV and return shuffled tuples ready for insertion."""
        df_cari = pd.read_csv('../scrape_kode_pos_indonesia/output/kode_pos.csv', dtype=str)
        df_cari = df_cari.sample(frac=1).reset_index(drop=True)  # randomized order
        df_cari = df_cari.fillna('')
        update_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        source_columns = ['PROPINSI', 'KOTA', 'KECAMATAN', 'KELURAHAN', 'KODE POS']
        return [
            (*row, update_time)
            for row in df_cari[source_columns].itertuples(index=False, name=None)
        ]

    backend = database_type.lower()

    if backend == 'sqlite':
        connection = sqlite3.connect('./backend/data.db')
        try:
            # `with connection` commits on success; closing is done in `finally`.
            with connection:
                count = connection.execute('SELECT COUNT(*) FROM randomized_pos').fetchone()[0]
                if count == 0:
                    query = ('INSERT INTO randomized_pos (PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?)')
                    connection.executemany(query, load_shuffled_rows())
        finally:
            connection.close()

    elif backend == 'mariadb':
        with open('authentication/mariadb', 'r') as credentials_file:
            # strip() drops a trailing newline from the last field.
            host, port, user, password, database = [
                field.replace(' ', '').strip()
                for field in credentials_file.read().split(',')
            ]
        connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

        try:
            with connection.cursor() as cursor:
                cursor.execute('SELECT COUNT(*) FROM randomized_pos')
                # DictCursor rows are keyed by the selected expression.
                count = cursor.fetchone()['COUNT(*)']
                if count == 0:
                    query = ('INSERT INTO randomized_pos (PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS, DATA_UPDATE) VALUES (%s, %s, %s, %s, %s, %s)')
                    cursor.executemany(query, load_shuffled_rows())

            connection.commit()

        finally:
            connection.close()

    else:
        print('Database tidak dikenal')

In [133]:
def create_new_df_cari(database_type, jenis, filter_wilayah=''):
    """Return the regions that still have to be searched, in ``randomized_pos`` ID order.

    The resume point is the ``IDCARI`` of the most recently inserted row of the
    result table (0 when that table is missing or empty); only
    ``randomized_pos`` rows with a larger ID are returned, optionally narrowed
    by the region filter.

    Args:
        database_type: ``'sqlite'`` or ``'mariadb'`` (case-insensitive).
        jenis: Business type; together with ``filter_wilayah`` it determines
            the result-table name via ``clean_table_name``.
        filter_wilayah: Mapping with optional ``'PROPINSI'``, ``'KOTA'``,
            ``'KECAMATAN'`` and ``'KELURAHAN'`` values; empty or missing levels
            are not filtered on.

    Returns:
        A DataFrame with the columns PROPINSI, KOTA, KECAMATAN, KELURAHAN,
        KODEPOS, IDCARI (callers index these by position). It is empty, with a
        printed message, for an unknown ``database_type``.

    Raises:
        Database errors from the main query propagate to the caller.
    """
    table_name = clean_table_name(jenis, filter_wilayah)
    backend = database_type.lower()
    columns = ['PROPINSI', 'KOTA', 'KECAMATAN', 'KELURAHAN', 'KODEPOS', 'IDCARI']

    if backend not in ('sqlite', 'mariadb'):
        print('Database tidak dikenal')
        return pd.DataFrame(columns=columns)

    # Values are bound as parameters instead of being pasted into the SQL text.
    placeholder = '?' if backend == 'sqlite' else '%s'
    conditions = [f'ID > {placeholder}']
    filter_values = []
    if filter_wilayah:
        for level in ('PROPINSI', 'KOTA', 'KECAMATAN', 'KELURAHAN'):
            value = filter_wilayah.get(level)
            if value:
                conditions.append(f'{level} = {placeholder}')
                filter_values.append(value)
    where_clause = ' AND '.join(conditions)

    # The table name is an identifier and cannot be bound as a parameter.
    last_cari_query = f'SELECT IDCARI FROM {table_name} ORDER BY ID DESC LIMIT 1'
    # ORDER BY ID: resuming from the last stored IDCARI only works when the
    # regions are processed in ascending ID order.
    query = f'SELECT PROPINSI, KOTA, KECAMATAN, KELURAHAN, KODEPOS, ID AS IDCARI FROM randomized_pos WHERE {where_clause} ORDER BY ID'

    if backend == 'sqlite':
        connection = sqlite3.connect('backend/data.db')
        try:
            try:
                row = connection.execute(last_cari_query).fetchone()
            except sqlite3.Error:
                row = None  # result table does not exist yet
            last_cari = row[0] if row and row[0] is not None else 0

            df_cari = pd.read_sql_query(query, connection, params=[last_cari, *filter_values])
        finally:
            connection.close()

        return df_cari

    with open('authentication/mariadb', 'r') as credentials_file:
        # strip() drops a trailing newline from the last field.
        host, port, user, password, database = [
            field.replace(' ', '').strip()
            for field in credentials_file.read().split(',')
        ]
    connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

    try:
        with connection.cursor() as cursor:
            try:
                cursor.execute(last_cari_query)
                row = cursor.fetchone()
            except pymysql.Error:
                row = None  # result table does not exist yet
            # DictCursor rows are dicts: indexing with [0] raised KeyError, which
            # was swallowed and silently reset the resume point to 0.
            last_cari = row['IDCARI'] if row and row['IDCARI'] is not None else 0

            cursor.execute(query, [last_cari, *filter_values])
            df_cari = pd.DataFrame(cursor.fetchall(), columns=columns)
    finally:
        connection.close()

    return df_cari

In [134]:
def remove_spaces(input_string):
    """Return ``input_string`` with every space character (``" "``) removed.

    Other whitespace such as tabs and newlines is left in place.
    """
    return "".join(input_string.split(" "))

def create_search_link(query: str, lang, geo_coordinates, zoom):
    """Build a Google Maps search URL for ``query``.

    Args:
        query: Free-text search; encoded with ``urllib.parse.quote_plus``.
        lang: Value for the ``hl`` URL parameter, or ``None`` to omit it.
        geo_coordinates: Coordinate string placed after ``/@`` with its spaces
            removed, or ``None`` to omit the segment. Note that an empty string
            is not ``None`` and therefore still produces a ``/@`` segment.
        zoom: Zoom level appended as ``,{zoom}z``, or ``None`` to omit it.

    Returns:
        The complete URL as a string.

    Raises:
        ValueError: If ``zoom`` is given while ``geo_coordinates`` is ``None``.
    """
    has_coordinates = geo_coordinates is not None
    has_zoom = zoom is not None
    if has_zoom and not has_coordinates:
        raise ValueError("geo_coordinates must be provided along with zoom")

    url_parts = ['https://www.google.com/maps/search/', urllib.parse.quote_plus(query)]

    # Insertion order fixes the parameter order in the URL: authuser, hl, entry.
    url_params = {'authuser': '0'}
    if lang is not None:
        url_params['hl'] = lang
    url_params['entry'] = 'ttu'

    if has_coordinates:
        compact_coordinates = geo_coordinates.replace(" ", "")
        if has_zoom:
            url_parts.append(f'/@{compact_coordinates},{zoom}z')
        else:
            url_parts.append(f'/@{compact_coordinates}')

    url_parts.append(f'?{urllib.parse.urlencode(url_params)}')
    return ''.join(url_parts)

In [135]:
def proxy_auth(proxy_name):
    """Read proxy credentials from ``authentication/<proxy_name>``.

    The file must hold exactly three comma-separated fields: user, password and
    domain. Spaces are removed from each field and surrounding whitespace
    (including a trailing newline) is stripped.

    Args:
        proxy_name: File name inside the ``authentication`` directory.

    Returns:
        Tuple ``(user, password, domain)``.

    Raises:
        FileNotFoundError: If the credential file does not exist.
        ValueError: If the file does not contain exactly three fields.
    """
    # The context manager closes the file handle, which was previously leaked.
    with open(f'authentication/{proxy_name}', 'r') as credentials_file:
        fields = credentials_file.read().split(',')

    # strip() keeps a trailing newline out of `domain`, where it would corrupt
    # the proxy URL built by the callers.
    user, password, domain = [field.replace(' ', '').strip() for field in fields]
    return user, password, domain

In [136]:
def map_scraper(database_type, jenis, filter_wilayah, proxy=''):
    """Scrape the first Google Maps result page for every pending region.

    For each row returned by ``create_new_df_cari`` a search URL is fetched with
    ``requests``, the embedded ``window.APP_INITIALIZATION_STATE`` payload is
    parsed, and the businesses found are inserted into the result table.

    Args:
        database_type: ``'sqlite'`` or ``'mariadb'`` (case-insensitive).
        jenis: Business type / search keyword.
        filter_wilayah: Region filter mapping (PROPINSI, KOTA, KECAMATAN,
            KELURAHAN).
        proxy: Truthy to route requests through the proxy whose credentials are
            stored in ``authentication/proxyscrape``.
    """
    max_proxy_attempts = 11  # same number of tries as the original 0..10 counter
    request_timeout = 30  # seconds; requests waits forever without a timeout

    proxy_detail = None
    if proxy:
        user, password, domain = proxy_auth('proxyscrape')
        proxy_insert = f"{user}:{password}@{domain}"
        proxy_detail = {
                "https":f"http://{proxy_insert}"
            }

    df_cari = create_new_df_cari(database_type, jenis, filter_wilayah)
    table_name = clean_table_name(jenis, filter_wilayah)
    backend = database_type.lower()
    total = len(df_cari)

    for i in range(total):
        start_time = time.time()
        dbtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        row = df_cari.iloc[i]
        propinsi = row.iloc[0]
        kota = row.iloc[1]
        kecamatan = row.iloc[2]
        kelurahan = row.iloc[3]
        idcari = int(row.iloc[5])
        cari = f'{jenis} in {kelurahan}, {kecamatan}, {kota}, {propinsi}'
        url_cari = create_search_link(cari, None, '', 18)

        response = None
        if proxy:
            for _ in range(max_proxy_attempts):
                try:
                    candidate = requests.get(url_cari, proxies=proxy_detail, timeout=request_timeout)
                except Exception as e:
                    print(e)
                    print('Proxy gagal, mencoba proxy lain')
                    continue
                if candidate.status_code == 200:
                    response = candidate
                    break

            # The original compared the counter against 60 although it could
            # never exceed 11, so exhausted retries went undetected.
            if response is None:
                print('Seluruh proxy gagal')
                break
        else:
            response = requests.get(url_cari, timeout=request_timeout)

        soup_cari = BeautifulSoup(response.text, 'html.parser')
        values = []

        for script in soup_cari.find_all('script'):
            script_text = str(script)
            # Keep looking until the script holding the state payload is found;
            # previously the loop stopped unconditionally after the first tag.
            if 'window.APP_INITIALIZATION_STATE' not in script_text:
                continue

            data = script_text.split('=',3)[3]
            data2 = data.rsplit(';',10)[0]
            json_data = json.loads(data2)
            usaha = json_data[3][2][5:]  # skip the 5-character prefix before the JSON text
            json_usaha = json.loads(usaha)

            a = 1  # entries are read from index 1 onwards
            while True:
                # The first entry that cannot be read marks the end of the list.
                try:
                    detail = json_usaha[0][1][a][-1]
                    nama = detail[11]
                    koordinat = ', '.join(list(map(str, detail[9][-2:])))
                    alamat = ', '.join(detail[2])
                    tag_google = ', '.join(detail[13])
                except Exception:
                    break

                # `except Exception` instead of a bare `except` so that
                # KeyboardInterrupt can still stop a long scrape.
                try:
                    rating = float(detail[4][-2])
                except Exception:
                    rating = 0

                try:
                    jml_rating = int(detail[4][-1])
                except Exception:
                    jml_rating = 0

                values.append((nama, koordinat, alamat, rating, jml_rating, tag_google, kelurahan, kecamatan, kota, propinsi, jenis, idcari, dbtime))
                a += 1
            break

        if values:
            if backend == 'sqlite':
                query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
                connection = sqlite3.connect('backend/data.db')
                try:
                    # `with connection` commits; closing happens in `finally`.
                    with connection:
                        connection.executemany(query, values)
                finally:
                    connection.close()

            elif backend == 'mariadb':
                query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                with open('authentication/mariadb', 'r') as credentials_file:
                    host, port, user, password, database = [
                        field.replace(' ', '').strip()
                        for field in credentials_file.read().split(',')
                    ]
                connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

                try:
                    with connection.cursor() as cursor:
                        cursor.executemany(query, values)
                    connection.commit()
                except Exception as e:
                    print(e)
                finally:
                    connection.close()

            else:
                print('Database tidak dikenal')

        # len(values) is always defined, unlike the loop counter `a`, which was
        # unbound whenever no script matched.
        print(f'Query {i+1}/{total} {jenis} di kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {propinsi} selesai diinput sebanyak {len(values)} data')
        print(f'Total waktu query {time.time() - start_time}')

    print(f'Scrape {jenis} selesai')

In [137]:
def get_driver(proxy=None):
    """Create a headless Chrome WebDriver, optionally routed through the proxy.

    Args:
        proxy: Truthy to configure the proxy whose credentials are stored in
            ``authentication/proxyscrape`` (read via ``proxy_auth``).

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for shutting
        it down.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    # NOTE(review): Chrome documents this switch as "width,height"; confirm that
    # the "x" separator is actually honoured.
    chrome_options.add_argument("--window-size=1920x1080")

    chrome_options.add_argument("--log-level=3")
    chrome_options.add_argument("--silent")

    # Work on a copy: DesiredCapabilities.CHROME is a dict shared through the
    # class, and add_to_capabilities() below writes into it. Without the copy a
    # single proxied call would leak the proxy settings into every later call,
    # including those made with proxy=None.
    capabilities = webdriver.DesiredCapabilities.CHROME.copy()

    if proxy:
        user, password, domain = proxy_auth('proxyscrape')
        proxy_insert = f"{user}:{password}@{domain}"

        prox = Proxy()
        prox.proxy_type = ProxyType.MANUAL
        prox.ssl_proxy = f"http://{proxy_insert}"

        prox.add_to_capabilities(capabilities)

    # NOTE(review): the `desired_capabilities` keyword ties this code to the
    # Selenium release the project was written against — confirm before upgrading.
    try:
        # Preferred: let webdriver_manager download a matching chromedriver.
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options, desired_capabilities=capabilities)
    except Exception:
        # Fallback: the chromedriver binary bundled with the project.
        driver = webdriver.Chrome(service=Service('driver/124.0.6367.207/chromedriver-win32/chromedriver.exe'), options=chrome_options, desired_capabilities=capabilities)

    return driver

In [138]:
def map_scraper_with_scrolls(database_type, jenis, filter_wilayah, proxy, scroll_timeout=300):
    """Scrape Google Maps results by scrolling the result feed in headless Chrome.

    The pending regions are processed in cycles. When the page title check fails
    (treated as a proxy failure) the browser is restarted and the remaining
    regions are reloaded from the database, for at most 10 cycles.

    Args:
        database_type: ``'sqlite'`` or ``'mariadb'`` (case-insensitive).
        jenis: Business type / search keyword.
        filter_wilayah: Region filter mapping (PROPINSI, KOTA, KECAMATAN,
            KELURAHAN).
        proxy: Passed to ``get_driver``; truthy to use the proxy.
        scroll_timeout: Maximum number of seconds spent scrolling one result
            feed before parsing whatever has loaded. ``None`` disables the limit
            (the original behaviour, which never returns if the end-of-list
            message does not appear).
    """
    max_cycles = 10
    end_of_list_markers = ("You've reached the end of the list.", 'Anda telah mencapai akhir daftar.')
    table_name = clean_table_name(jenis, filter_wilayah)
    backend = database_type.lower()

    proxy_count = 0
    cek_proxy = ''
    completed = False

    while proxy_count < max_cycles and not completed:
        df_cari = create_new_df_cari(database_type, jenis, filter_wilayah)
        total = len(df_cari)
        print(f'Ekspektasi jumlah query di cycle ini: {total}')

        driver = get_driver(proxy)

        try:
            for i in range(total):
                start_time = time.time()
                dbtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                row = df_cari.iloc[i]
                propinsi = row.iloc[0]
                kota = row.iloc[1]
                kecamatan = row.iloc[2]
                kelurahan = row.iloc[3]
                idcari = int(row.iloc[5])
                cari = f'{jenis} in {kelurahan}, {kecamatan}, {kota}, {propinsi}'
                url_cari = create_search_link(cari, None, '', 18)

                driver.get(url_cari)

                try:
                    WebDriverWait(driver, 10).until(EC.title_contains("Google Maps"))
                    cek_proxy = ''
                except Exception:
                    # Page did not load as expected: restart with a new browser.
                    cek_proxy = 'Proxy gagal'
                    break

                try:
                    divSideBar = driver.find_element(By.CSS_SELECTOR, "div[role='feed']")
                except Exception:
                    print(f'Query {i+1}/{total} kosong kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {propinsi}')
                    print(f'Total waktu {time.time() - start_time}')
                    continue

                # Scroll until the end-of-list message shows up or time runs out.
                scroll_deadline = None if scroll_timeout is None else time.time() + scroll_timeout
                while scroll_deadline is None or time.time() < scroll_deadline:
                    divSideBar.send_keys(Keys.PAGE_DOWN)
                    div_html = driver.find_element(By.TAG_NAME, "html").get_attribute('outerHTML')
                    if any(marker in div_html for marker in end_of_list_markers):
                        break

                soup_cari = BeautifulSoup(driver.page_source, 'html.parser')
                targets = soup_cari.find("div", {'role': 'feed'}).find_all('div', {'class': False})[:-1]
                targets_no_ad = [div for div in targets if div.find('div', {'jsaction':True})]

                values = []
                for target in targets_no_ad:
                    try:
                        nama = target.find_all("div", {'class':True})[0].find('a')['aria-label']
                        coordinate = re.search(r'!3d(-?\d+\.\d+)!4d(-?\d+\.\d+)', target.find_all("div")[0].find("a")['href'])
                        koordinat = f'{coordinate.group(1)}, {coordinate.group(2)}'
                        # Leaf spans without aria/class attributes: index 0 is
                        # used as the Google tag, index 1 as the address.
                        plain_spans = [span for span in target.find_all('span', {'aria-hidden':'', 'aria-label':'', 'class':''}) if not span.find('span')]
                        alamat = plain_spans[1].text.strip()
                        rating = float(target.find_all('span')[4].find_all('span')[0].text.strip().replace(',','.'))
                    except Exception:
                        # Skip only this entry; previously one unparsable entry
                        # discarded every entry after it.
                        continue

                    try:
                        jml_rating = int(target.find_all("div")[17].find_all("span")[4].text.strip()[1:-1].replace(',',''))
                    except Exception:
                        jml_rating = 0

                    tag_google = plain_spans[0].text.strip()

                    values.append((nama, koordinat, alamat, rating, jml_rating, tag_google, kelurahan, kecamatan, kota, propinsi, jenis, idcari, dbtime))

                if values:
                    if backend == 'sqlite':
                        query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
                        connection = sqlite3.connect('backend/data.db')
                        try:
                            # `with connection` commits; closing happens in `finally`.
                            with connection:
                                connection.executemany(query, values)
                        finally:
                            connection.close()

                    elif backend == 'mariadb':
                        query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                        with open('authentication/mariadb', 'r') as credentials_file:
                            host, port, user, password, database = [
                                field.replace(' ', '').strip()
                                for field in credentials_file.read().split(',')
                            ]
                        connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

                        try:
                            with connection.cursor() as cursor:
                                cursor.executemany(query, values)
                            connection.commit()
                        except Exception as e:
                            print(e)
                        finally:
                            connection.close()

                    else:
                        print('Database tidak dikenal')

                # len(values) is the number of rows stored; the old `a+1` over-counted by one.
                print(f'Query {i+1}/{total} {jenis} di kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {propinsi} selesai diinput sebanyak {len(values)} data')
                print(f'Total waktu {time.time() - start_time}')
            else:
                # Reached only when the loop was not interrupted by a proxy
                # failure; also covers an empty work list, where `i` is never bound.
                completed = True

        finally:
            proxy_count += 1
            try:
                # quit() ends the session and the chromedriver process;
                # close() would only close the current window.
                driver.quit()
            except Exception:
                pass

    if not completed and cek_proxy == 'Proxy gagal':
        status = 'Seluruh proxy gagal'
    else:
        status = f'Scrape {jenis} selesai'

    print(status)

In [139]:
def map_scraper_with_scrolls_deep(database_type, jenis, filter_wilayah, proxy, scroll_timeout=300):
    """Scrape Google Maps results by scrolling the feed and visiting each place page.

    Works like ``map_scraper_with_scrolls`` for the result feed, then fetches
    the link of every feed entry with ``requests`` and reads the business data
    from the ``window.APP_INITIALIZATION_STATE`` payload of that page.

    Args:
        database_type: ``'sqlite'`` or ``'mariadb'`` (case-insensitive).
        jenis: Business type / search keyword.
        filter_wilayah: Region filter mapping (PROPINSI, KOTA, KECAMATAN,
            KELURAHAN).
        proxy: Truthy to use the proxy from ``authentication/proxyscrape`` for
            both the browser and the detail requests.
        scroll_timeout: Maximum number of seconds spent scrolling one result
            feed before parsing whatever has loaded. ``None`` disables the limit.
    """
    max_cycles = 10
    request_timeout = 30  # seconds; requests waits forever without a timeout
    end_of_list_markers = ("You've reached the end of the list.", 'Anda telah mencapai akhir daftar.')
    table_name = clean_table_name(jenis, filter_wilayah)
    backend = database_type.lower()

    # Credentials are only read when a proxy is requested, so running without a
    # proxy no longer requires the credential file to exist.
    proxy_detail = None
    if proxy:
        user, password, domain = proxy_auth('proxyscrape')
        proxy_insert = f"{user}:{password}@{domain}"
        proxy_detail = {
                "https":f"http://{proxy_insert}"
            }

    proxy_count = 0
    cek_proxy = ''
    completed = False

    while proxy_count < max_cycles and not completed:
        df_cari = create_new_df_cari(database_type, jenis, filter_wilayah)
        total = len(df_cari)
        print(f'Ekspektasi jumlah query di cycle ini: {total}')

        driver = get_driver(proxy)

        try:
            for i in range(total):
                start_time = time.time()
                dbtime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                row = df_cari.iloc[i]
                propinsi = row.iloc[0]
                kota = row.iloc[1]
                kecamatan = row.iloc[2]
                kelurahan = row.iloc[3]
                idcari = int(row.iloc[5])
                cari = f'{jenis} in {kelurahan}, {kecamatan}, {kota}, {propinsi}'
                url_cari = create_search_link(cari, None, '', 18)

                driver.get(url_cari)

                try:
                    WebDriverWait(driver, 10).until(EC.title_contains("Google Maps"))
                    cek_proxy = ''
                except Exception:
                    # Page did not load as expected: restart with a new browser.
                    cek_proxy = 'Proxy gagal'
                    break

                try:
                    divSideBar = driver.find_element(By.CSS_SELECTOR, "div[role='feed']")
                except Exception:
                    print(f'Query {i+1}/{total} kosong kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {propinsi}')
                    print(f'Total waktu {time.time() - start_time}')
                    continue

                # Scroll until the end-of-list message shows up or time runs out.
                scroll_deadline = None if scroll_timeout is None else time.time() + scroll_timeout
                while scroll_deadline is None or time.time() < scroll_deadline:
                    divSideBar.send_keys(Keys.PAGE_DOWN)
                    div_html = driver.find_element(By.TAG_NAME, "html").get_attribute('outerHTML')
                    if any(marker in div_html for marker in end_of_list_markers):
                        break

                soup_cari = BeautifulSoup(driver.page_source, 'html.parser')
                targets = soup_cari.find("div", {'role': 'feed'}).find_all('div', {'class': False})[:-1]
                targets_no_ad = [div for div in targets if div.find('div', {'jsaction':True})]

                values = []
                for target in targets_no_ad:
                    try:
                        detail_url = target.find_all('a')[0]['href']
                    except Exception:
                        continue  # entry without a link: nothing to visit

                    try:
                        if proxy:
                            response = requests.get(detail_url, proxies=proxy_detail, timeout=request_timeout)
                        else:
                            response = requests.get(detail_url, timeout=request_timeout)
                    except requests.RequestException as e:
                        # A network failure ends the detail requests for this
                        # query (as before) instead of retrying every remaining entry.
                        print(e)
                        break

                    try:
                        # Reset per entry so a page without the payload can
                        # never reuse the previous entry's data.
                        json_usaha_depth = None
                        soup_cari_depth = BeautifulSoup(response.text, 'html.parser')
                        for script_depth in soup_cari_depth.find_all('script'):
                            script_text = str(script_depth)
                            if 'window.APP_INITIALIZATION_STATE' in script_text:
                                data_depth = script_text.split('=',3)[3]
                                data2_depth = data_depth.rsplit(';',10)[0]
                                json_data_depth = json.loads(data2_depth)
                                usaha_depth = json_data_depth[3][-1][5:]  # skip the 5-character prefix
                                json_usaha_depth = json.loads(usaha_depth)
                                break

                        if json_usaha_depth is None:
                            continue

                        detail = json_usaha_depth[6]
                        nama = detail[11]
                        alamat = ', '.join(detail[2])
                        rating = float(detail[4][7])
                        jml_rating = int(detail[4][8])
                        koordinat = ', '.join(str(k) for k in detail[9][-2:])
                        tag_google = ', '.join(detail[13])
                    except Exception:
                        # Skip only this entry; previously one unparsable entry
                        # discarded every entry after it.
                        continue

                    values.append((nama, koordinat, alamat, rating, jml_rating, tag_google, kelurahan, kecamatan, kota, propinsi, jenis, idcari, dbtime))

                if values:
                    if backend == 'sqlite':
                        query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
                        connection = sqlite3.connect('backend/data.db')
                        try:
                            # `with connection` commits; closing happens in `finally`.
                            with connection:
                                connection.executemany(query, values)
                        finally:
                            connection.close()

                    elif backend == 'mariadb':
                        query = f'INSERT INTO {table_name} (NAMA, KOORDINAT, ALAMAT, RATING, JML_RATING, TAG_GOOGLE, KELURAHAN, KECAMATAN, KOTA, PROVINSI, TIPE, IDCARI, DATA_UPDATE) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
                        with open('authentication/mariadb', 'r') as credentials_file:
                            host, port, user, password, database = [
                                field.replace(' ', '').strip()
                                for field in credentials_file.read().split(',')
                            ]
                        connection = pymysql.connect(host=host, port=int(port), user=user, password=password, database=database, charset='utf8mb4', cursorclass=pymysql.cursors.DictCursor)

                        try:
                            with connection.cursor() as cursor:
                                cursor.executemany(query, values)
                            connection.commit()
                        except Exception as e:
                            print(e)
                        finally:
                            connection.close()

                    else:
                        print('Database tidak dikenal')

                print(f'Query {i+1}/{total} {jenis} di kelurahan {kelurahan} kecamatan {kecamatan} kota {kota} provinsi {propinsi} selesai diinput sebanyak {len(values)} data')
                print(f'Total waktu {time.time() - start_time}')
            else:
                # Reached only when the loop was not interrupted by a proxy
                # failure; also covers an empty work list, where `i` is never bound.
                completed = True

        finally:
            proxy_count += 1
            try:
                # quit() ends the session and the chromedriver process;
                # close() would only close the current window.
                driver.quit()
            except Exception:
                pass

    if not completed and cek_proxy == 'Proxy gagal':
        status = 'Seluruh proxy gagal'
    else:
        status = f'Scrape {jenis} selesai'

    print(status)

In [140]:
# TODO create a database table of business types (jenis)

In [141]:
# TODO loop the whole pipeline over the business types stored in the database

database = 'sqlite' # options: sqlite (local) or mariadb (online)
proxy = 'proxyscrape' # options: empty string (no proxy) or proxyscrape

jenis = 'company registry'

filter_wilayah = {'PROPINSI': 'DAERAH ISTIMEWA YOGYAKARTA',
                  'KOTA': '',
                  'KECAMATAN': '',
                  'KELURAHAN': ''}

db_check(database, clean_table_name(jenis, filter_wilayah))
random_pos_check(database)

# CHOOSE ONE
# map_scraper(database, jenis, filter_wilayah, proxy) # max 20 results per query, but faster
# map_scraper_with_scrolls(database, jenis, filter_wilayah, proxy) # max 200 results per query, but slower
map_scraper_with_scrolls_deep(database, jenis, filter_wilayah, proxy) # slowest but most accurate; also max 200 results per query, with more complete and accurate data

Ekspektasi jumlah query di cycle ini: 426
Query 1/426 kosong kelurahan NGUNUT kecamatan PLAYEN kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 2.213503122329712
Query 2/426 kosong kelurahan CANDIREJO kecamatan SEMANU kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 1.8798506259918213
Query 3/426 kosong kelurahan GETAS kecamatan PLAYEN kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 2.099151611328125
Query 4/426 kosong kelurahan GIRIPANGGUNG kecamatan TEPUS kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 1.5928709506988525
Query 5/426 kosong kelurahan TAMBAKROMO kecamatan PONJONG kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 2.0551164150238037
Query 6/426 kosong kelurahan BEJI kecamatan NGAWEN kota KAB. GUNUNGKIDUL provinsi DAERAH ISTIMEWA YOGYAKARTA
Total waktu 2.1019365787506104
Query 7/426 company registry di kelurahan SUMBERGIRI kecamatan PONJONG kota KAB. GUNUNGKIDUL provi

KeyboardInterrupt: 

In [None]:
# TODO make the scraper functions async so that several scrapers can run at once
    # TODO switch the resume check from IDCARI to a dedicated iteration-marker column

In [None]:
# %pyinstaller --onefile --paths ./env/Lib/site-packages main.py