# Scrape using BeautifulSoup and Selenium

In [1]:
# import necessary libraries
import requests
import html5lib
import time
import pandas as pd
import datetime
import re
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import ElementNotInteractableException

In [2]:
# enter URL address
url = 'https://covid19.jembranakab.go.id/data'

In [3]:
# scraping using BeautifulSoup to get the number of all buttons for kecamatan
req = requests.get(url, verify=False)
soup = bs(req.content, 'html5lib')
allButtons = len(soup.findAll('a', text='Detail'))

In [4]:
# automate browser using selenium. we use 'with' to ease ourself that we don't need to close the driver after use
with webdriver.Chrome() as driver:
# driver = webdriver.Chrome()
    print('Opening webpage.')
    driver.get(url) # open url
    driver.maximize_window() # maximize window of browser

    # wait for 2 seconds than scroll down
    time.sleep(2)
    driver.execute_script("window.scrollTo(0, 1080)")

    # wait until the button interactable
    wait = WebDriverWait(driver, 20)
    wait.until(EC.visibility_of_all_elements_located((By.XPATH, '//*[@id="contact"]/div/div[2]/div[1]/div/div[2]/a')))

    # define empty lists for storing values
    KecamatanList = []
    DesaList = []
    ProsesPantauList = []
    SelesaiPantauList = []
    ProsesRawatList = []
    SelesaiRawatList = []
    PositifList = []
    SembuhList = []
    MeninggalList = []
    TotalPantauList = []
    TotalRawatList = []

    print('Starting scraping COVID-19 values.')
    # for-loop through each button of kecamatan
    for order in tqdm(range(allButtons)):
        # define xpath for each button kecamatan
        xpath_button = '//*[@id="contact"]/div/div[2]/div[{}]/div/div[2]/a'.format(str(order+1))

        # click the button
        driver.find_element_by_xpath(xpath_button).click()

        # wait until the pop-up windows visible
        wait = WebDriverWait(driver, 20)
        wait.until(EC.visibility_of_all_elements_located((By.CLASS_NAME, 'card-body')))
        soupButton = bs(driver.page_source, 'html5lib') # then throw the page to BeautifulSoup to scrape

        # get kecamatan name shown in the pop-up
        teks_kec = soupButton.find('h2', attrs={'class':'ser-title'}).text

        # find values for each column
        DesaRaw = soupButton.findAll('td', attrs={'data-label':'Desa'})
        ProsesPantauRaw = soupButton.findAll('td', attrs={'data-label':'Proses Pemantauan'})
        SelesaiPantauRaw = soupButton.findAll('td', attrs={'data-label':'Selesai Pemantauan'})
        ProsesRawatRaw = soupButton.findAll('td', attrs={'data-label':'Proses Perawatan'})
        SelesaiRawatRaw = soupButton.findAll('td', attrs={'data-label':'Selesai Perawatan'})
        PositifRaw = soupButton.findAll('td', attrs={'data-label':'Positif COVID-19'})
        SembuhRaw = soupButton.findAll('td', attrs={'data-label':'Sembuh'})
        MeninggalRaw = soupButton.findAll('td', attrs={'data-label':'Meninggal'})

        # for-loop through each row in the columns defined above
        for desa, prosespantau, selesaipantau, prosesrawat, selesairawat, positif, sembuh, meninggal in zip(DesaRaw, ProsesPantauRaw, SelesaiPantauRaw, ProsesRawatRaw, SelesaiRawatRaw, PositifRaw, SembuhRaw, MeninggalRaw):
            DesaList.append(desa.text.title())
            ProsesPantauList.append(int(prosespantau.text))
            SelesaiPantauList.append(int(selesaipantau.text))
            ProsesRawatList.append(int(prosesrawat.text))
            SelesaiRawatList.append(int(selesairawat.text))
            PositifList.append(int(positif.text))
            SembuhList.append(int(sembuh.text))
            MeninggalList.append(int(meninggal.text))

            # we append kecamatan name here in order to get the kecamatan name with the same length as DesaList
            KecamatanList.append(re.findall(r'\s\w+$', teks_kec)[0][1:])

        # for-loop through total of proses pemantauan and total of proses perawatan
        # both are extracted from the same tag so that we loop them together
        for idx, total in enumerate(soupButton.findAll('td', attrs={'data-label':'Total'})):
            if idx % 2 == 0: # total of proses pemantauan is located in the even numbers
                TotalPantauList.append(int(total.text))
            else: # total of proses perawatan is located in the odd numbers
                TotalRawatList.append(int(total.text))

        # close pop up window
        driver.find_element_by_xpath('//*[@id="myModal"]/div/div/div[1]/button').click()
        time.sleep(1)

    print('COVID-19 values scraped successfully.')

    ## SCRAPING ZONE COLOR FROM THE MAP ##
    ######################################
    driver.refresh() # refresh the webpage

    # # wait for 2 seconds
    time.sleep(2)

    driver.execute_script("window.scrollTo(0, 2100)") # scroll to the map
    time.sleep(1)

    # re-scrape the webpage in order to get fresh html
    soup = bs(driver.page_source, 'html5lib')

    # set empty lists to store zone color and its desa pair
    color = []
    desa_zone = []

    # define the elements of zoom-out button and footer of leaflet map
    # these elements are useful to drag the map
    zoom_element = driver.find_element_by_xpath('//*[@id="map"]/div[2]/div[1]/div/a[2]')
    # footer_element = driver.find_element_by_css_selector('#map > div.leaflet-control-container > div.leaflet-bottom.leaflet-right > div > a')

    # define all circles available in the map
    allCircles = soupButton.findAll('img', attrs={'class':"leaflet-marker-icon leaflet-zoom-animated leaflet-interactive"})

    print('Starting scraping zone color data in the map.')
    # for-loop through each circle
    for idx, circle in tqdm(enumerate(allCircles)):
        # define css_selector ordered by idx
        selector = '#map > div.leaflet-pane.leaflet-map-pane > div.leaflet-pane.leaflet-marker-pane > img:nth-child({})'.format(str(idx+1))
        color.append(circle.get('src')) # append the color url of the circle

        # the exceptions below are used in two error conditions (ElementNotInteractableException & ElementClickInterceptedException):
        # 1. during selenium clicking each circle, the map shown is outside of boundary so that selenium can't click the next circle
        # 2. during selenium clicking each circle, the circle can't be clicked by selenium due to overlapping with other elements
        try:
            source_element = driver.find_element_by_css_selector(selector) # define the element of circle
            source_element.click() # try to click it
        
        # if can't be clicked due to not interactable, enter the first error handling
        # where selenium is set to click the source_element (then hold) and move to zoom_element so that the next circle element can be clicked
        except ElementNotInteractableException:
            action = ActionChains(driver)
            action.click_and_hold(source_element).move_by_offset(10,0).release(zoom_element).perform()
            source_element.click()
        
        # if can't be clicked due to overlapping, enter the second error handling
        # where zoom-in button of the map is clicked so that the circle is expected to not overlap anymore
        except ElementClickInterceptedException:
            driver.find_element_by_xpath('//*[@id="map"]/div[2]/div[1]/div/a[1]').click() # zoom-in
            time.sleep(0.5) # wait until zoom-in responding
            source_element.click()
        
        # adjust the map right in the center of screen in case the exception above has moved the screen
        finally:
            driver.execute_script("window.scrollTo(0, 2100)")

        time.sleep(1) # wait until the pop-up circle responding
        desa_zone.append(driver.find_element_by_xpath('//*[@id="map"]/div[1]/div[6]/div/div[1]/div/table/tbody/tr[1]/td/b').text.title()) # scrape the desa name
        
        driver.find_element_by_xpath('//*[@id="map"]/div[1]/div[6]/div/a').click() # close pop-up zone

        # zoom-out
        driver.find_element_by_xpath('//*[@id="map"]/div[2]/div[1]/div/a[2]').click()
        time.sleep(1)  # wait until zoom-out responding

    print('Zone colors data scraped successfully.')

print('Browser closed.')

Opening webpage.
  0%|          | 0/5 [00:00<?, ?it/s]Starting scraping COVID-19 values.
100%|██████████| 5/5 [00:10<00:00,  2.11s/it]
COVID-19 values scraped successfully.
0it [00:00, ?it/s]Starting scraping zone color data in the map.
48it [01:54,  2.39s/it]
Zone colors data scraped successfully.
Browser closed.


In [6]:
# get then define list of date (update date from the web)
teks = soup.findAll('p')[0].text
tgl = re.findall(r'\d{1,2}/\d{1,2}/\d{4}', teks)[0]
tgl = datetime.datetime.strptime(tgl, '%d/%m/%Y')

TanggalList = [tgl for i in range(len(DesaList))]

In [7]:
# define list of kabupaten
KabupatenList = ['Jembrana' for i in range(len(DesaList))]

In [8]:
# combine all data
data = [TanggalList, KabupatenList, KecamatanList, DesaList, ProsesPantauList, SelesaiPantauList, TotalPantauList, ProsesRawatList, SelesaiRawatList, TotalRawatList, PositifList, SembuhList, MeninggalList]

# define column headers
headers = ['tanggal_update','kabupaten','kecamatan','desa','proses_pemantauan','selesai_pemantauan','total_pemantauan','proses_perawatan','selesai_perawatan','total_perawatan','positif_covid19','sembuh','meninggal']

In [9]:
# create a dataframe of the data
df = pd.DataFrame(data).transpose()
df.columns = headers

In [10]:
# see the zone colors
color_uniq = list(set(color))
print(color_uniq)

['https://covid19.jembranakab.go.id/img/circle.png', 'https://covid19.jembranakab.go.id/img/green.png', 'https://covid19.jembranakab.go.id/img/circle_odp.png']


<img src="https://covid19.jembranakab.go.id/img/circle_odp.png" alt="Drawing" style="width: 100px;"/>
This link https://covid19.jembranakab.go.id/img/circle_odp.png has yellow color.

<img src="https://covid19.jembranakab.go.id/img/green.png" alt="Drawing" style="width: 100px;"/>
This link https://covid19.jembranakab.go.id/img/green.png has green color.

<img src="https://covid19.jembranakab.go.id/img/circle.png" alt="Drawing" style="width: 100px;"/>
This link https://covid19.jembranakab.go.id/img/circle.png has red color.
<br></br>

Therefore, using those images, we can hard code the zone colors as defined below.

In [11]:
url2color = {color_uniq[0]:'yellow', color_uniq[1]:'green', color_uniq[2]:'red'} # a dict to convert url text to color
DesaColorDict = {des:url2color[col] for col, des in zip(color, desa_zone)} # a dict to store desa and color pairs

In [12]:
print("Which desa doesn't have zone color:\n", set(desa_zone) ^ set(df.desa.to_list()))

Which desa doesn't have zone color:
 {'Delodberawah', 'Medewi', 'Blimbingsari'}


In [30]:
# create a function to write desa zone color to dataframe
def write_zonecolor(desa):
    intersect = set([desa]).intersection(set([i for i in DesaColorDict.keys()]))
    if intersect:
        color = DesaColorDict[list(intersect)[0]]
        return color
    else:
        return ''

In [31]:
df['zona'] = df.desa.apply(write_zonecolor)

In [33]:
df.head()

Unnamed: 0,tanggal_update,kabupaten,kecamatan,desa,proses_pemantauan,selesai_pemantauan,total_pemantauan,proses_perawatan,selesai_perawatan,total_perawatan,positif_covid19,sembuh,meninggal,zona
0,2020-11-15,Jembrana,Melaya,Tuwed,0,15,15,0,2,2,2,1,1,yellow
1,2020-11-15,Jembrana,Melaya,Tukadaya,0,5,5,0,4,4,2,2,0,green
2,2020-11-15,Jembrana,Melaya,Gilimanuk,1,5,6,1,2,3,11,10,0,yellow
3,2020-11-15,Jembrana,Melaya,Blimbingsari,0,3,3,0,0,0,0,0,0,
4,2020-11-15,Jembrana,Melaya,Manistutu,0,8,8,0,4,4,9,9,0,green


In [18]:
df.shape

(51, 14)

In [None]:
df.iloc[:5, :]