#### References
* https://www.youtube.com/watch?v=C5AOZZWxvIY&ab_channel=GunturBudi
* https://stackoverflow.com/questions/58474049/how-to-scrape-websites-that-have-loaders 
* https://www.kaggle.com/code/farhanmalik371/bag-of-words-attempt
* https://www.selenium.dev/documentation/webdriver/getting_started/first_script/#2-take-action-on-browser
* https://www.youtube.com/watch?v=UOsRrxMKJYk&ab_channel=ThePyCoach

### Web Scraping Gramedia.com

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from random import randint

#### Scraping the href-links

In [63]:
# Setup Options
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# Gramedia.com URLs
url = "https://www.gramedia.com/categories/buku/komik?page="

# NOTE
# There are 50 pages of book with comic & graphic novel category
# but to demonstrate the datascrape, this program will only scrape
# the first 6 pages (120 books).

# Array of href of each book in the list of book
hrefArray = []

# Initiate WebDriver Session
driver = webdriver.Chrome(options=chrome_options)
    
# Opens The Website in Chrome
driver.get(url + str(1))

# Fetching each book href link accross 50 pages
for p in range(6):

    # Waiting the page to load for 10-15 seconds
    time.sleep(randint(10,15))
    
    # Get the elements of the website that contains the href link
    products = driver.find_elements("xpath", "//gm-product-list/div/a")
    
    # Put those link inside an array
    for product in products:
        hrefArray.append(product.get_attribute('href'))
    
    # Next Page
    nextButton = driver.find_element("xpath", '//i[@class="ion-ios-arrow-forward"]')
    nextButton.click()

driver.quit()

print("Num href-links fetched: " + str(len(hrefArray)))

Num href-links fetched: 120


#### Scraping book data by looping the href-links

In [73]:
# Function to Reformat Months in Published Date
def reformat(date):
    dateTemp = date.split(" ")
    if (dateTemp[1] == "Mei"):
        dateTemp[1] = "May"
    elif (dateTemp[1] == "Agu"):
        dateTemp[1] = "Aug"
    elif (dateTemp[1] == "Agt"):
        dateTemp[1] = "Aug"
    elif (dateTemp[1] == "Okt"):
        dateTemp[1] = "Oct"
    elif (dateTemp[1] == "Des"):
        dateTemp[1] = "Dec"
    return dateTemp[0] + " " + dateTemp[1] + " " + dateTemp[2]

# Function to Equalize the length of the array of data if an error occurred
def equalize(numProcess):
    global author, title, basePrice, description, pageNum, publisher, publishDate, weight, isbn, width, language, length
    
    if (len(author) !=  numProcess):
        author.pop(len(author) - 1)
    if (len(title) != numProcess):
        title.pop(len(title) - 1)
    if (len(basePrice) != numProcess):
        basePrice.pop(len(basePrice) - 1)
    if (len(description) != numProcess):
        description.pop(len(description) - 1)
    if (len(pageNum) != numProcess):
        pageNum.pop(len(pageNum) - 1)
    if (len(publisher) != numProcess):
        publisher.pop(len(publisher) - 1)
    if (len(publishDate) != numProcess):
        publishDate.pop(len(publishDate) - 1)
    if (len(weight) != numProcess):
        weight.pop(len(weight) - 1)
    if (len(isbn) != numProcess):
        isbn.pop(len(isbn) - 1)
    if (len(width) != numProcess):
        width.pop(len(width) - 1)
    if (len(language) != numProcess):
        language.pop(len(language) - 1)
    if (len(length) != numProcess):
        length.pop(len(length) - 1)

In [74]:
from bs4 import BeautifulSoup as bsp
import requests
import pandas as pd
from datetime import datetime
import traceback

# Arrays to contain the data
author = []
title = []
basePrice = []
description = []
pageNum = []
publisher = []
publishDate = []
weight = []
isbn = []
width = []
language = []
length = []

In [75]:
# Processing IDX
i = 0
# The purpose of this IDX is that if an exception occurred,
# the pre-processing won't restart from the first href.
# It'll instead start from where it crashed.

In [77]:
# Create New Session
driver = webdriver.Chrome(options=chrome_options)

# Scrap the book data from the href
for numProcess in range(i, len(hrefArray)):
    
    print("Process #" + str(numProcess + 1) + ":", end=" ")
    
    # Open & wait the page to load
    driver.get(hrefArray[numProcess])
    time.sleep(randint(8,12))
    
    # Get the HTML
    html = driver.page_source
    
    # Change html into Soup Class
    soup = bsp(html, "lxml")
    # print(soup.prettify())
    
    try:
        # Find Author
        authorName = soup.find("span", "title-author")
        author.append(authorName.get_text().strip())
        # print(authorName.get_text(), end=" # ")

        # Find Title
        bookTitle = soup.find("div", "book-title")
        title.append(bookTitle.get_text().strip())
        # print(bookTitle.get_text())

        # Find Base Price (Price Before Discount)
        bookPrice = soup.find("div", "price-promo")
        if (bookPrice is None):
            bookPrice = soup.find("div", "price-from")
        basePrice.append(int(bookPrice.get_text().replace("Rp","").replace(".","").replace(" ","")))

        # Find Description
        prodDesc = soup.find("div", "product-desc")
        bookDesc = prodDesc.find_all("div", "")
        description.append(bookDesc[len(bookDesc) - 1].get_text().strip())

        # Find Book Details
        bookDetail = soup.find("div", "detail-section")
        details = bookDetail.find_all("p")
        pageNum.append(int(float(details[0].get_text())))
        publisher.append(details[1].get_text())

        weight.append(float(details[3].get_text().replace("kg","").replace(" ","")))
        isbn.append(int(details[4].get_text()))
        width.append(float(details[5].get_text().replace("cm","").replace(" ","")))
        if (len(details) == 8):
            language.append(details[6].get_text().capitalize())
            length.append(float(details[7].get_text().replace("cm","").replace(" ","")))
        else:
            language.append("N/A")
            length.append(float(0.0))

        date = details[2].get_text().strip()
        publishDate.append(datetime.strptime(reformat(date), '%d %b %Y').strftime('%Y-%m-%d'))
        
        i += 1
        print("Success!")
    except Exception as e:
        if (numProcess == 41):
            hrefArray.pop(41)
            print("No Data", end = " ")
        print("Error!")
        equalize(i)
        traceback.print_exc()
        break
        

# Close The Browser
driver.quit()

Process #42: Success!
Process #43: Success!
Process #44: Success!
Process #45: Success!
Process #46: Success!
Process #47: Success!
Process #48: Success!
Process #49: Success!
Process #50: Success!
Process #51: Success!
Process #52: Success!
Process #53: Success!
Process #54: Success!
Process #55: Success!
Process #56: Success!
Process #57: Success!
Process #58: Success!
Process #59: Success!
Process #60: Success!
Process #61: Success!
Process #62: Success!
Process #63: Success!
Process #64: Success!
Process #65: Success!
Process #66: Success!
Process #67: Success!
Process #68: Success!
Process #69: Success!
Process #70: Success!
Process #71: Success!
Process #72: Success!
Process #73: Success!
Process #74: Success!
Process #75: Success!
Process #76: Success!
Process #77: Success!
Process #78: Success!
Process #79: Success!
Process #80: Success!
Process #81: Success!
Process #82: Success!
Process #83: Success!
Process #84: Success!
Process #85: Success!
Process #86: Success!
Process #8

In [79]:
# Check Number of Process with The Num of Datum
print(i)
print(len(isbn))
print(len(author))
print(len(title))
print(len(basePrice))
print(len(description))
print(len(pageNum))
print(len(publisher))
print(len(publishDate))
print(len(weight))
print(len(length))
print(len(language))

119
119
119
119
119
119
119
119
119
119
119
119


### Creating Dataframe
#### in accordance with the Relational Table

In [80]:
# 1
# Create a unique ID for each author
uniqueAuthorSet = set(author.copy())
uniqueAuthorList = list(uniqueAuthorSet)
uniqueAID = []
for i in range (len(uniqueAuthorList)):
    uniqueAID.append("AU" + str(i + 1).zfill(4))

# Create an array to fill author description column
dummy = ['by 18221171' for i in range (len(uniqueAuthorList))]
    
# Create Dictionary to make Author Relation
authDict = {'authorID':uniqueAID,
            'authorName':uniqueAuthorList,
            'description':dummy}

# Create the dataframe from dictionary
authDF = pd.DataFrame(authDict,columns = ['authorID','authorName','description'])

# Show Table
authDF

Unnamed: 0,authorID,authorName,description
0,AU0001,TSUTOMO SATO,by 18221171
1,AU0002,Ojiro Makoto,by 18221171
2,AU0003,Faza Meonk,by 18221171
3,AU0004,RYOHGO NARITA/SHINTA FUJIMOTO,by 18221171
4,AU0005,TAPPEI NAGATSUKI/MAKOTO FUGETSU/SHINICHIROU OT...,by 18221171
...,...,...,...
99,AU0100,Yearim dang,by 18221171
100,AU0101,Pionicon,by 18221171
101,AU0102,Kei Koga,by 18221171
102,AU0103,Ryo Azuki,by 18221171


In [81]:
# 2
# Create a unique ID for each publisher
uniquePublisherSet = set(publisher.copy())
uniquePublisherList = list(uniquePublisherSet)
uniquePID = []
for i in range (len(uniquePublisherList)):
    uniquePID.append("PB" + str(i + 1).zfill(4))

# Create an array to fill other columns description column
dummy = ['by 18221171' for i in range (len(uniquePublisherList))]
    
# Create Dictionary to make Author Relation
publDict = {'publisherID':uniquePID,
            'publisherName':uniquePublisherList,
            'address':dummy,
            'phone':dummy}

# Create the dataframe from dictionary
publDF = pd.DataFrame(publDict,columns = ['publisherID','publisherName','address','phone'])

# Show Table
publDF

Unnamed: 0,publisherID,publisherName,address,phone
0,PB0001,Sinar angsa,by 18221171,by 18221171
1,PB0002,Elex Media Komputindo,by 18221171,by 18221171
2,PB0003,Romancious,by 18221171,by 18221171
3,PB0004,bukune,by 18221171,by 18221171
4,PB0005,Salsabila Al-kautsar,by 18221171,by 18221171
5,PB0006,Salsabila,by 18221171,by 18221171
6,PB0007,Aria Media Mandiri,by 18221171,by 18221171
7,PB0008,Bhuana Ilmu Populer,by 18221171,by 18221171
8,PB0009,Akad x Tekad,by 18221171,by 18221171
9,PB0010,Loveable,by 18221171,by 18221171


In [82]:
# 3
# Create an array containing authorID for the book relation
authorsID = []
for a in author:
    authorsID.append("AU" + str(uniqueAuthorList.index(a) + 1).zfill(4))
    
# Create an array containing publisherID for the book relation
publisherID = []
for p in publisher:
    publisherID.append("PB" + str(uniquePublisherList.index(p) + 1).zfill(4))


# Creating Dictionary to make Dataframe
prodDict = {'isbn':isbn, 
            'authorID':authorsID,
            'title':title, 
            'basePrice':basePrice, 
            'description':description, 
            'pageNum':pageNum, 
            'publisherID':publisherID, 
            'publishDate':publishDate,
            'weight':weight,
            'width':width,
            'length':length,
            'language':language}

# print(len(isbn))
# print(len(authorsID))
# print(len(title))
# print(len(basePrice))
# print(len(description))
# print(len(pageNum))
# print(len(publisherID))
# print(len(publishDate))
# print(len(weight))
# print(len(length))
# print(len(language))

# Create the dataframes from dictionary
bookDF = pd.DataFrame(prodDict,columns = ['isbn','authorID','title','basePrice','description','pageNum','publisherID','publishDate','weight','width','length','language'])

# Show Table
bookDF

Unnamed: 0,isbn,authorID,title,basePrice,description,pageNum,publisherID,publishDate,weight,width,length,language
0,9786232423763,AU0038,One by One,129000,"Snoop, aplikasi musik baru yang berhasil merai...",444,PB0019,2023-04-24,0.410,14.0,21.0,Indonesia
1,9786022204763,AU0101,Si Juki Anak Kos London,82500,Si Juki Anak Kos London\r\n\r\n\r\n\r\nJuki di...,180,PB0013,2022-12-26,0.180,14.0,20.0,Indonesia
2,9786024554811,AU0045,World Ghost Stories Eropa,125000,"""Di Eropa, banyak sekali cerita hantu yang ber...",192,PB0008,2018-04-08,0.200,17.0,22.0,Indonesia
3,9786238019106,AU0026,Muhammad Membangun Madinah,135000,"Perjalanan hidup manusia agung, Rasul Muhammad...",289,PB0006,2023-06-23,0.240,13.0,20.0,Indonesia
4,9786230311147,AU0046,Mice Cartoon - Indonesia Seru!,85000,Setiap hari Minggu kebanyakan orang tidak pern...,152,PB0015,2023-06-19,0.200,14.8,21.0,Indonesia
...,...,...,...,...,...,...,...,...,...,...,...,...
114,9786230046926,AU0094,Radiant 04,45000,Radiant (bahasa Jepang: ラディアン Hepburn: Radian)...,184,PB0002,2023-03-31,0.150,12.0,18.0,Indonesia
115,9786026714794,AU0003,Si Juki Anak Kosan Vol 3,84000,"Wah, si Juki dan kawan-kawan kembali dengan ki...",190,PB0020,2023-03-30,0.220,14.0,20.0,Indonesia
116,9786235729503,AU0021,"My Wife`s My Lecturer, My Husband`s My Student",85000,Semua orang memiliki kisahnya sendiri-sendiri....,174,PB0014,2023-03-29,0.155,14.0,20.0,Indonesia
117,9786230310423,AU0068,AKASHA : Chainsaw Man 02,48000,"Di antara jenis buku lainnya, komik memang dis...",192,PB0015,2023-04-07,0.150,13.0,19.0,Indonesia


### Dumping into JSON

In [83]:
from json import loads, dumps

# book.json
result = bookDF.to_json(orient="table")
parsed = loads(result)
print(dumps(parsed, indent=4))

bookDF.to_json(r'C:\Users\StepHomie\Documents\Hans St\Code\Jupyter\Seleksi-2023-Tugas-1\Data Scraping\data\book.json', orient="table", indent=4)

{
    "schema": {
        "fields": [
            {
                "name": "index",
                "type": "integer"
            },
            {
                "name": "isbn",
                "type": "integer"
            },
            {
                "name": "authorID",
                "type": "string"
            },
            {
                "name": "title",
                "type": "string"
            },
            {
                "name": "basePrice",
                "type": "integer"
            },
            {
                "name": "description",
                "type": "string"
            },
            {
                "name": "pageNum",
                "type": "integer"
            },
            {
                "name": "publisherID",
                "type": "string"
            },
            {
                "name": "publishDate",
                "type": "string"
            },
            {
                "name": "weight",
                "type": "numb

In [84]:
# author.json
result = authDF.to_json(orient="table")
parsed = loads(result)
print(dumps(parsed, indent=4))

authDF.to_json(r'C:\Users\StepHomie\Documents\Hans St\Code\Jupyter\Seleksi-2023-Tugas-1\Data Scraping\data\author.json', orient="table", indent=4)

{
    "schema": {
        "fields": [
            {
                "name": "index",
                "type": "integer"
            },
            {
                "name": "authorID",
                "type": "string"
            },
            {
                "name": "authorName",
                "type": "string"
            },
            {
                "name": "description",
                "type": "string"
            }
        ],
        "primaryKey": [
            "index"
        ],
        "pandas_version": "1.4.0"
    },
    "data": [
        {
            "index": 0,
            "authorID": "AU0001",
            "authorName": "TSUTOMO SATO",
            "description": "by 18221171"
        },
        {
            "index": 1,
            "authorID": "AU0002",
            "authorName": "Ojiro Makoto",
            "description": "by 18221171"
        },
        {
            "index": 2,
            "authorID": "AU0003",
            "authorName": "Faza Meonk",
            "de

In [85]:
# publisher.json
result = publDF.to_json(orient="table")
parsed = loads(result)
print(dumps(parsed, indent=4))

publDF.to_json(r'C:\Users\StepHomie\Documents\Hans St\Code\Jupyter\Seleksi-2023-Tugas-1\Data Scraping\data\publisher.json', orient="table", indent=4)

{
    "schema": {
        "fields": [
            {
                "name": "index",
                "type": "integer"
            },
            {
                "name": "publisherID",
                "type": "string"
            },
            {
                "name": "publisherName",
                "type": "string"
            },
            {
                "name": "address",
                "type": "string"
            },
            {
                "name": "phone",
                "type": "string"
            }
        ],
        "primaryKey": [
            "index"
        ],
        "pandas_version": "1.4.0"
    },
    "data": [
        {
            "index": 0,
            "publisherID": "PB0001",
            "publisherName": "Sinar angsa",
            "address": "by 18221171",
            "phone": "by 18221171"
        },
        {
            "index": 1,
            "publisherID": "PB0002",
            "publisherName": "Elex Media Komputindo",
            "address": "by

---
#### (Draft) Scraping Test-Run & Cleaning
---

In [61]:
# Create New Session
driver = webdriver.Chrome(options=chrome_options)
    
# Open Chrome
driver.get(hrefArray[2])
time.sleep(8)

# Get the HTML
html = driver.page_source

# Change html into Soup Class
soup = bsp(html, "lxml")
print(soup.prettify())

# Close
driver.quit()

<html class="" lang="id">
 <head>
  <!-- Google Tag Manager -->
  <script async="" src="https://cdn.yellowmessenger.com/plugin/latest/dist/main.min.js" type="text/javascript">
  </script>
  <script async="" src="https://app.yellowmessenger.com/widget/main.js" type="text/javascript">
  </script>
  <script async="" src="//cdnt.netcoresmartech.com/webactivity/ADGMOT35CHFLVDHBJNIG50K96B33CLCBGFVBD608PHJ3ICES10U0.js">
  </script>
  <script async="" src="//cdnt.netcoresmartech.com/webp/ADGMOT35CHFLVDHBJNIG50K96B33CLCBGFVBD608PHJ3ICES10U0_webp.js">
  </script>
  <script async="" src="//cdnt.netcoresmartech.com/webactivity/ADGMOT35CHFLVDHBJNIG50K96B33CLCBGFVBD608PHJ3ICES10U0.js">
  </script>
  <script async="" src="//cdnt.netcoresmartech.com/webactivity/ADGMOT35CHFLVDHBJNIG50K96B33CLCBGFVBD608PHJ3ICES10U0.js">
  </script>
  <script async="" src="https://webtrafficsource.com/track/code.js">
  </script>
  <script async="" data-id="C7OGSDL6KGKTT9CM9QUG" src="https://analytics.tiktok.com/i18n/pixe

In [68]:
# Find Author
authorName = soup.find("span", "title-author")
print(authorName.get_text())

# Find Title
bookTitle = soup.find("div", "book-title")
print(bookTitle.get_text())

# Find Base Price (Price Before Discount)
basePrice = soup.find("div", "price-promo")
if (basePrice is None):
    basePrice = soup.find("div", "price-from")
print(int(basePrice.get_text().replace("Rp","").replace(".","").replace(" ","")))

# Find Description
prodDesc = soup.find("div", "product-desc")
bookDesc = prodDesc.find_all("div", "")
print(bookDesc[1].get_text().strip())

# Find Book Details
bookDetail = soup.find("div", "detail-section")
details = bookDetail.find_all("p")
print(int(details[0].get_text()))
print(details[1].get_text())
print(float(details[3].get_text().replace("kg","").replace(" ","")))
print(int(details[4].get_text()))
print(int(details[5].get_text().replace("cm","").replace(" ","")))
print(details[6].get_text().capitalize())
print(int(details[7].get_text().replace("cm","").replace(" ","")))

# Special Process for Publish Date
from datetime import datetime
print(datetime.strptime(details[2].get_text().strip(), '%d %b %Y').strftime('%Y-%m-%d'))

Yim Kang Jae
World Ghost Stories Eropa
125000
"Di Eropa, banyak sekali cerita hantu yang berkembang di sana. Selain itu, ada banyak tempat dan legenda yang tak kalah menyeramkan di sana. Tokoh hantu terkenal pun banyak yang berasal dari Eropa. Di antaranya, raksasa Frankenstein yang besar dan mengerikan, drakula pengisap darah manusia, dan lain-lain. 



Beberapa kisah atau cerita hantu di Eropa pun sering keluar dalam film atau dongeng.
Komik ini terdiri dari dua bentuk cerita, yaitu berbentuk dongeng dan komik. Dengan gaya bahasa penulisan yang seru, komik ini pasti membuat pembaca bergidik ngeri.  "
192
Bhuana Ilmu Populer
0.2
9786024554811
17
Indonesia
22
2018-04-08
