## Web Scraping Watch Products from *jamtangan.com* with Python

### Pre-requisite:

* Have Python 3.6 or newer installed (the code below uses f-strings) : https://www.python.org/downloads/windows/
* Ensure pip or anaconda is installed
* Have jupyter notebook installed : https://jupyter.org/install (if using pip) or https://anaconda.org/anaconda/jupyter (if using anaconda)
* Have Selenium WebDriver installed : https://pypi.org/project/selenium/ (if using pip) or https://anaconda.org/conda-forge/selenium (if using anaconda)
* Have Pandas installed
* Download chrome webdriver : https://chromedriver.chromium.org/downloads (make sure it supports your Chrome version!)

In [None]:
# pip install -r "../requirements.txt"
# pip install EmailMessage

### Import Necessary Libraries

In [45]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime
import csv
import json
import time
import os
from email.message import EmailMessage
import ssl, smtplib

### Setup Email to Send Notification

In [4]:
# Send a plain-text "scraping done" report by e-mail through Gmail's SMTP-over-SSL endpoint.

# Human-readable local timestamp taken when this cell starts running.
time_start = time.asctime( time.localtime(time.time()) )

# Sender account, its SMTP password and the recipient of the report.
# NOTE(review): the password is a literal in the notebook; "<generated>" looks
# like a placeholder - confirm how the real credential is meant to be supplied.
email_sender = "18221096@std.stei.itb.ac.id"
email_password = "<generated>"
email_receiver = "fikrinaufalh9@gmail.com"

# NOTE(review): this is captured right after time_start, with no scraping in
# between in this cell, so the two timestamps in the report will be nearly equal.
time_end = time.asctime( time.localtime(time.time()) )

# Subject line and body of the report; the body embeds both timestamps.
subject = "Web Scraping Report"
body = f"""
WEB SCRAPING DONE!!!
Time Start: {time_start}
Time End: {time_end}
"""

# Assemble the message headers and plain-text content.
email = EmailMessage()
email['From'] = email_sender
email['To'] = email_receiver
email['Subject'] = subject
email.set_content(body)

# Default SSL context used for the encrypted SMTP connection.
context = ssl.create_default_context()

# Connect to smtp.gmail.com on port 465, authenticate as the sender and send
# the message; the `with` block closes the connection afterwards.
with smtplib.SMTP_SSL("smtp.gmail.com", 465, context=context) as smtp:
    smtp.login(email_sender, email_password)
    smtp.sendmail(email_sender, email_receiver, email.as_string())

### Function Definition

#### Supporting Function

In [None]:
def to_isoformat(input):
    """Split an Indonesian-formatted timestamp into a date and a time.

    The text is expected to look like ``"05 Mei 2023, 14:30 WIB"``: day,
    abbreviated Indonesian month name, year, a comma, ``HH:MM`` and the
    literal suffix ``WIB``. Leading/trailing whitespace is ignored.

    Args:
        input: The timestamp text to parse.

    Returns:
        A two-element list ``[datetime.date, datetime.time]``.

    Raises:
        ValueError: If the text does not match the expected layout (raised
            by ``datetime.strptime``).
    """
    # Month abbreviation -> two-digit month number. August is accepted under
    # its common spellings ("Ags", "Agu", "Agt") so a different abbreviation
    # on the page does not abort the scrape.
    month_numbers = {
        "Jan": "01",
        "Feb": "02",
        "Mar": "03",
        "Apr": "04",
        "Mei": "05",
        "Jun": "06",
        "Jul": "07",
        "Ags": "08",
        "Agu": "08",
        "Agt": "08",
        "Sep": "09",
        "Okt": "10",
        "Nov": "11",
        "Des": "12",
    }

    # strptime's %b depends on the process locale, so the month name is
    # swapped for its number and parsed with %m instead.
    text = input.strip()
    for abbreviation, number in month_numbers.items():
        text = text.replace(abbreviation, number)

    parsed = datetime.strptime(text, "%d %m %Y, %H:%M WIB")

    # Local names deliberately avoid `time`, which is an imported module in
    # this notebook.
    return [parsed.date(), parsed.time()]

def to_email(name):
    """Derive a synthetic Gmail address from a display name.

    The name is lower-cased, trimmed of surrounding whitespace and stripped
    of every space character before ``@gmail.com`` is appended.
    """
    local_part = name.lower().strip().replace(" ", "")
    return f"{local_part}@gmail.com"

def csv_to_json(csvFilePath, jsonFilePath):
    """Convert a CSV file with a header row into a JSON array of objects.

    Args:
        csvFilePath: Path of the UTF-8 CSV file to read. The first row
            provides the keys of every generated object.
        jsonFilePath: Path of the JSON file to write (overwritten if it
            already exists).
    """
    # Read the CSV file. newline="" lets the csv module do its own newline
    # handling, so line breaks inside quoted fields are kept verbatim.
    with open(csvFilePath, encoding='utf-8', newline='') as csvf:
        # DictReader yields one dictionary per data row, keyed by the header.
        jsonArray = list(csv.DictReader(csvf))

    # Serialise the list of row dictionaries straight into the output file.
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        json.dump(jsonArray, jsonf, indent=4)

#### Extract Product Table Function

In [None]:
def extract_product(driver):
    """Read the specification grid of the currently open product page.

    Args:
        driver: Selenium WebDriver positioned on a product detail page.

    Returns:
        dict: One entry per tracked specification; a specification that is
        not found in the grid keeps the empty string.
    """
    tracked_fields = (
        "Brand",
        "Model No",
        "Series",
        "Gender",
        "Colour",
        "Luminous",
        "Calendar",
        "Water Resistance",
        "Movement",
        "Weight after packing",
        "Case Diameter",
        "Strap Material",
    )
    product_dict = dict.fromkeys(tracked_fields, "")

    grid = driver.find_element(By.CSS_SELECTOR, ".tab-content .grid")
    for item in grid.find_elements(By.CSS_SELECTOR, ".spec-item"):
        # Skip specification rows whose label is not one we track.
        label = item.find_element(By.CSS_SELECTOR, ".font-black").text
        if label not in product_dict:
            continue

        # The first ".leading-6" element holds the name, the second the value.
        cells = item.find_elements(By.CSS_SELECTOR, ".leading-6")
        field_name = cells[0].text.strip()
        field_value = cells[1].text.strip()
        product_dict[field_name] = field_value

    return product_dict

#### Extract Sales Table Function

In [None]:
def _parse_price(text):
    """Turn a price label into an int by dropping "Rp" and "." separators."""
    return int(text.replace("Rp", "").replace(".", "").strip())


def _parse_count(text):
    """Turn a counter label into an int; an "Rb" suffix multiplies by 1000."""
    if "Rb" in text:
        # Accept a decimal comma as well as a decimal point, and round rather
        # than truncate so float noise cannot drop the result by one.
        thousands = float(text.replace("Rb", "").replace(",", ".").strip())
        return int(round(thousands * 1000))
    return int(text)


def _online_stock_status(driver):
    """Classify online stock from the sold-out badge or the stock label."""
    try:
        badge_text = driver.find_element(By.CSS_SELECTOR, ".badge.bg-accent-red").text.strip()
    except Exception:
        # Best effort: a missing/unreadable badge just means "not sold out".
        badge_text = ""
    if "habis" in badge_text:
        return "Not Available"

    # No sold-out badge: fall back to the label that follows the stepper.
    stock_label = driver.find_element(By.CSS_SELECTOR, ".stepper-wrapper + div").text.strip()
    if stock_label == "STOK ONLINE < 5 PCS":
        return "Low (< 5 PCS)"
    if stock_label == "STOK ONLINE > 5 PCS":
        return "High (>= 5 PCS)"
    return "Unknown"


def _offline_stock_status(driver):
    """Classify offline stock from the "not available" text or store list."""
    try:
        notice = driver.find_element(By.CSS_SELECTOR, "picture.mr-2 + div").text.strip()
    except Exception:
        # Best effort: no notice element means we must look at the stores.
        notice = ""
    if "Tidak tersedia" in notice:
        return "Not Available"

    # The presence of a first store entry is treated as "available".
    try:
        driver.find_element(By.CSS_SELECTOR, "div[data-testid='store-item-0']")
    except Exception:
        return "Unknown"
    return "Available"


def extract_sales(driver):
    """Collect pricing, popularity and stock data from the open product page.

    Args:
        driver: Selenium WebDriver positioned on a product detail page.

    Returns:
        dict: Keys "Product Name", "Brand", "Model No", "Normal Price",
        "Discounted Price", "Discount Percentage", "Number of Seen",
        "Number of Sold", "Offline Stock Status" and "Online Stock Status".

    Raises:
        Exception: Selenium lookup errors for mandatory elements (title,
            current price, counters, online stock label) propagate.
        ValueError: If a price or counter label is not numeric.
    """
    sales_dict = {
        "Product Name" : "",
        "Brand" : "",
        "Model No" : "",
        "Normal Price" : "",
        "Discounted Price" : "",
        "Discount Percentage" : "",
        "Number of Seen" : "",
        "Number of Sold" : "",
        "Offline Stock Status" : "",
        "Online Stock Status" : "",
    }

    # Brand and model number come from the specification grid.
    spec_grid = driver.find_element(By.CSS_SELECTOR, ".tab-content .grid")
    for spec in spec_grid.find_elements(By.CSS_SELECTOR, ".spec-item"):
        if spec.find_element(By.CSS_SELECTOR, ".font-black").text in ("Brand", "Model No"):
            data = spec.find_elements(By.CSS_SELECTOR, ".leading-6")
            key = data[0].text.strip()
            value = data[1].text.strip()
            sales_dict[key] = value

    sales_dict["Product Name"] = driver.find_element(By.TAG_NAME, "h1").text.strip()

    # The struck-through price only exists for discounted products; without
    # it the current price doubles as the normal price.
    price_scope = "div[data-testid='test-product-info']"
    discounted_price = driver.find_element(By.CSS_SELECTOR, price_scope + " .text-xl").text.strip()
    try:
        normal_price = driver.find_element(By.CSS_SELECTOR, price_scope + " .line-through").text.strip()
    except Exception:
        normal_price = discounted_price
    sales_dict["Normal Price"] = _parse_price(normal_price)
    sales_dict["Discounted Price"] = _parse_price(discounted_price)

    # Guard against a zero normal price instead of raising ZeroDivisionError.
    if sales_dict["Normal Price"]:
        discount_percentage = (sales_dict["Normal Price"] - sales_dict["Discounted Price"]) / sales_dict["Normal Price"]
    else:
        discount_percentage = 0.0
    sales_dict["Discount Percentage"] = round(discount_percentage * 100, 4)

    num_seen = driver.find_element(By.CSS_SELECTOR, ".ic-eye + div > .text-sm").text.strip()
    sales_dict["Number of Seen"] = _parse_count(num_seen)

    num_sold = driver.find_element(By.CSS_SELECTOR, ".ic-cart.mr-1 + div > .text-sm").text.strip()
    sales_dict["Number of Sold"] = _parse_count(num_sold)

    sales_dict["Online Stock Status"] = _online_stock_status(driver)
    sales_dict["Offline Stock Status"] = _offline_stock_status(driver)

    return sales_dict

#### Extract Customer and Review Table Function

In [None]:
def _strip_label(text, label):
    """Remove a leading ``label`` and the whitespace after it from ``text``."""
    # str.lstrip(label) would strip a *set of characters*, eating the start
    # of the review itself, so the prefix is cut off explicitly.
    if text.startswith(label):
        return text[len(label):].lstrip()
    return text


def extract_customer_review(driver):
    """Collect up to five reviews (and their authors) of the open product.

    Walks the review pagination of the current product page, skipping
    reviewers whose name is masked with ``*`` characters.

    Args:
        driver: Selenium WebDriver positioned on a product detail page.

    Returns:
        list: ``[customer_list, review_list]``. Each customer is a dict with
        keys "Email", "Name", "Member Status"; each review is a dict with
        keys "ID Review", "Product Name", "Email", "Date", "Time", "Rating",
        "Delivery Review", "Product Review".

    Raises:
        Exception: Selenium lookup errors for mandatory elements propagate.
        ValueError: If a review timestamp cannot be parsed.
    """
    customer_list = []
    review_list = []

    MAX_REVIEW_PER_PRODUCT = 5
    div_review_pagination = driver.find_element(By.CSS_SELECTOR, ".pb-14")
    try:
        # The second-to-last pagination item is read as the last page number.
        max_tab_per_product = int(div_review_pagination.find_elements(By.TAG_NAME, "li")[-2].text)
    except Exception:
        # No usable pagination: treat the reviews as a single page.
        max_tab_per_product = 1

    review_count = 0
    tab_count = 0
    while (review_count < MAX_REVIEW_PER_PRODUCT and tab_count < max_tab_per_product):
        # Give the review list time to render after a page switch.
        time.sleep(1)
        div_review_pagination = driver.find_element(By.CSS_SELECTOR, ".pb-14")
        # The (up to) three siblings just before the pagination block are
        # taken as the review containers of the current page.
        review_divs = div_review_pagination.find_elements(By.XPATH, "preceding-sibling::*[position() <= 3]")

        for review in review_divs:
            # Stop before doing any scraping work once the cap is reached.
            if review_count >= MAX_REVIEW_PER_PRODUCT:
                break

            reviewer_name = review.find_element(By.CSS_SELECTOR, ".mb-1 .text-base").text
            # Masked (anonymous) names look like "a***b"; the length guard
            # keeps very short names from raising IndexError.
            if len(reviewer_name) >= 2 and reviewer_name[1] == '*' and reviewer_name[-2] == '*':
                continue

            product_name = driver.find_element(By.CSS_SELECTOR, "h1").text

            rating = len(review.find_elements(By.CSS_SELECTOR, ".rating .ic-star-fill"))
            date_review, time_review = to_isoformat(review.find_element(By.CSS_SELECTOR, "span.block.text-xxs").text)

            delivery_review = ""
            product_review = ""
            for par in review.find_elements(By.CSS_SELECTOR, "p"):
                title = par.find_element(By.CSS_SELECTOR, "span").text
                if title == "Pengiriman:":
                    delivery_review = _strip_label(par.text, "Pengiriman:")
                elif title == "Produk:":
                    product_review = _strip_label(par.text, "Produk:")
            if delivery_review == "": delivery_review = "Tidak ada review"
            if product_review == "": product_review = "Tidak ada review"

            # NOTE(review): this looks the badge up on the whole page, not
            # inside `review`, so every reviewer may get the first badge on
            # the page - confirm whether a review-scoped lookup was intended.
            member_status = driver.find_element(By.CSS_SELECTOR, ".badge.text-xxs").text

            review_count += 1
            reviewer_email = to_email(reviewer_name)

            customer_list.append({
                "Email" : reviewer_email,
                "Name" : reviewer_name,
                "Member Status" : member_status,
            })
            review_list.append({
                "ID Review" : review_count,
                "Product Name" : product_name,
                "Email" : reviewer_email,
                "Date" : date_review,
                "Time" : time_review,
                "Rating" : rating,
                "Delivery Review" : delivery_review,
                "Product Review" : product_review,
            })

        tab_count += 1
        # Nothing left to collect: skip the extra page load.
        if review_count >= MAX_REVIEW_PER_PRODUCT or tab_count >= max_tab_per_product:
            break

        # Open the next review page by clicking the button with its number.
        div_review_pagination = driver.find_element(By.CSS_SELECTOR, ".pb-14")
        for page_button in div_review_pagination.find_elements(By.TAG_NAME, "li"):
            if page_button.text == str(tab_count + 1):
                page_button.click()
                break

    return [customer_list, review_list]

### Main Program

In [None]:
# Main scraping run: open the watch category, list the brands from the filter
# sidebar, then visit every product of every brand and collect its data.

# define the driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# define the URL
URL = "https://www.jamtangan.com/c/jam-tangan"

# open the URL
driver.maximize_window()
driver.get(URL)

# close the pop-ups (best effort: a missing pop-up must not stop the run)
try:
    if (driver.find_element(By.ID, "driver-popover-item")):
        driver.find_element(By.CLASS_NAME, "driver-close-btn").click()

    time.sleep(3)
    if (driver.find_element(By.CSS_SELECTOR, "button.ng-binding")):
        driver.find_element(By.CSS_SELECTOR, "button.ng-binding").click()
except Exception:
    pass

# expand the brands list in filter sidebar; the toggle is re-located after
# every click and the loop ends once it no longer reads "LIHAT LAINNYA"
expand_brand = driver.find_elements(By.CSS_SELECTOR, ".accordion-content-wrapper .cursor-pointer")[1]
while(expand_brand.text == "LIHAT LAINNYA"):
    expand_brand.click()
    time.sleep(1)
    expand_brand = driver.find_elements(By.CSS_SELECTOR, ".accordion-content-wrapper .cursor-pointer")[1]

# get the brands name and url, dropping entries whose label contains one of
# the phrases in `eliminate`
brands_raw = driver.find_elements(By.CSS_SELECTOR, ".accordion-content-wrapper")[1].find_elements(By.TAG_NAME, "li")
eliminate = ['Semua Brand', 'Band', 'Strap', 'Bracelet', 'Accessories', 'Jewelry', 'Wallets']
brands = []
for brand in brands_raw:
    # Read the label once per brand instead of once per phrase.
    brand_label = brand.find_element(By.TAG_NAME, "label").text
    if not any(phrase in brand_label for phrase in eliminate):
        brands.append(brand)

# Names and URLs are read now, before navigating away invalidates the elements.
brands_name = [brand.find_element(By.TAG_NAME, "label").text for brand in brands]
brands_url = [brand.find_element(By.TAG_NAME, "a").get_attribute("href") for brand in brands]
num_brands = len(brands)

print("Number of brands: ", num_brands)

# One entry per visited product in each list.
product = []
sales = []
customer = []
review = []

for brand_url in brands_url:
    driver.get(brand_url)
    time.sleep(1)
    # find_elements returns a list, so the href has to be read per element;
    # the URLs are collected before leaving the listing page.
    product_cards = driver.find_elements(By.CSS_SELECTOR, "a[data-testid='product-card-test']")
    products_url = [card.get_attribute("href") for card in product_cards]
    for product_url in products_url:
        driver.get(product_url)
        product.append(extract_product(driver))
        sales.append(extract_sales(driver))
        # Scrape the reviews once: a second call would start from whatever
        # review page the first call left open.
        product_customers, product_reviews = extract_customer_review(driver)
        customer.append(product_customers)
        review.append(product_reviews)

    time.sleep(1)

driver.close()

# Approximating the number of pages to scrape

In [None]:
# Get the number of listing pages for each brand.
# NOTE(review): this cell needs a live `driver`; the main-program cell ends
# with driver.close(), so confirm the intended execution order.
pages = []
for brand_name, brand_url in zip(brands_name, brands_url):
    driver.get(brand_url)
    time.sleep(1)
    try:
        page = driver.find_element(By.CSS_SELECTOR, ".qa-product-list-pagination").find_elements(By.TAG_NAME, "li")
        # Two of the pagination items are not counted as pages
        # (presumably the previous/next arrows - verify against the site).
        num_page = len(page) - 2
    except Exception:
        # No pagination bar found: the brand is counted as a single page.
        # `except Exception` keeps Ctrl-C / kernel interrupt working.
        num_page = 1
    pages.append(num_page)
    print(f"Number of page for {brand_name}: ", num_page)

# print("Number of pages: ", sum(pages))
# print("Average number of page: ", np.mean(pages))
# print("Max number of page: ", np.max(pages))
# print("Min number of page: ", np.min(pages))

In [56]:
# Estimate the scraping workload when at most 3 listing pages per brand are visited.
print(pages)

# Clamp every brand's page count to 3.
cut = [3 if page > 3 else page for page in pages]
print(cut)
print(f"Sum = {sum(cut)}")
print(f"Mean = {np.mean(cut)}")

# freq[k] counts the brands that end up with k + 1 pages.
freq = [0, 0, 0]
for page in cut:
    freq[page - 1] += 1

for page_number, brand_count in enumerate(freq, start=1):
    print(f"Page {page_number} = {brand_count}")

# Each listing page is weighted by 40 (presumably products per page - verify).
max_page = sum(brand_count * 40 * page_number for page_number, brand_count in enumerate(freq, start=1))
print(f"Max page to scrap = {max_page}")

[2, 1, 7, 7, 5, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 4, 1, 7, 4, 2, 7, 5, 4, 1, 1, 1, 1, 7, 7, 1, 1, 1, 7, 2, 1, 1, 1, 1, 4, 1, 1, 7, 1, 1, 1, 2, 1, 1, 5, 2, 1, 1, 1, 3, 7, 4, 6, 7, 6, 3, 1, 1, 1, 2, 4, 1, 2, 1, 7, 1, 1, 7, 3, 3, 4, 2, 7, 7, 1, 4, 1, 1, 1, 1]
[2, 1, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, 1, 3, 3, 2, 3, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 3, 2, 1, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 2, 3, 1, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 3, 3, 1, 3, 1, 1, 1, 1]
Sum = 157
Mean = 1.8470588235294119
Page 1 = 45
Page 2 = 8
Page 3 = 32
Max page to scrap = 6280


In [58]:
# Estimate the scraping workload when only the first listing page per brand is visited.
print(pages)

# Clamp every brand's page count to 1.
cut = [1 if page > 1 else page for page in pages]
print(cut)
print(f"Sum = {sum(cut)}")
print(f"Mean = {np.mean(cut)}")

# freq[k] counts the brands that end up with k + 1 pages.
freq = [0, 0, 0]
for page in cut:
    freq[page - 1] += 1

for page_number, brand_count in enumerate(freq, start=1):
    print(f"Page {page_number} = {brand_count}")

# Each listing page is weighted by 40 (presumably products per page - verify).
max_page = sum(brand_count * 40 * page_number for page_number, brand_count in enumerate(freq, start=1))
print(f"Max page to scrap = {max_page}")

[2, 1, 7, 7, 5, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 4, 1, 7, 4, 2, 7, 5, 4, 1, 1, 1, 1, 7, 7, 1, 1, 1, 7, 2, 1, 1, 1, 1, 4, 1, 1, 7, 1, 1, 1, 2, 1, 1, 5, 2, 1, 1, 1, 3, 7, 4, 6, 7, 6, 3, 1, 1, 1, 2, 4, 1, 2, 1, 7, 1, 1, 7, 3, 3, 4, 2, 7, 7, 1, 4, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Sum = 85
Mean = 1.0
Page 1 = 85
Page 2 = 0
Page 3 = 0
Max page to scrap = 3400
