# **Shopee SKU Scraper**

**Author:** Zachary Tang <br>
**Date published:** 9/1/21 <br>
<br>
**Contact information:**
- Email: zacharytangjiaying@gmail.com
- [GitHub](https://github.com/ZacharyTangJiaYing)
- [LinkedIn](https://www.linkedin.com/in/zacharytang/)

--- 


### **Library**

In [None]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import time
import json
import time
import pandas as pd
import requests
import datetime 

### **Settings**

In [None]:
# Configure and launch the Chrome session used for scraping.
options = Options()

# Uncomment to run Chrome without a visible window.
# options.add_argument('--headless')

# Command-line switches handed to Chrome, applied in this order.
chrome_flags = (
    "--no-sandbox",
    "--disable-infobars",
    "--disable-dev-shm-usage",
    "--disable-browser-side-navigation",
    "--disable-gpu",
    '--incognito',
    "start-maximized",
    "enable-automation",
)
for flag in chrome_flags:
    options.add_argument(flag)

# The value returned by ChromeDriverManager().install() is passed to
# webdriver.Chrome as its first positional argument.
driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(driver_path, options=options)

# Directory the output file is written to (used when saving the CSV).
filepath = input('Filepath: ')

In [None]:
# Ask for the product page to scrape and open it in the browser.
# Example:
# https://shopee.sg/Kraft-Mailer-Boxes-Courier-Box-Corrugated-Cardboard-Box-5-Sizes-XS-S-M-L-XL-Bundle-of-10-20pcs-i.264522724.7334949205
url = input('Target URL: ')
driver.get(url)

# Fixed pause (seconds) after navigation, before any scrolling begins.
initial_load_wait = 5
time.sleep(initial_load_wait)

### **Scraping Algorithm**

In [None]:
today = datetime.datetime.today().strftime("%Y%m%d")
start_time = time.time()  # reference point for the elapsed-time report

# Scroll down the page in steps, pausing after each one.
# Each entry is (vertical offset in pixels, pause in seconds).
# NOTE(review): presumably the pauses give lazily loaded sections (such as
# the ratings) time to render - confirm against the live page.
scroll_steps = (
    (1000, 2),
    (2000, 2),
    (3000, 2),
    (4000, 2),
    (5000, 5),
    (8000, 5),
)
for offset, pause in scroll_steps:
    driver.execute_script("window.scrollTo(0, {0})".format(offset))
    time.sleep(pause)

def convert(x):
    """Convert a rating-count label to a number.

    A trailing "k" multiplies the value by 1000, e.g. "3k" -> 3000.0 and
    "1.5k" -> 1500.0; plain numbers are converted as-is ("12" -> 12.0).
    The suffix is matched case-insensitively and surrounding whitespace is
    ignored, so "3K" and " 3k " are accepted as well.

    Args:
        x: The count as text, e.g. "3k" or "42".

    Returns:
        The count as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = x.strip().lower()
    if 'k' in text:
        return float(text.replace('k', '')) * 1000
    return float(text)

# --- Work out how many review pages to visit --------------------------------
REVIEWS_PER_PAGE = 6   # each review page shows up to 6 reviews
MAX_PAGE_RETRIES = 5   # consecutive failures tolerated on one page before giving up

# Entries [1:6] of the rating-overview filters are read as the per-star
# counts, and text[8:-1] as the count inside each label.
# NOTE(review): this assumes labels shaped like "5 Star (1.2k)" - confirm
# against the live page.
rating_list = [convert(i.text[8:-1]) for i in driver.find_elements_by_class_name('product-rating-overview__filter')[1:6]]
number_of_ratings = sum(rating_list)

# Ceiling division: a final, partially filled page still counts as a page.
# (The previous `% 6 < 6` test was always true, and together with the
# `counter < pages` loop condition the last partial page was never scraped.)
# Counts shown as "1.2k" are rounded, so this is an estimate for large totals.
comment_pages = (int(round(number_of_ratings)) + REVIEWS_PER_PAGE - 1) // REVIEWS_PER_PAGE

comment_page_counter = 1

# --- Containers for the scraped values --------------------------------------
filename = driver.find_element_by_xpath('//div[@class="qaNIZv"]/span').text
variation_list = []
date_list = []
username_list = []
comment_list = []

# CSS class of each review field, paired with the list that collects it.
page_fields = (
    ('shopee-product-rating__author-name', username_list),
    ('shopee-product-rating__content', comment_list),
    ('shopee-product-rating__time', date_list),
    ('shopee-product-rating__variation', variation_list),
)
# The last button of the pagination control is the one that gets clicked
# (presumably "next page" - verify against the page markup).
next_button_xpath = '//div[@class="shopee-page-controller product-ratings__page-controller"]/button'

# --- Scrape ------------------------------------------------------------------
page_collected = False  # True once the current page's fields are stored
failed_attempts = 0     # consecutive failures on the current page

while comment_page_counter <= comment_pages:
    try:
        if not page_collected:
            # Read all four fields first and store them only if every read
            # succeeded, so a retry can never append part of a page twice
            # and push the lists out of step with each other.
            page_texts = [
                [element.text for element in driver.find_elements_by_class_name(class_name)]
                for class_name, _ in page_fields
            ]
            for (_, target_list), texts in zip(page_fields, page_texts):
                target_list.extend(texts)
            page_collected = True

        # Only move on when there is another page left to read.
        if comment_page_counter < comment_pages:
            driver.find_elements_by_xpath(next_button_xpath)[-1].click()
            time.sleep(3)  # wait for the next page of reviews

    except (NoSuchElementException, StaleElementReferenceException,
            ElementClickInterceptedException, IndexError):
        # IndexError: no pagination button was found at all.
        failed_attempts += 1
        if failed_attempts >= MAX_PAGE_RETRIES:
            # Stop instead of retrying forever; keep what was collected.
            print('Stopped at review page {0} of {1} after {2} failed attempts.'.format(
                comment_page_counter, comment_pages, failed_attempts))
            break
        time.sleep(1)
        continue

    comment_page_counter += 1
    page_collected = False
    failed_attempts = 0

### **Save to File**

In [None]:
import os  # needed here only, to build the output path

# One row per review. zip() stops at the shortest list, so rows exist only
# up to the length of the shortest of the four scraped lists.
df = pd.DataFrame(list(zip(date_list, username_list, comment_list, variation_list)),
                  columns=["date", "username", "comment", "variation"])

# "date" is set to the day of the scrape. The scraped review timestamps used
# to be lost at this point; they are kept in an extra "review_date" column,
# appended last so the positions of the existing columns do not change.
review_dates = df['date'].copy()
df['date'] = pd.to_datetime(datetime.datetime.today().strftime("%Y%m%d"))
df['platform'] = 'Shopee'
df['review_date'] = review_dates

# os.path.join copes with a trailing separator and, for an empty filepath,
# writes to the working directory instead of the filesystem root.
output_path = os.path.join(filepath, 'scraped.csv')
df.to_csv(output_path, index=False, encoding="utf-8_sig")
print('File for {0} saved! \n'.format(filename))

# Report how long the run took, measured from start_time.
end_time = time.time()
elapsed_time = end_time - start_time

# Round the total first and split it afterwards. Rounding the seconds on
# their own could print "60.0 seconds" (e.g. for 59.999 s) together with a
# minutes value that was off, or even "-0.0".
minutes, seconds = divmod(round(elapsed_time, 2), 60)
minutes = int(minutes)       # whole minutes, printed without ".0"
seconds = round(seconds, 2)  # remove float residue left by divmod
print("Elapsed time: {0} minutes and {1} seconds!".format(str(minutes), str(seconds)))