# **Shopee Search and Review Webscraper**

**Author:** Zachary Tang <br>
**Date published:** 13/1/21 <br>
<br>
**Contact information:**
- Email: zacharytangjiaying@gmail.com
- [Github](https://github.com/ZacharyTangJiaYing) 
- [Linkedin](https://www.linkedin.com/in/zacharytang/)

--- 


### **User Guide**

This script scrapes user-specified search results from Shopee's website. A few options require the user's input: 

1. **Filepath** - specify the filepath where you want to save the scraped results. The results will be saved in .csv format.
2. **Country** - choose which Shopee website to scrape based on country.
3. **Search term** - user specified search term 
4. **Pages to scrape** - how many pages of search results to scrape.
5. **Scrape within SKU** - option to loop through each listing and pull additional information like seller information, stock information and customer reviews. <span style="color:red">**WARNING:** Might take a long time to finish scraping!</span>

To start scraping, clear the kernel and run all cells.

### **Library**

In [None]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import numpy as np
import time

### **Settings**

In [None]:
# Read and validate every user setting BEFORE launching the browser, so that a
# typo does not leave an orphaned Chrome window behind.

# Saved file directory
filepath = input('Filepath: ')

# Upper-cased and stripped so that e.g. "sg" or " y" are accepted; the
# scraping code compares these values against upper-case literals.
country = input('SG/MY/TH/ID: ').strip().upper()
search = input('Search term: ')
scrape_limit = int(input('Pages to scrape: '))  # raises ValueError on non-numeric input
scrape_within_SKU = input('Scrape within SKU? Y/N: ').strip().upper()

# Landing page of each supported Shopee storefront, keyed by country code.
shopee_sites = {
    'SG': 'https://shopee.sg',
    'MY': 'https://shopee.com.my',
    'TH': 'https://shopee.co.th',
    'ID': 'https://shopee.co.id',
}

if country not in shopee_sites:
    raise ValueError('Unsupported country {!r}; expected one of {}'.format(
        country, '/'.join(shopee_sites)))
if scrape_within_SKU not in ('Y', 'N'):
    raise ValueError('Scrape within SKU must be Y or N, got {!r}'.format(scrape_within_SKU))

# Settings for chrome driver
options = Options()

# options.add_argument('--headless')
options.add_argument("--no-sandbox")
options.add_argument("--disable-infobars")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-browser-side-navigation")
options.add_argument("--disable-gpu")
options.add_argument('--incognito')
options.add_argument("start-maximized")
options.add_argument("enable-automation")
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

driver.get(shopee_sites[country])

# Fixed pause before touching the page -- presumably to let the landing page
# finish loading; there is no explicit wait condition.
time.sleep(8)

# Type the search term into the search bar and submit it.
driver.find_element(By.CLASS_NAME, 'shopee-searchbar-input__input').send_keys(search, Keys.ENTER)

### **Functions**

In [None]:
def render_page():
    """Scroll down the current page in steps so lazily rendered elements load.

    Uses the module-level ``driver``. Pauses two seconds after every step
    except the final one.
    """
    scroll_targets = (1000, 2000, 3000, 4000, 5000, 8000)
    *intermediate_targets, final_target = scroll_targets

    for target in intermediate_targets:
        driver.execute_script("window.scrollTo(0, {})".format(target))
        time.sleep(2)

    # No pause after the last jump: callers start reading the page right away.
    driver.execute_script("window.scrollTo(0, {})".format(final_target))
    
def convert(x):
    """Convert a count string such as ``"12"``, ``"1,234"`` or ``"1.2k"`` to a float.

    A trailing/embedded ``k`` (either case) means "thousands" and multiplies
    the number by 1000. Commas are treated as thousands separators and
    dropped -- NOTE(review): this assumes an English-style number format;
    confirm for storefronts that use "," as the decimal mark.

    Args:
        x: The text to convert.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = x.strip().lower().replace(',', '')
    if 'k' in text:
        return float(text.replace('k', '')) * 1000
    return float(text)

### **Execute**

In [None]:
# Both modes scrape the search-result pages in exactly the same way; the 'Y'
# mode additionally opens every listing to collect seller/stock details and
# customer reviews. The shared work lives in the helpers below.

# Columns collected from the search-result pages (order = CSV column order).
SEARCH_COLUMNS = ['product_name', 'urls', 'prices_lowest', 'prices_highest', 'discounts',
                  'sold_per_month', 'preferred', 'country_ship', 'ad']

# Extra columns collected from each listing page when scraping within SKU.
SKU_COLUMNS = ['num_sold', 'num_ratings', 'num_fav', 'num_stock', 'seller_name', 'seller_url',
               'seller_products', 'seller_ratings', 'seller_response', 'seller_response_time',
               'seller_joined', 'seller_follower', 'product_cat', 'product_desc']

# Each review page shows 6 reviews.
REVIEWS_PER_PAGE = 6

# Selenium errors that mean "this part of the page could not be scraped".
SCRAPE_ERRORS = (NoSuchElementException, StaleElementReferenceException,
                 ElementClickInterceptedException)

# Container of all listing cards, and the inner <div> of the N-th card's link.
ITEMS_XPATH = '//div[@class="row shopee-search-item-result__items"]'
CARD_XPATH = ITEMS_XPATH + '/div[{}]/div/a/div'


def _text_or_nan(xpath):
    """Return the text of the first element matching ``xpath``, or NaN if none exists."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return np.nan


def _price_or_nan(price_spans, index):
    """Return the price held by ``price_spans[index]`` as a float, or NaN.

    NaN is returned when the span does not exist (no price range) or when its
    text is not a number once thousands separators are removed.
    """
    try:
        return float(price_spans[index].text.replace(',', ''))
    except (IndexError, ValueError):
        return np.nan


def _scrape_search_page(results):
    """Append every listing on the current search page to the lists in ``results``."""
    # NOTE(review): these three fields are collected page-wide, so the columns
    # only line up if every card has a name, a sold count and a link -- verify.
    results['product_name'].extend(
        [e.text for e in driver.find_elements(By.XPATH, '//div[@class="_1NoI8_ _2xHE6C _1co5xN"]')])
    results['sold_per_month'].extend(
        [e.text for e in driver.find_elements(By.XPATH, '//div[@class="_245-SC"]')])
    results['urls'].extend(
        [e.get_attribute('href') for e in driver.find_elements(
            By.XPATH, '//div[@class="col-xs-2-4 shopee-search-item-result__item"]/div/a')])

    number_of_products_in_page = len(driver.find_elements(By.XPATH, ITEMS_XPATH + '/div'))

    # Not all listings have the same structure, so the optional fields are
    # looked up card by card and recorded as NaN when missing.
    for position in range(1, number_of_products_in_page + 1):
        card = CARD_XPATH.format(position)

        results['discounts'].append(_text_or_nan(card + '/div[1]/div[2]/div/div/span[1]'))

        # Spans 1 and 3 hold the lowest and highest price of a price range.
        price_spans = driver.find_elements(
            By.XPATH, card + '/div[2]/div[2]/div[@class="_1w9jLI _1DGuEV _2ZYSiu"]/span')
        results['prices_lowest'].append(_price_or_nan(price_spans, 1))
        results['prices_highest'].append(_price_or_nan(price_spans, 3))

        results['preferred'].append(
            _text_or_nan(card + '/div[1]/div[1]/div/span[@class="_1DeDTg"]'))
        results['ad'].append(
            _text_or_nan(card + '/div[1]/div[@class="_1e9igF"]/div[@class="_2uCaTM"]'))

        # An empty shipping tag is recorded as the storefront's own country.
        ship_from = _text_or_nan(card + '/div[2]/div[@class="_41f1_p"]')
        results['country_ship'].append(country if ship_from == '' else ship_from)


def _scrape_search_results():
    """Sort the search by top sales and scrape up to ``scrape_limit`` result pages.

    Returns:
        A dict mapping each name in SEARCH_COLUMNS to its list of values.
    """
    time.sleep(4)
    driver.find_element(
        By.XPATH, '//div[@class="shopee-sort-by-options"]/div[text()="Top Sales"]').click()
    time.sleep(4)

    total_pages = int(driver.find_element(
        By.XPATH, '//span[@class="shopee-mini-page-controller__total"]').text)
    limits = min(scrape_limit, total_pages)  # never go past the last result page

    results = {column: [] for column in SEARCH_COLUMNS}

    for page_number in range(1, limits + 1):
        render_page()
        try:
            _scrape_search_page(results)
            if page_number < limits:  # nothing left to load after the last wanted page
                driver.find_element(
                    By.XPATH,
                    '//button[@class="shopee-button-outline shopee-mini-page-controller__next-btn"]'
                ).click()
                time.sleep(3)
        except SCRAPE_ERRORS:
            break  # keep what was gathered so far

    return results


def _count_review_pages():
    """Return how many review pages the current listing should have."""
    filters = driver.find_elements(By.CLASS_NAME, 'product-rating-overview__filter')[1:6]
    try:
        # text[8:-1] presumably strips a fixed-width label and the closing
        # bracket around the count -- TODO confirm against the live page.
        number_of_ratings = sum(convert(f.text[8:-1]) for f in filters)
    except ValueError:
        return 1  # count unreadable: still try the first page
    return int(np.ceil(number_of_ratings / REVIEWS_PER_PAGE))


def _scrape_sku_info():
    """Collect the SKU_COLUMNS values from the current listing page.

    Returns:
        A dict with one entry per SKU column. If scraping fails part-way, the
        fields not reached stay NaN so the row still lines up with its listing.
    """
    info = dict.fromkeys(SKU_COLUMNS, np.nan)
    seller = '//div[@class="_3mK1I2"]'

    try:
        info['num_sold'] = convert(driver.find_element(By.CLASS_NAME, '_22sp0A').text)

        try:
            info['num_ratings'] = convert(driver.find_element(
                By.XPATH, '//div[@class="flex _32fuIU"]/div[2]/div[@class="_3Oj5_n"]').text)
        except NoSuchElementException:
            info['num_ratings'] = 0

        try:
            info['num_fav'] = driver.find_element(
                By.XPATH, '//div[@class="flex items-center _25DJo1"]/div').text
        except NoSuchElementException:
            info['num_fav'] = 0

        info['num_stock'] = driver.find_element(
            By.CSS_SELECTOR,
            '#main > div > div._1Bj1VS > div.page-product > '
            'div.container > div.product-briefing.flex.card._2cRTS4 > '
            'div.flex.flex-auto.k-mj2F > div > div._3DepLY > div > '
            'div.flex._3dRJGI._3a2wD- > div > div > div.flex.items-center > '
            'div:nth-child(2)').text

        info['seller_name'] = driver.find_element(By.CLASS_NAME, '_3Lybjn').text
        info['seller_url'] = driver.find_element(By.CLASS_NAME, '_136nGn').get_attribute('href')
        info['seller_ratings'] = convert(
            driver.find_element(By.XPATH, seller + '/div[1]/div[1]/span').text)
        info['seller_products'] = convert(
            driver.find_element(By.XPATH, seller + '/div[1]/a/span').text)
        info['seller_response'] = driver.find_element(
            By.XPATH, seller + '/div[2]/div[1]/span').text
        info['seller_response_time'] = driver.find_element(
            By.XPATH, seller + '/div[2]/div[2]/span').text
        info['seller_joined'] = driver.find_element(
            By.XPATH, seller + '/div[3]/div[1]/span').text
        info['seller_follower'] = convert(
            driver.find_element(By.XPATH, seller + '/div[3]/div[2]/span').text)
        info['product_cat'] = " > ".join(
            [c.text for c in driver.find_elements(By.XPATH, '//a[@class="JFOy4z _20XOUy"]')])
        info['product_desc'] = driver.find_element(By.CLASS_NAME, '_2u0jt9').text

    except SCRAPE_ERRORS + (ValueError,):  # ValueError: convert() got non-numeric text
        print("Error scraping SKU info")

    return info


def _scrape_reviews(comment_pages):
    """Scrape up to ``comment_pages`` pages of reviews from the current listing.

    Returns:
        A DataFrame with the columns date, username, comment and variation.
    """
    date_list, username_list, comment_list, variation_list = [], [], [], []

    for page in range(1, comment_pages + 1):
        try:
            # Read the whole page first so a failure cannot leave the four
            # lists with different lengths.
            page_usernames = [e.text for e in driver.find_elements(
                By.CLASS_NAME, 'shopee-product-rating__author-name')]
            page_comments = [e.text for e in driver.find_elements(
                By.CLASS_NAME, 'shopee-product-rating__content')]
            page_dates = [e.text for e in driver.find_elements(
                By.CLASS_NAME, 'shopee-product-rating__time')]
            page_variations = [e.text for e in driver.find_elements(
                By.CLASS_NAME, 'shopee-product-rating__variation')]
        except StaleElementReferenceException:
            break

        if not page_variations:
            # No variation shown on this page: pad with one NaN per review.
            page_variations = [np.nan] * len(page_dates)

        username_list.extend(page_usernames)
        comment_list.extend(page_comments)
        date_list.extend(page_dates)
        variation_list.extend(page_variations)

        if page == comment_pages:
            break

        try:
            # The last button of the page controller moves to the next page.
            driver.find_elements(
                By.XPATH,
                '//div[@class="shopee-page-controller product-ratings__page-controller"]/button'
            )[-1].click()
        except (StaleElementReferenceException, ElementClickInterceptedException, IndexError):
            # Cannot advance; stopping avoids recording the same page twice.
            break
        time.sleep(3)

    return pd.DataFrame(list(zip(date_list, username_list, comment_list, variation_list)),
                        columns=["date", "username", "comment", "variation"])


def _scrape_sku(url):
    """Open one listing, save its reviews to their own CSV and return its SKU info dict."""
    driver.get(url)
    time.sleep(4)
    render_page()

    # each page has 6 reviews
    comment_pages = _count_review_pages()

    # The listing title names the review file.
    try:
        filename = driver.find_element(By.XPATH, '//div[@class="qaNIZv"]/span').text
    except NoSuchElementException:
        filename = None

    info = _scrape_sku_info()

    if filename is None:
        print("Listing title not found; reviews not saved for {}".format(url))
        return info

    # scrape reviews that will be saved in a separate csv
    reviews = _scrape_reviews(comment_pages)
    reviews.to_csv('{0}/{1}.csv'.format(filepath, filename.replace('/', '')),
                   index=False, encoding="utf-8_sig")
    print('File for {0} saved! \n'.format(filename.replace("/", '')))

    return info


if scrape_within_SKU in ('Y', 'N'):

    results = _scrape_search_results()
    columns = list(SEARCH_COLUMNS)
    output_name = 'search_scraped.csv'

    if scrape_within_SKU == 'Y':
        for column in SKU_COLUMNS:
            results[column] = []

        # Every url yields exactly one value per SKU column (NaN on failure),
        # so the extra columns stay aligned with the search results.
        for url in results['urls']:
            sku_info = _scrape_sku(url)
            for column in SKU_COLUMNS:
                results[column].append(sku_info[column])

        columns += SKU_COLUMNS
        output_name = 'scraped.csv'

    # save the results
    df = pd.DataFrame(list(zip(*(results[column] for column in columns))), columns=columns)

    df.to_csv('{0}/{1}'.format(filepath, output_name), index=False, encoding="utf-8_sig")
    print('Search results for "{}" has been saved! \n'.format(search))