<a href="https://colab.research.google.com/github/tramlam-ng/scraping_data_tiki_selenium/blob/main/Selenium_TIKI_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install resources

In [None]:
# install selenium and other resources for crawling data
# NOTE(review): selenium is installed without a version pin, so the API available
# to the cells below depends on when this cell is run — consider pinning.
!pip install selenium
# Refresh the apt package index before installing the browser driver package.
!apt-get update
# Install the chromium-chromedriver apt package
# (presumably provides the `chromedriver` binary that start_driver launches — TODO confirm).
!apt install chromium-chromedriver

In [None]:
### IMPORTS LIBRARIES ###
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Configuration for Driver and links

In [None]:
###############
### GLOBALS ###
###############

# Desktop-Chrome User-Agent header.
# NOTE(review): nothing in the visible notebook passes HEADER to the driver — confirm it is still needed.
HEADER = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.164 Safari/537.36'
    ),
}

# Site root
TIKI = 'https://tiki.vn'

# One entry per top-level category: its display name and the URL of its first listing page
MAIN_CATEGORIES = [
    dict(Name='Điện Thoại - Máy Tính Bảng',
         URL='https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner'),
    dict(Name='Điện Tử - Điện Lạnh',
         URL='https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner'),
    dict(Name='Phụ Kiện - Thiết Bị Số',
         URL='https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner'),
    dict(Name='Laptop - Thiết bị IT',
         URL='https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner'),
    dict(Name='Máy Ảnh - Quay Phim',
         URL='https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner'),
    dict(Name='Điện Gia Dụng',
         URL='https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner'),
    dict(Name='Nhà Cửa Đời Sống',
         URL='https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner'),
    dict(Name='Hàng Tiêu Dùng - Thực Phẩm',
         URL='https://tiki.vn/bach-hoa-online/c4384?src=c.4384.hamburger_menu_fly_out_banner'),
    dict(Name='Đồ chơi, Mẹ & Bé',
         URL='https://tiki.vn/me-va-be/c2549?src=c.2549.hamburger_menu_fly_out_banner'),
    dict(Name='Làm Đẹp - Sức Khỏe',
         URL='https://tiki.vn/lam-dep-suc-khoe/c1520?src=c.1520.hamburger_menu_fly_out_banner'),
    dict(Name='Thể Thao - Dã Ngoại',
         URL='https://tiki.vn/the-thao/c1975?src=c.1975.hamburger_menu_fly_out_banner'),
    dict(Name='Xe Máy, Ô tô, Xe Đạp',
         URL='https://tiki.vn/o-to-xe-may-xe-dap/c8594?src=c.8594.hamburger_menu_fly_out_banner'),
    dict(Name='Hàng quốc tế',
         URL='https://tiki.vn/hang-quoc-te/c17166?src=c.17166.hamburger_menu_fly_out_banner'),
    dict(Name='Sách, VPP & Quà Tặng',
         URL='https://tiki.vn/nha-sach-tiki/c8322?src=c.8322.hamburger_menu_fly_out_banner'),
    dict(Name='Voucher - Dịch Vụ - Thẻ Cào',
         URL='https://tiki.vn/voucher-dich-vu/c11312?src=c.11312.hamburger_menu_fly_out_banner'),
]

# The single shared webdriver instance; None means "no driver running".
# Assigned by start_driver() and reset by close_driver().
DRIVER = None

# Functions to Start and Close Driver

In [None]:
# Function to (re)start driver
def start_driver(force_restart=False):
    """Create a headless Chrome session and store it in the global DRIVER.

    Args:
        force_restart: (bool) when True, an already-active driver is shut down
            and replaced. When False (default), an active driver is an error.
    Raises:
        RuntimeError: if a driver is already active and force_restart is False.
    """
    global DRIVER

    if DRIVER is not None:
        if not force_restart:
            raise RuntimeError('ERROR: cannot overwrite an active driver. Please close the driver before restarting.')
        try:
            # quit() ends the whole session (browser and chromedriver process);
            # close() would only close the current window.
            DRIVER.quit()
        finally:
            # Never keep a reference to a driver we tried to shut down.
            DRIVER = None

    # Setting up the driver
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run in the background, without opening a browser window
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # The driver binary is resolved from PATH. Passing its name positionally is
    # not accepted by current Selenium 4 releases, so only `options` is given.
    DRIVER = webdriver.Chrome(options=options)

# Wrapper to close driver if it was created
def close_driver():
    """Shut down the global DRIVER (if any) and reset it to None.

    Safe to call when no driver is running: it then does nothing.
    """
    global DRIVER
    if DRIVER is None:
        return
    try:
        # quit() ends the whole session (browser and chromedriver process);
        # close() would only close the current window.
        DRIVER.quit()
    finally:
        # Reset even if shutdown raised, so start_driver() is not blocked by a dead driver.
        DRIVER = None

In [None]:
# Shut down any driver left over from an earlier run so the next cell can start a fresh one
# (close_driver() does nothing when DRIVER is None).
close_driver()

In [None]:
# Launch the shared Chrome session used by the following cells
# (raises RuntimeError if a driver is already active, because force_restart is left at False).
start_driver()

In [None]:
# First-page URL of every main category, in MAIN_CATEGORIES order
data_url = [category.get('URL') for category in MAIN_CATEGORIES]


In [None]:
# Open the first category's listing page in the shared driver.
DRIVER.get(data_url[0])

In [None]:
# get the first product element of the page currently loaded in the driver
# (find_element raises NoSuchElementException when the page has no 'product-item').
# The find_element_by_* helpers were removed in Selenium 4.3, so a By locator is used.
product_element = DRIVER.find_element(By.CLASS_NAME, 'product-item')

# Function to get info from one product

In [None]:
#################
### FUNCTIONS ###
#################


# Helper: does the child with the given class hold exactly one <img>?
def _has_single_image(product_element, class_name):
    """Return True/False depending on whether the first child with `class_name`
    contains exactly one <img>. Returns '' when no such child exists."""
    try:
        container = product_element.find_element(By.CLASS_NAME, class_name)
    except NoSuchElementException:
        return ''
    return len(container.find_elements(By.TAG_NAME, 'img')) == 1


# Function to extract product info from the product
def get_product_info_single(product_element):
    """Extract the displayed info of one product tile.

    Args:
        product_element: Selenium element of one 'product-item' tile.
    Returns:
        dict with keys 'name', 'price', 'product_url', 'image', 'tiki_fast',
        'best_deal', 'installment', 'discount_percentage'.
        A field that cannot be found keeps the default '' (price becomes -1).
    """
    d = {'name':'',
         'price':'',
         'product_url':'',
         'image':'',
         'tiki_fast':'',
         'best_deal':'',
         'installment':'',
         'discount_percentage':''}

    # name: text of the <span> inside the element with class 'name'
    try:
        name_tmp = product_element.find_element(By.CLASS_NAME, 'name')
        d['name'] = name_tmp.find_element(By.TAG_NAME, 'span').get_attribute('innerHTML')
    except NoSuchElementException:
        pass

    # price: strip the currency sign and the '.' thousands separators, then parse.
    # -1 marks a price that is missing or cannot be parsed as an integer.
    try:
        price_tmp = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
        d['price'] = int((price_tmp or '').strip(' ₫').replace('.', ''))
    except (NoSuchElementException, ValueError):
        d['price'] = -1

    # discount label, kept as displayed text
    try:
        d['discount_percentage'] = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
    except NoSuchElementException:
        pass

    # link: get_attribute returns None (it does not raise) when the attribute is absent
    d['product_url'] = product_element.get_attribute('href') or ''

    # thumbnail: a single lookup feeds both 'image' and 'tiki_fast'
    try:
        thumb_imgs = product_element.find_element(By.CLASS_NAME, 'thumbnail').find_elements(By.TAG_NAME, 'img')
    except NoSuchElementException:
        thumb_imgs = None

    if thumb_imgs is not None:
        # Exactly two images is treated as "has the extra TikiFast image".
        d['tiki_fast'] = len(thumb_imgs) == 2
        if len(thumb_imgs) == 1:
            d['image'] = thumb_imgs[0].get_attribute('src')
        elif len(thumb_imgs) > 1:
            # With more than one image the product picture is taken from the second one.
            d['image'] = thumb_imgs[1].get_attribute('src')
        # No <img> at all: leave 'image' as '' instead of raising IndexError.

    # get best deal
    d['best_deal'] = _has_single_image(product_element, 'badge-under-price')

    # get installment
    d['installment'] = _has_single_image(product_element, 'badge-benefits')

    return d


In [None]:
# Quick check: extract the info dict of the single product element selected earlier
# (the first 'product-item' on the page loaded in the driver).
get_product_info_single(product_element)

{'best_deal': True,
 'discount_percentage': '-39%',
 'image': 'https://salt.tikicdn.com/cache/280x280/ts/product/43/f8/2b/da41ce9d2b0039255777102a7bfc23fc.jpg',
 'installment': False,
 'name': 'Máy tính bảng Vankyo MatrixPad Z4 - Hàng Chính Hãng',
 'price': 2790000,
 'product_url': 'https://tka.tiki.vn/pixel?data=djAwMYTDad3iPxHotbssZDmUcLaPaY1K0VG05BO1Uquj1XHZZwEP4WWgKV4ZB7U2ODC6mP0fi25q1fzmowYHpoLp1i3Xq9Kvz8wgq55XwRHILXAYJ-nuLfUeRIxfG46Q-zM91GL-EbdGFgz1nssc-azbymQ9QTmx8SMWdp3hPh7w9eC-K8bJcJR5_q93Wmce3KzEhRH-K5dTdEQp7EMWAiuzw7WHMq-kLLtrBHrmQcvmkjoMEhmNvRsvz31IC8ptzBt-mXp8TIGwBGPVlgrBaP9qQUIiyXVvGAH0-kysSfHBIRdWi6dYYRxFemrvdqwS23YGEfIHTgxP8E24uNPhbwf6Fc9s8jJf4MAkHWwWJttwa42_EUCDCOahRm8XxtlS4X7xbHNJLrJJm1yOpqjOehW-b66AEZdZ6--s8vwR4DKl57VFuxD_4L4qOP_1Uw6kB_PcAvvVfJP6lsm94ptiMOGkFOeFlhCr2P400fqPAdS--5aCwySUzCtH6bKOuNdIomzlU6zmNAJBChurpTu5zZ6kcqxC9bl-jnxPUJWfHkDm8Vsz_pN9Zd6O3xs7EhgRnWzOXTXgXZgD1FJEIXV2Q4CFKqeuVkDxP0TbEJ7ilnrOz3QOVVuJQ934Innub7elRCN9CghzViv_dUf5N7lcUezIn1I9MmpeitRe9wydpj5Y1

# Function to scrape info of all products from a Page URL

In [None]:
# Function to scrape all products from ONE page
def get_product_info_from_page(page_url, wait_seconds=5):
    """ Extract info from all products of a specific page_url on Tiki website
        Args:
            page_url: (string) url of the page to scrape
            wait_seconds: (number) pause after loading the page and before
                reading it; defaults to the 5 seconds used originally.
        Returns:
            data: list of dictionary of products info. If no products shown, return empty list.
        Raises:
            RuntimeError: if no driver has been started.
    """
    if DRIVER is None:
        raise RuntimeError('ERROR: no active driver. Please call start_driver() before scraping.')

    DRIVER.get(page_url) # Use the driver to get info from the product page
    time.sleep(wait_seconds) ## Must have the sleep function

    # find_elements returns an empty list (it does not raise) when nothing matches.
    # The find_elements_by_* helpers were removed in Selenium 4.3, so a By locator is used.
    products_all = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
    print(f'Found {len(products_all)} products')

    return [get_product_info_single(product) for product in products_all]

In [None]:
# Test get_product_info_from_page on the first category link.
# NOTE(review): the cell output is the full list of product dicts for that page.
get_product_info_from_page(data_url[0])

Found 64 products


[{'best_deal': True,
  'discount_percentage': '-39%',
  'image': 'https://salt.tikicdn.com/cache/280x280/ts/product/43/f8/2b/da41ce9d2b0039255777102a7bfc23fc.jpg',
  'installment': False,
  'name': 'Máy tính bảng Vankyo MatrixPad Z4 - Hàng Chính Hãng',
  'price': 2790000,
  'product_url': 'https://tka.tiki.vn/pixel?data=djAwMTAOXf7wxv3lGppIHGGsbvpAgNLr1eE0v-U7k_DAOlbTVFD7naOwdIZqVy2yX55vzYJXXucC4J6-0GoVsRqNb5J2ldOXgXlvhda5fOCiDRhw3foDjNdklcIAfpGZHq_aAbkYbWurrWRAlAcrD_mVpXA8GWI4RoURdvIpCio1yCGMy05BTfC3m6OBh8u_BoXIXoWitKTMhXg3Ppzp938xdJxHQKEFor4qLUJnsY1kkUvU8QXZOhTcxp35jYI6Q8B5yxUBtI4L2h9i73xs-LoqOrOJWYHSaqamJdJthq8zf8oaukebbRhM1AsWDYxhAN1K_Rx_TW4N9Qaz3rBe-MvagTZgCdp9OQ7m5aFeKlWyzOJF4w10vu3aTTR3x2xV9EkVGfsdClN0PyrwEc7cttJpAT_PGPajTNjCXac-hy7NBIrk61ADI4D47WCiTOIHQq2i9M_Qd_qn_9KJs1n9All4HN48AAWBiEAQ7rMbov-Ll-vjCUrTZNa1z9X3f7fsNz1nNxNF8x64RNXQiN76uJATc9io0RXwDXuhBEZ9a18AAhz49TTl2L4PkdUPkzIxLvcfq-73D7-nElHXzz9lY6mXP3vsGwIBxUe2cYzysVEPrzLYcBrv6MlzJ1OtLkvx1FkJax7ofuxVS_wBvTE4vMqv1O6q-ZgCom4evI

In [None]:
# get page 2 of each category: insert 'page=2' as the first query parameter.
# partition() also copes with a URL that has no '?', where the old
# two-way unpacking of split('?') raised ValueError.
page2 = []
for first_page_url in data_url:
    base, sep, query = first_page_url.partition('?')
    page2.append(base + '?page=2' + ('&' + query if query else ''))

In [None]:
# Display the generated page-2 URLs to check their format.
page2

['https://tiki.vn/dien-thoai-may-tinh-bang/c1789?page=2&src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?page=2&src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?page=2&src=c.1815.hamburger_menu_fly_out_banner',
 'https://tiki.vn/laptop-may-vi-tinh/c1846?page=2&src=c.1846.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-anh/c1801?page=2&src=c.1801.hamburger_menu_fly_out_banner',
 'https://tiki.vn/dien-gia-dung/c1882?page=2&src=c.1882.hamburger_menu_fly_out_banner',
 'https://tiki.vn/nha-cua-doi-song/c1883?page=2&src=c.1883.hamburger_menu_fly_out_banner',
 'https://tiki.vn/bach-hoa-online/c4384?page=2&src=c.4384.hamburger_menu_fly_out_banner',
 'https://tiki.vn/me-va-be/c2549?page=2&src=c.2549.hamburger_menu_fly_out_banner',
 'https://tiki.vn/lam-dep-suc-khoe/c1520?page=2&src=c.1520.hamburger_menu_fly_out_banner',
 'https://tiki.vn/the-thao/c1975?page=2&src=c.1975.hamburger_menu_fly_out_banner',
 'h

In [None]:
# Pair every category's page-1 URL with its page-2 URL: [(page1_url, page2_url), ...]
pages = zip(data_url, page2)
pages_list = [*pages]  # materialise the one-shot zip iterator so it can be looped over repeatedly


In [None]:
######################
### START SCRAPING ###
######################

### SOLUTION 1: to get (num_max_page) pages from one category
start_driver(force_restart=True)
main_cat_url = MAIN_CATEGORIES[-1]['URL']
num_max_page = 2
prod_data = []

#CODE TO GET DATA
try:
    # Visit pages 1..num_max_page inclusive. The previous range(1, num_max_page)
    # with page=i+1 skipped page 1 and fetched only num_max_page - 1 pages.
    for page_number in range(1, num_max_page + 1):
        # Page 1 is the category URL itself; later pages add the page parameter.
        link = main_cat_url if page_number == 1 else main_cat_url + f'&page={page_number}'
        tmp = get_product_info_from_page(link)
        prod_data.extend(tmp)
finally:
    # Always release the browser, even when a page fails to scrape.
    close_driver()

print(prod_data)

In [None]:
######################
### START SCRAPING ###
######################

### SOLUTION 2: to get from n pages for all categories
start_driver(force_restart=True)


# get data
prod_data = []
try:
    for item in pages_list:  # to loop each category from 15 categories
        for link in item:       # to loop from page 1 to page 2 of each category
            tmp = get_product_info_from_page(link)
            prod_data.extend(tmp)
finally:
    # Always release the browser, even when a page fails to scrape.
    close_driver()

print(prod_data)

#SAVE DATA TO CSV FILE
# Fail with a clear message rather than an IndexError on prod_data[0] when nothing was scraped.
if not prod_data:
    raise RuntimeError('No products were scraped; nothing to save.')
df = pd.DataFrame(data=prod_data, columns=list(prod_data[0]))
df.to_csv('tiki_products.csv')


Found 64 products
Found 53 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 62 products
Found 51 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
Found 64 products
Found 61 products
[{'name': 'Máy tính bảng Vankyo MatrixPad Z4 - Hàng Chính Hãng', 'price': 2790000, 'product_url': 'https://tka.tiki.vn/pixel?data=djAwMZPt8CVF1I7fFpmxldBAEDnJ5lrbBTljZUeeCnM4JPVB1zFJGTSe3U0xaZdH0YiUBu0KnWnLiaq0Czk1SzRuCWhSgTKHIiyhvslQHJ8_GTD0-aERE08hhyEoTAp7phGvF7kteIwZmb-ROV7DuDjan6gLZkrQYa3PYVUdZGeTHCuCnfgEMByB5sHB6RPAdf7nQ-W6TxkCrT6fsmd4XfRSBuReIagAwFVysKXSSenfJjYMhfgF2qNudwwzUUBdiUEMqI27Ojk0xK4RAayjUEXgHgCKJUO7UDFmefKWlrAwEVrVtFEPxHW13qWwYgtY1CEDnkczgO

In [None]:
# Write a second copy of the table using the 'utf-8-sig' encoding (UTF-8 with a leading BOM)
# — presumably so spreadsheet tools detect the Vietnamese text as UTF-8; TODO confirm.
df.to_csv('tiki_products1.csv', encoding='utf-8-sig')

In [None]:
# Preview the first 25 rows of the scraped table.
df.head(25)

Unnamed: 0,name,price,product_url,image,tiki_fast,best_deal,installment,discount_percentage
0,Máy tính bảng Vankyo MatrixPad Z4 - Hàng Chính...,2790000,https://tka.tiki.vn/pixel?data=djAwMZPt8CVF1I7...,https://salt.tikicdn.com/cache/280x280/ts/prod...,False,True,False,-39%
1,Điện Thoại iPhone 11 64GB - Hàng Chính Hãng,14290000,https://tiki.vn/dien-thoai-iphone-11-64gb-hang...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,False,True,-38%
2,Điện thoại Vivo Y12s - Hàng Chính Hãng,2930000,https://tiki.vn/dien-thoai-vivo-y12s-hang-chin...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,False,-32%
3,Điện Thoại Samsung Galaxy M11 (3GB/32GB) - Hàn...,2590000,https://tiki.vn/dien-thoai-samsung-galaxy-m11-...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,False,-30%
4,Điện Thoại Samsung Galaxy A71 (8GB/128GB) - Hà...,7390000,https://tka.tiki.vn/pixel?data=djAwMc9kC5IKn6j...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,True,-30%
5,Điện Thoại Vsmart Joy 4 - Hàng Chính Hãng,2220000,https://tiki.vn/dien-thoai-vsmart-joy-4-hang-c...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,False,False,-33%
6,Điện Thoại iPhone 12 Pro Max 128GB - Hàng Chín...,28250000,https://tiki.vn/dien-thoai-iphone-12-pro-max-1...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,False,True,-17%
7,Điện Thoại Samsung Galaxy M12 (4GB/64GB) - Hàn...,3250000,https://tiki.vn/dien-thoai-samsung-galaxy-m12-...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,False,-16%
8,Điện Thoại Samsung Galaxy M12 (4GB/64GB) - Hàn...,3250000,https://tka.tiki.vn/pixel?data=djAwMf4zx7_iB7_...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,False,-16%
9,Điện Thoại Samsung Galaxy Note 10 (8GB/256GB) ...,11990000,https://tiki.vn/dien-thoai-samsung-galaxy-note...,https://salt.tikicdn.com/cache/280x280/ts/prod...,True,True,True,-48%


In [None]:
# Show column dtypes and non-null counts of the scraped table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1855 entries, 0 to 1854
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   name                 1855 non-null   object
 1   price                1855 non-null   int64 
 2   product_url          1855 non-null   object
 3   image                1855 non-null   object
 4   tiki_fast            1855 non-null   bool  
 5   best_deal            1855 non-null   bool  
 6   installment          1855 non-null   bool  
 7   discount_percentage  1855 non-null   object
dtypes: bool(3), int64(1), object(4)
memory usage: 78.0+ KB


In [None]:
# Share of rows taken by each distinct product name (normalize=True gives fractions
# instead of counts), most frequent first; a name appearing more than once means
# the same product was scraped several times.
df['name'].value_counts(normalize = True, sort = True)

Bộ Phát Wifi 4G LTE TP-Link TL-MR6400 Chuẩn N 300Mbps - Hàng Chính Hãng                                                              0.001617
Máy đo huyết áp dạng kẹp ngón tay  Mini SpO2 Monitor Oxygen Saturation Monitor Pulse Rate Measuring Gauge Device 5s Rapid Reading    0.001617
Tã Dán Pampers Nội Địa Nhật Bản NB70/S64/M52/L42                                                                                     0.001617
Combo 3 Tã Dán Pampers Thượng Hạng Nội Địa Nhật NB70/S64/M52/L42                                                                     0.001617
Bộ Kích Sóng Wifi Repeater Mercusys MW300RE 300Mbps - Hàng Chính Hãng                                                                0.001617
                                                                                                                                       ...   
Thùng Bia Tiger Crystal 24 Lon Cao (330ml/Lon) (Bật lon Tiger giơ cao bóng vàng)                                                     0.000539
Quần T