# **Crawling Tiki main categories and sub categories**



In [None]:
from bs4 import BeautifulSoup
import requests
import sqlite3

TIKI_URL = 'https://tiki.vn'

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
PATH_TO_DB = '/content/gdrive/MyDrive/'

In [None]:
conn = sqlite3.connect(PATH_TO_DB+'tiki.db')
cur = conn.cursor()

In [None]:
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}

### **Function to get the HTML response from a given URL**

In [None]:
# Get the HTML content get_url()
def get_url(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request or
        the parsing raised (the error is printed, not re-raised).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout).text
        return BeautifulSoup(response, 'html.parser')
    except Exception as err:
        # Best-effort crawl: report the failure and let the caller decide.
        print('ERROR BY REQUEST:', err)
        return None
        

In [None]:
# test if we can crawl Tiki
get_url(TIKI_URL)

### **Create main categories table**

In [None]:
# Create table categories in the database using a function
def create_main_cat_table():
    """Create the ``main_cat`` table if it is not there yet.

    The table has an auto-incremented ``id``, the category ``name`` and
    ``url``, and a ``create_at`` timestamp defaulting to the insertion time.
    Any error raised while executing or committing is printed, not re-raised.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS main_cat (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT, 
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(ddl)
        conn.commit()
    except Exception as exc:
        print('ERROR BY CREATE TABLE', exc)


# Make sure the table exists as soon as this cell has run.
create_main_cat_table()

### **Create sub-categories table**

In [None]:
# Create the sub-categories table in the database using a function
def create_sub_cat_table():
    """Create the ``sub_cat`` table if it is not there yet.

    Besides the columns shared with ``main_cat`` (``id``, ``name``, ``url``,
    ``create_at``) each row stores ``parent_id`` and ``parent_name``, the id
    and name of the category it was found under.
    Any error raised while executing or committing is printed, not re-raised.
    """
    # parent_id was declared with the misspelled type name "INTERGER".
    query = """
        CREATE TABLE IF NOT EXISTS sub_cat (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name VARCHAR(255),
            url TEXT, 
            parent_id INTEGER,
            parent_name TEXT,
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(query)
        conn.commit()
    except Exception as err:
        print('ERROR BY CREATE TABLE', err)


# Make sure the table exists as soon as this cell has run.
create_sub_cat_table()

### **OOP to do CRUD (create/ read/ update/ delete) on database**

In [None]:
# Instead of using a function to do CRUD on database,
# creating a class Category is preferred
# attributes: name, url, cat_id
# instance method: save_into_db()
class Category:
    """A category that is stored as one row of the ``main_cat`` table.

    Attributes are ``cat_id`` (``None`` until the row has been inserted),
    ``name`` and ``url``.
    """

    def __init__(self, name, url, cat_id=None):
        self.cat_id, self.name, self.url = cat_id, name, url

    def __repr__(self):
        return f"ID: {self.cat_id}, Name: {self.name}, URL: {self.url}"

    def save_into_db(self):
        """Insert this category into ``main_cat`` and remember the new row id.

        Errors raised by the insert or the commit are printed, not re-raised.
        """
        insert_sql = """
            INSERT INTO main_cat (name, url)
            VALUES (?, ?);
        """
        params = (self.name, self.url)
        try:
            cur.execute(insert_sql, params)
            # lastrowid is the AUTOINCREMENT id SQLite assigned to this row.
            self.cat_id = cur.lastrowid
            conn.commit()
        except Exception as exc:
            print('ERROR BY INSERT:', exc)

class Sub_Category(Category):
    """A category found under another category; stored in ``sub_cat``.

    Extends ``Category`` with ``parent_id`` and ``parent_name``, the id and
    name of the category whose page listed this one.
    """

    def __init__(self, name, url, parent_id=None, parent_name=None, cat_id=None):
        # Let the base class set cat_id / name / url instead of repeating it.
        super().__init__(name, url, cat_id)
        self.parent_id = parent_id
        self.parent_name = parent_name

    def save_into_db(self):
        """Persist this sub-category in ``sub_cat``.

        The inherited implementation inserts into ``main_cat``, which would
        file a sub-category as a main category, so route it to the right table.
        """
        self.save_sub_into_db()

    def save_sub_into_db(self):
        """Insert this sub-category into ``sub_cat`` and remember the new row id.

        Errors raised by the insert or the commit are printed, not re-raised.
        """
        query = """
            INSERT INTO sub_cat (name, url, parent_id, parent_name)
            VALUES (?, ?, ?, ?);
        """
        val = (self.name, self.url, self.parent_id, self.parent_name)
        try:
            cur.execute(query, val)
            self.cat_id = cur.lastrowid
            conn.commit()
        except Exception as err:
            print('ERROR BY INSERT:', err)


### **Get main categories**

In [None]:
# Names of every category registered so far during the crawl.
CATEGORY_SET = set()


def can_add_to_cat_set(cat_name, save=False):
    """Tell whether *cat_name* has not been registered in CATEGORY_SET yet.

    Args:
        cat_name: Category name to look up.
        save: When true and the name is new, register it (and print a notice).

    Returns:
        ``True`` if the name was not in CATEGORY_SET before the call,
        ``False`` if it already was.
    """
    if cat_name in CATEGORY_SET:
        return False
    if save:
        CATEGORY_SET.add(cat_name)
        print(f'Added "{cat_name}" to CATEGORY_SET')
    return True

In [None]:
def get_main_categories(save_db=False):
    """Collect the main categories linked from the page at TIKI_URL.

    Every ``<a class="menu-link">`` that carries a ``<span class="text">``
    label and an ``href`` becomes one ``Category``.

    Args:
        save_db: When true, each category is inserted into ``main_cat`` and
            its name is registered in CATEGORY_SET.

    Returns:
        A list of ``Category`` objects; empty when the page could not be
        fetched.
    """
    soup = get_url(TIKI_URL)
    if soup is None:
        # get_url() has already printed the error; there is nothing to parse.
        return []

    result = []
    for a in soup.find_all('a', {'class': 'menu-link'}):
        label = a.find('span', {'class': 'text'})
        url = a.get('href')
        if label is None or url is None:
            # Not a category entry (no label or no target): skip it instead
            # of aborting the whole crawl with an AttributeError/KeyError.
            continue
        name = label.text.strip()

        # Only the registration side effect matters here; main categories are
        # kept even if their name was already seen.
        can_add_to_cat_set(name, save_db)

        main_cat = Category(name, url)
        if save_db:
            main_cat.save_into_db()
        result.append(main_cat)
    return result

In [None]:
# Rebuild main_cat from scratch and crawl the main categories again.
# (Use get_main_categories(save_db=False) instead for a dry run.)
# IF EXISTS keeps the cell runnable even when the table is not there.
cur.execute('DROP TABLE IF EXISTS main_cat;')
conn.commit()
create_main_cat_table()
# Forget names registered by an earlier run in this kernel; otherwise a later
# sub-category crawl treats every name as already seen and collects nothing.
CATEGORY_SET.clear()
main_categories = get_main_categories(save_db=True)
cur.execute('SELECT * FROM main_cat;').fetchall()


In [None]:
main_categories

[ID: 1, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner,
 ID: 2, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner,
 ID: 3, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner,
 ID: 4, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner,
 ID: 5, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner,
 ID: 6, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner,
 ID: 7, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner,
 ID: 8, Name: Hàng Tiêu Dùng - Thực Phẩm, URL: https://tiki.vn/bach-hoa-online/c4384?src=c.4384.hamburger_menu_fly_out_banner,
 ID: 9, Nam

### **Get all sub-categories**

In [None]:
import re

# get_sub_categories() given a parent category
def get_sub_categories(parent_category, save_db=False):
    """Collect the not-yet-seen sub-categories listed on a category page.

    Args:
        parent_category: Object exposing ``url``, ``cat_id`` and ``name``; its
            page is fetched and its id/name are recorded on every child.
        save_db: When true, new names are registered in CATEGORY_SET and each
            child is inserted into ``sub_cat``.

    Returns:
        A list of ``Sub_Category`` objects. If anything raises while fetching
        or parsing, the error is printed and whatever was collected up to
        that point is returned.
    """
    children = []
    page_url = parent_category.url

    try:
        page = get_url(page_url)
        for anchor in page.find_all('a', {'class': 'item item--category '}):
            print(anchor)
            label = anchor.text.strip()
            if not can_add_to_cat_set(label, save_db):
                # Name already registered: do not record it a second time.
                continue
            # parent_category.cat_id becomes the child's parent_id.
            child = Sub_Category(label, anchor['href'], parent_category.cat_id, parent_category.name)
            if save_db:
                child.save_sub_into_db()
            children.append(child)
    except Exception as exc:
        print('ERROR IN GETTING SUB CATEGORIES:', exc)
    return children

In [None]:
# get_all_categories() given a list of main categories (This is a recursion function)
def get_all_categories(categories, save_db):
    """Walk the category tree depth-first, starting from *categories*.

    For each category its sub-categories are fetched with
    get_sub_categories() and then explored the same way; the recursion ends
    when a level has no categories left.

    Args:
        categories: Categories whose descendants should be crawled.
        save_db: Passed through to get_sub_categories() on every level.
    """
    if len(categories) > 0:
        for parent in categories:
            print(f'Getting {parent} sub-categories...')
            children = get_sub_categories(parent, save_db=save_db)
            print(f'Finished! {parent.name} has {len(children)} sub-categories')
            # Keep the same save_db setting all the way down the tree.
            get_all_categories(children, save_db=save_db)

In [None]:
# Rebuild sub_cat from scratch before crawling. IF EXISTS makes the drop safe
# on a fresh database, so the table no longer has to be created first just to
# be dropped.
cur.execute('DROP TABLE IF EXISTS sub_cat;')
conn.commit()
create_sub_cat_table()

In [None]:
get_all_categories(main_categories,save_db=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<a class="item item--category " href="https://tiki.vn/thiet-bi-mang/c21436?src=c.17166.hamburger_menu_fly_out_banner" style="padding-left:0">Thiết bị mạng</a>
<a class="item item--category " href="https://tiki.vn/thiet-bi-so-khac/c21440?src=c.17166.hamburger_menu_fly_out_banner" style="padding-left:0">Thiết bị số - Khác</a>
<a class="item item--category " href="https://tiki.vn/thiet-bi-thong-minh/c21422?src=c.17166.hamburger_menu_fly_out_banner" style="padding-left:0">Thiết bị thông minh</a>
<a class="item item--category " href="https://tiki.vn/the-nho-dien-thoai/c21420?src=c.17166.hamburger_menu_fly_out_banner" style="padding-left:0">Thẻ nhớ điện thoại</a>
<a class="item item--category " href="https://tiki.vn/dau-doc-the-nho/c21392?src=c.17166.hamburger_menu_fly_out_banner" style="padding-left:0">Đầu đọc thẻ nhớ</a>
<a class="item item--category " href="https://tiki.vn/de-tan-nhiet-laptop/c21396?src=c.17166.hamburger_men

In [None]:
main_categories

[ID: 1, Name: Điện Thoại - Máy Tính Bảng, URL: https://tiki.vn/dien-thoai-may-tinh-bang/c1789?src=c.1789.hamburger_menu_fly_out_banner,
 ID: 2, Name: Điện Tử - Điện Lạnh, URL: https://tiki.vn/tivi-thiet-bi-nghe-nhin/c4221?src=c.4221.hamburger_menu_fly_out_banner,
 ID: 3, Name: Phụ Kiện - Thiết Bị Số, URL: https://tiki.vn/thiet-bi-kts-phu-kien-so/c1815?src=c.1815.hamburger_menu_fly_out_banner,
 ID: 4, Name: Laptop - Thiết bị IT, URL: https://tiki.vn/laptop-may-vi-tinh/c1846?src=c.1846.hamburger_menu_fly_out_banner,
 ID: 5, Name: Máy Ảnh - Quay Phim, URL: https://tiki.vn/may-anh/c1801?src=c.1801.hamburger_menu_fly_out_banner,
 ID: 6, Name: Điện Gia Dụng, URL: https://tiki.vn/dien-gia-dung/c1882?src=c.1882.hamburger_menu_fly_out_banner,
 ID: 7, Name: Nhà Cửa Đời Sống, URL: https://tiki.vn/nha-cua-doi-song/c1883?src=c.1883.hamburger_menu_fly_out_banner,
 ID: 8, Name: Hàng Tiêu Dùng - Thực Phẩm, URL: https://tiki.vn/bach-hoa-online/c4384?src=c.4384.hamburger_menu_fly_out_banner,
 ID: 9, Nam

In [None]:
cur.execute(
'''
SELECT count(*) FROM sub_cat
;
''').fetchall()

[(2577,)]

### **Find the lowest level categories**

In [None]:
# Leaf categories: rows of sub_cat whose name never appears as the
# parent_name of any row, i.e. categories with no recorded children.
# NOTE(review): NOT IN yields no rows at all if the subquery contains a NULL;
# this presumably relies on parent_name always being filled in — confirm.
lowest_level = cur.execute(
'''
SELECT * FROM sub_cat 
WHERE name not in (SELECT DISTINCT parent_name FROM sub_cat)
;
''').fetchall()


In [None]:
len(lowest_level)

2147

In [None]:
# Keep only the url of each leaf category: SELECT * returns the sub_cat
# columns in declaration order (id, name, url, ...), so url is at index 2.
lowest_level_url = [str(row[2]) for row in lowest_level]


In [None]:
lowest_level_url

['https://tiki.vn/may-tinh-bang/c1794?src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-doc-sach/c28856?src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/dien-thoai-smartphone/c1795?src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/dien-thoai-ban/c8061?src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/dien-thoai-pho-thong/c1796?src=c.1789.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-giat/c3862?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-lanh-may-dieu-hoa/c3865?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-nuoc-nong/c3866?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-rua-chen/c3864?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/may-say-quan-ao/c3863?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/tu-lanh/c2328?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tiki.vn/tu-dong-tu-mat/c3868?src=c.4221.hamburger_menu_fly_out_banner',
 'https://tik

### **Create product table**

In [None]:
# Start from an empty product table. IF EXISTS is required because on a first
# run the table has not been created yet and a plain DROP TABLE would raise.
cur.execute('DROP TABLE IF EXISTS product;')
conn.commit()

In [None]:
# Create the product table in the database using a function
def create_product_table():
    """Create the ``product`` table if it is not there yet.

    ``product_title`` is the primary key; all other scraped fields are stored
    as TEXT, plus a ``create_at`` timestamp defaulting to the insertion time.
    Any error raised while executing or committing is printed, not re-raised.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS product (
            product_title VARCHAR(255) PRIMARY KEY,
            price TEXT,
            url_img TEXT,
            url_page TEXT,
            tiki_now TEXT,
            free_ship TEXT, 
            reviews TEXT, 
            rating TEXT, 
            under_price TEXT,
            discount TEXT, 
            shocking_price TEXT,
            installment TEXT,
            free_gift TEXT,
            cat_url TEXT,
            create_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """
    try:
        cur.execute(ddl)
        conn.commit()
    except Exception as exc:
        print('ERROR BY CREATE TABLE', exc)


# Make sure the table exists as soon as this cell has run.
create_product_table()

In [None]:
class Products:
    """One scraped product, persisted as a row of the ``product`` table.

    The constructor stores every argument under the attribute of the same
    name. ``product_id`` is kept on the instance only: the ``product`` table
    created in this notebook has no such column, so it is not written.
    """

    # Columns written by save_into_db(), in INSERT order. The placeholder list
    # is derived from this tuple so the two can never get out of step.
    _COLUMNS = (
        'product_title', 'price', 'url_img', 'url_page', 'tiki_now',
        'free_ship', 'reviews', 'rating', 'under_price', 'discount',
        'shocking_price', 'installment', 'free_gift', 'cat_url',
    )

    def __init__(self, product_id, product_title, price, url_img, url_page, tiki_now, free_ship, reviews, rating, under_price, discount, shocking_price, installment, free_gift, cat_url):
        self.product_id = product_id
        self.product_title = product_title
        self.price = price
        self.url_img = url_img
        self.url_page = url_page
        self.tiki_now = tiki_now
        self.free_ship = free_ship
        self.reviews = reviews
        self.rating = rating
        self.under_price = under_price
        self.discount = discount
        self.shocking_price = shocking_price
        self.installment = installment
        self.free_gift = free_gift
        self.cat_url = cat_url

    def save_into_db(self):
        """Insert this product into the ``product`` table.

        Errors raised by the insert or the commit (for instance a duplicate
        ``product_title``, which is the primary key) are printed, not
        re-raised.
        """
        columns = ', '.join(self._COLUMNS)
        placeholders = ', '.join('?' for _ in self._COLUMNS)
        # Only fixed column names are interpolated; values stay parameterized.
        query = f"INSERT INTO product ({columns}) VALUES ({placeholders});"
        val = tuple(getattr(self, column) for column in self._COLUMNS)
        try:
            cur.execute(query, val)
            conn.commit()
        except Exception as err:
            print('ERROR BY INSERT:', err)

### **Scrape product function**

In [None]:
# get products of all lowest level:
from time import sleep
from random import randint
import pandas as pd

def get_tiki_data(url):
    """Fetch a listing page and return its product anchors.

    The page is requested repeatedly, with a random 8-15 second pause between
    attempts, until it contains at least one ``<a class="product-item">``.

    Args:
        url: Address of the listing page.

    Returns:
        The list of matching anchor tags, or ``None`` when the request itself
        fails or the page is still empty after more than 30 retries.
    """
    failed_count = 0
    while True:
        try:
            # A timeout keeps one stalled connection from blocking the crawl.
            r = requests.get(url, headers=HEADERS, timeout=30)
        except requests.RequestException as err:
            # Only network/HTTP errors are handled here, so Ctrl-C and
            # programming errors are no longer swallowed.
            print('ERROR BY REQUEST:', err)
            return None

        # r.text is an HTML document, hence html.parser.
        soup = BeautifulSoup(r.text, 'html.parser')
        print("\nAll occurences of the product div sections:")
        products = soup.find_all('a', {'class': 'product-item'})
        print("Number of products:", len(products))

        if products:
            return products

        # Sometimes the page comes back without products on the first tries.
        if failed_count > 30:
            return None
        failed_count += 1
        sleep(randint(8, 15))

def _div_text(product, css_class, default=''):
    """Return the text of the first <div class=css_class> in *product*, or *default*."""
    node = product.find('div', {'class': css_class})
    return node.text if node is not None else default


def _badge_flag(product, css_class, child):
    """Return 'Yes' if <div class=css_class> exists and contains a *child* tag, else 'No'."""
    node = product.find('div', {'class': css_class})
    return 'Yes' if node is not None and node.find(child) is not None else 'No'


def _rating_style(product):
    """Return the style attribute of the second div inside <div class="rating">, or ''."""
    star = product.find('div', {'class': 'rating'})
    if star is None:
        return ''
    bars = star.find_all('div')
    return bars[1].get('style', '') if len(bars) > 1 else ''


def scrape_data_items(product_list, level_url='', save_db=False):
    """Turn product anchors into dictionaries of scraped fields.

    Args:
        product_list: Product anchor tags, e.g. the result of get_tiki_data().
        level_url: URL of the category page the products came from; stored
            as ``cat_url`` on every row.
        save_db: When true, each row is also persisted through
            ``Products.save_into_db()``.

    Returns:
        One dict per usable product with the keys ``product_title``,
        ``price``, ``url_img``, ``url_page``, ``tiki_now``, ``free_ship``,
        ``reviews``, ``rating``, ``under_price``, ``discount``,
        ``shocking_price``, ``installment``, ``free_gift`` and ``cat_url``.
        Products lacking a title, price, image or link are skipped; other
        missing pieces fall back to ``''``, ``'No'`` or ``'NA'``.
    """
    data = []
    for product in product_list:
        # Each product is a dictionary containing the required information.
        d = {'product_title': '', 'price': '', 'url_img': '', 'url_page': '',
             'tiki_now': '', 'free_ship': '', 'reviews': '', 'rating': '',
             'under_price': '', 'discount': '', 'shocking_price': '',
             'installment': '', 'free_gift': '', 'cat_url': ''}

        try:
            # Mandatory fields: without them the row is useless.
            d['product_title'] = product.find('div', {'class': 'name'}).span.text
            d['price'] = product.find('div', {'class': 'price-discount__price'}).text
            d['url_img'] = product.img['src']
            d['url_page'] = product['href']
        except (AttributeError, KeyError, TypeError):
            # Skip this product and report it, then carry on with the rest.
            print("We got one article error!")
            continue

        d['discount'] = _div_text(product, 'price-discount__discount')

        # Review & rating
        d['reviews'] = _div_text(product, 'review')
        d['rating'] = _rating_style(product)

        # Tikinow
        d['tiki_now'] = _badge_flag(product, 'badge-service', 'div')

        # Badge_under_price
        d['under_price'] = _badge_flag(product, 'badge-under-price', 'div')

        # Free gift
        has_gift = product.find('div', {'class': 'freegift-list'}) is not None
        d['free_gift'] = 'Yes' if has_gift else 'No'

        # paid_by_installment
        d['installment'] = _badge_flag(product, 'badge-benefits', 'span')

        # Freeship / shocking price share the same top badge: the badge text
        # goes to free_ship when it reads 'Freeship' and to shocking_price
        # otherwise; the other field gets 'NA'.
        top = product.find('div', {'class': 'badge-top'})
        has_label = top is not None and top.span is not None
        badge_top = top.span.text if has_label else 'NA'
        d['shocking_price'] = badge_top if badge_top != 'Freeship' else 'NA'
        d['free_ship'] = badge_top if badge_top == 'Freeship' else 'NA'

        d['cat_url'] = level_url

        data.append(d)
        if save_db:
            # The dict keys match the Products constructor parameters; no
            # product id is scraped from the listing, hence None.
            Products(product_id=None, **d).save_into_db()

    return data

In [None]:
# Try the product crawl on a slice of the leaf-category URLs
# (indices 1 to 49; the first URL, index 0, is not included).
# NOTE(review): get_products() is not defined in the visible part of this
# notebook — confirm it exists before running this cell.
test = lowest_level_url[1:50]
items = get_products(test,save=True)
items