In [491]:
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import os
from random import shuffle
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pprint import pprint
from selenium.webdriver.common.keys import Keys
import time
import re
from underthesea import sent_tokenize
import string
from tqdm import tqdm
import json

In [492]:
# Chrome options shared by the crawler: run headless and pass the
# command-line switches below, in this order.
opts = Options()
opts.headless = True
for _chrome_flag in (
    "--disable-extensions",
    "--disable-infobars",
    "start-maximized",
    "--disable-notifications",
):
    opts.add_argument(_chrome_flag)

In [493]:

# Substrings that preprocess_text replaces with a single space, applied in
# list order. Multi-character tokens come first: "<br>" has to be matched
# before the punctuation entries consume "<" and ">", otherwise it never
# matches and a stray "br" is left in the text.
exclude = (
    ["<br>", "shopee"]
    + list(string.punctuation)
    # "\\s" is a literal backslash followed by "s" (same value the invalid
    # escape "\s" produced); it sits after the punctuation entries, which
    # already contain the backslash itself.
    + ["“", "”", "–", "‘", "’", "\n", "\\s", "'"]
)
# "." and "," are deliberately not replaced by a space in that pass.
exclude.remove(".")
exclude.remove(",")

# Lowercase Vietnamese letters carrying diacritics (plus "đ"), joined into one
# string so it can be dropped into a regex character class.
accented_chars_vietnamese = "".join([
    'á', 'à', 'ả', 'ã', 'ạ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ',
    'ó', 'ò', 'ỏ', 'õ', 'ọ', 'ô', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ',
    'é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ',
    'ú', 'ù', 'ủ', 'ũ', 'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự',
    'í', 'ì', 'ỉ', 'ĩ', 'ị',
    'ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ', 'đ',
])




def preprocess_text(text):
    """Normalise scraped text to lowercase letters, digits and single spaces.

    Steps, in order:
      1. Compose the text to Unicode NFC and lowercase it, so that letters
         arriving as "base letter + combining mark" match the precomposed
         characters in ``accented_chars_vietnamese`` instead of having their
         diacritics deleted by step 4.
      2. Replace every entry of the module-level ``exclude`` list with a space.
      3. Turn every whitespace run into one space.
      4. Delete every remaining character that is not an ASCII letter, a
         digit, a space, or listed in ``accented_chars_vietnamese``.
      5. Collapse the spaces left behind by step 4 and trim both ends.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string.
    """
    # Local import keeps this cell runnable on its own.
    import unicodedata

    text = unicodedata.normalize("NFC", text).lower()
    for token in exclude:
        text = text.replace(token, " ")
    # Whitespace must become plain spaces *before* the whitelist filter,
    # otherwise tabs/newlines would be deleted and glue words together.
    text = re.sub(r"\s+", " ", text)
    disallowed = f"[^A-Za-z0-9 {accented_chars_vietnamese}]+"
    text = re.sub(disallowed, '', text)
    # Deleting characters can leave adjacent or edge spaces; tidy them last.
    return re.sub(r" +", " ", text).strip()

# def search(query, browser):
#     product_lists = []
#     input_button = browser.find_element_by_class_name("shopee-searchbar-input__input")
#     input_button.clear()
#     input_button.send_keys(query)
#     input_button.send_keys(Keys.ENTER)
#     items = browser.find_elements_by_class_name("shopee-search-item-result__item")    

#     for i in range(min(20, len(items))):
#         product = {}
#         browser.implicitly_wait(2)
#         item = browser.find_elements_by_class_name("shopee-search-item-result__item")[i]
#         item.click()

#         #product info
#         product["title"] = preprocess_text(browser.find_element_by_css_selector(".qaNIZv").text)
#         product["price"] = preprocess_text(browser.find_element_by_css_selector("._3n5NQx").text)
#         #product description
#         product_details = browser.find_element_by_css_selector('div.product-detail.page-product__detail')
#         product_desc_1, product_desc_2 = product_details.find_elements_by_tag_name("div")[:2]
#         for field in product_desc_1.find_elements_by_class_name("kIo6pj"):
#             label = field.find_element_by_tag_name("label").text
#             text = preprocess_text(field.text)
#             product[label] = text

#         details_2 = preprocess_text(product_details.find_element_by_class_name("_2u0jt9").text)
#         product["desc_2"] = details_2
#         product_lists.append(product)
#         browser.back()
#     return product_lists

In [499]:
from tqdm.autonotebook import tqdm

def _scrape_product_page(browser):
    """Read one product from the product page currently open in ``browser``.

    Returns a dict with ``"title"``, ``"price"``, one entry per attribute
    field (keyed by the field's label text) and ``"desc_2"``; every value is
    passed through ``preprocess_text``. Any lookup failure propagates to the
    caller as the exception Selenium (or the list indexing) raises.
    """
    product = {}
    product["title"] = preprocess_text(browser.find_element(By.CSS_SELECTOR, ".qaNIZv").text)
    product["price"] = preprocess_text(browser.find_element(By.CSS_SELECTOR, "._3n5NQx").text)

    product_details = browser.find_element(By.CSS_SELECTOR, 'div.product-detail.page-product__detail')
    # Only the first <div> under the detail section is scanned for fields.
    attribute_block = product_details.find_elements(By.TAG_NAME, "div")[0]
    for field in attribute_block.find_elements(By.CLASS_NAME, "kIo6pj"):
        label = field.find_element(By.TAG_NAME, "label").text
        product[label] = preprocess_text(field.text)

    product["desc_2"] = preprocess_text(product_details.find_element(By.CLASS_NAME, "_2u0jt9").text)
    return product


def crawl_link(link, browser, max_page=50, max_item=40):
    """Crawl the listing already loaded in ``browser`` and dump one JSON file per page.

    For each page, up to ``max_item`` result tiles are clicked one by one,
    the product page is scraped, and the browser navigates back. The page's
    products are written to ``./dataset/<last segment of link>_<page>.json``
    before the "next page" button is pressed, so a failure to paginate never
    discards products that were already scraped.

    Args:
        link: URL of the listing; only its last ``/``-separated segment is
            used, as the output file stem. The caller must already have
            opened it in ``browser``.
        browser: Selenium WebDriver instance.
        max_page: Maximum number of listing pages to visit.
        max_item: Maximum number of products to open per page.

    Returns:
        None. Results are written to disk.
    """
    # selenium is already a dependency of this notebook; imported locally so
    # the function stays self-contained.
    from selenium.common.exceptions import WebDriverException

    item_class = "shopee-search-item-result__item"
    # Computed once instead of overwriting the ``link`` parameter every page.
    file_stem = link.split("/")[-1]
    os.makedirs("./dataset", exist_ok=True)

    for page in range(max_page):
        product_list = []
        items = browser.find_elements(By.CLASS_NAME, item_class)
        for i in tqdm(range(min(max_item, len(items)))):
            try:
                # NOTE: implicitly_wait sets the element-lookup timeout for the
                # session; it does not pause execution.
                browser.implicitly_wait(1.5)
                # Re-locate the tiles each time: references obtained before
                # navigating away are not reused after browser.back().
                item = browser.find_elements(By.CLASS_NAME, item_class)[i]
                item.click()

                # Best effort: a product that cannot be scraped is reported
                # and skipped, and the browser still returns to the listing.
                try:
                    product_list.append(_scrape_product_page(browser))
                except Exception as e:
                    print(e)
                browser.back()
            except Exception as e:
                print(e)

        # Persist this page before touching pagination.
        filename = "./dataset/{}_{}.json".format(file_stem, page)
        with open(filename, 'w', encoding='utf8') as f:
            json.dump(product_list, f, sort_keys=True, indent=4, ensure_ascii=False)

        try:
            next_page_btn = browser.find_element(By.CSS_SELECTOR, "button.shopee-icon-button.shopee-icon-button--right")
            next_page_btn.send_keys(Keys.ENTER)
        except WebDriverException as e:
            # No usable "next" button: stop paginating this listing instead
            # of aborting the whole crawl.
            print(e)
            break
        browser.implicitly_wait(2)
        print("next click")


In [503]:
# Resolve (download or reuse from cache) a chromedriver binary.
chrome_link = ChromeDriverManager().install()

# Category listing pages to crawl.
links = [
    "https://shopee.vn/%C3%81o-cat.77.1871",
    "https://shopee.vn/%C4%90%E1%BA%A7m-cat.77.2821",
    "https://shopee.vn/Ch%C3%A2n-v%C3%A1y-cat.77.2822",
    "https://shopee.vn/Qu%E1%BA%A7n-cat.77.1877",
    "https://shopee.vn/%C4%90%E1%BB%93-%C4%91%C3%B4i-cat.77.2335",
    "https://shopee.vn/%C4%90%E1%BB%93-l%C3%B3t-%C4%90%E1%BB%93-ng%E1%BB%A7-%C4%90%E1%BB%93-m%E1%BA%B7c-nh%C3%A0-cat.77.1879",
    "https://shopee.vn/Trang-ph%E1%BB%A5c-th%E1%BB%83-thao-cat.77.2823",
    "https://shopee.vn/Ph%E1%BB%A5-ki%E1%BB%87n-may-m%E1%BA%B7c-cat.77.10576",
    "https://shopee.vn/Th%E1%BB%9Di-trang-trung-ni%C3%AAn-cat.77.13506",
    "https://shopee.vn/%C3%81o-kho%C3%A1c-%C3%81o-vest-cat.77.1875",
    "https://shopee.vn/Trang-Ph%E1%BB%A5c-%C4%90%C3%B4ng-cat.77.9865",
    "https://shopee.vn/Th%E1%BB%9Di-trang-b%E1%BA%A7u-v%C3%A0-sau-sinh-cat.77.16848"
]

browser = Chrome(chrome_link, options=opts)
try:
    # NOTE(review): implicitly_wait takes seconds, so this is ~33 minutes.
    # crawl_link lowers the timeout as soon as it opens its first item, so
    # this value only governs the very first listing lookup - confirm that
    # seconds (not milliseconds) were intended.
    browser.implicitly_wait(2000)
    for link in links:
        browser.get(link)
        crawl_link(link, browser, max_page=6, max_item=40)
finally:
    # Always shut down Chrome and chromedriver, even if a crawl fails or the
    # cell is interrupted, so no headless browser process is left behind.
    browser.quit()


Looking for [chromedriver 78.0.3904.105 linux64] driver in cache 
File found in cache by path [/home/nguyen.thanh.trungb/.wdm/drivers/chromedriver/78.0.3904.105/linux64/chromedriver]


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.product-detail.page-product__detail"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range
Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.product-detail.page-product__detail"}
  (Session info: headless chrome=79.0.3945.88)

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range

next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)

list index out of range

next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

list index out of range

next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))

Message: no such element: Unable to locate element: {"method":"css selector","selector":".qaNIZv"}
  (Session info: headless chrome=79.0.3945.88)


next click


HBox(children=(IntProgress(value=0, max=40), HTML(value='')))


next click


In [454]:
import json

# product_lists = []
# for query in queries:
#     print(query)
#     product_list = search(query, browser)
#     print(len(product_list))
    


áo thun nam
20
váy ngắn nữ
20
quần bò nam
20
áo len
20
quần đùi nam
20
áo sơ mi trắng
20
áo sơ mi màu
0
áo sơ mi công sở
20
váy công sở
20
áo thun nữ
20
đồ ngủ
0
váy ngủ
20


In [432]:
# Load one saved result file and pretty-print it for inspection.
filename = "áo_sơ_mi_màu.txt"
# The saved products contain Vietnamese text, so decode explicitly instead of
# relying on the platform default encoding.
# NOTE(review): assumes the file was written as UTF-8 - confirm for files
# produced by older versions of the crawler.
with open(filename, 'r', encoding='utf8') as f:
    tmp = json.load(f)
# Printing does not need the file handle, so it happens after the file is closed.
pprint(tmp)

[{'Chất liệu': 'chất liệu chiffon',
  'Danh Mục': 'danh mục thời trang nữ áo áo kiểu',
  'Kho hàng': 'kho hàng 1994',
  'Kiểu tay': 'kiểu tay dài tay',
  'Thương hiệu': 'thương hiệu no brand',
  'desc_2': ' size chart size cm inch s length60cm 2362 chest88cm 3465 '
            'shoulder35cm 1378 sleeve56cm 2205 m length61cm 2402 chest92cm '
            '3622 shoulder36cm 1417 sleeve57cm 2244 l length62cm 2441 '
            'chest96cm 3780 shoulder37cm 1457 sleeve58cm 2283 xl length63cm '
            '2480 chest100cm 3937 shoulder38cm 1496 sleeve59cm 2323 2xl '
            'length64cm 2520 chest104cm 4094 shoulder39cm 1535 sleeve60cm 2362 '
            'note depend on defferent measurement methods  1 3cm error is '
            'normal promotion promo 1 new customers follow our store and like '
            '5 items which can be reduced by 2000 vnd  follow our store there '
            'will be surprises for you new designs and big discounts  promo 2 '
            'all five star praise 5 

  'title': 'áo sơ mi nam nhiều màu thời trang trẻ trung'},
 {'Chất liệu': 'chất liệu cotton',
  'Cổ áo': 'cổ áo cổ bẻ',
  'Danh Mục': 'danh mục thời trang nam áo sơ mi dài tay',
  'Dáng áo': 'dáng áo dáng vừa',
  'Gửi từ': 'gửi từ quận thủ đức tp hồ chí minh',
  'Kho hàng': 'kho hàng 1554',
  'Tay áo': 'tay áo tay dài',
  'Thương hiệu': 'thương hiệu no brand',
  'desc_2': 'áo sơ mi nam cotton lụa cao cấp cực phẩm thời trang chất vải '
            'cotton lụa co giãn mềm mịn như tơ cầm tay thích ngay chất vải mặc '
            'rất mát không xù lông không phai màu kiểu dáng đơn giãn bất hủ '
            'nhưng không kém phần trẻ trung và cực kỳ sang trọng hàng cao cấp '
            'đường may kỹ từng đương kim mũi chỉ form dáng thời trang phong '
            'cách hàn quốc đang được các bạn trẻ yêu thích nhiều màu lựa chọn '
            'lịch lãm và cực kỳ sang trọng hàng đẹp chuẩn 100 bao mọi chi phí '
            'đổi trả nếu khác hình hỗ trợ đổi size khi mặc chật không vừa tham '
   

  'Danh Mục': 'danh mục thời trang nam áo sơ mi dài tay',
  'Dáng áo': 'dáng áo dáng ôm',
  'Gửi từ': 'gửi từ quận hà đông hà nội',
  'Kho hàng': 'kho hàng 17217',
  'Tay áo': 'tay áo tay dài',
  'Thương hiệu': 'thương hiệu no brand',
  'Túi áo': 'túi áo không',
  'desc_2': 'lh 0985629309 để tìm hiểu thêm về sp quý khách hãy sử dụng mã '
            'giảm giá để được hỗ trợ tối đa 40k phí ship ạ mô tả sản phẩm  đã '
            'không làm thì thôi nhưng đã làm thì phải làm tốt nhất shop xin '
            'mang đến dòng sơ mi nam mầu trắng cho các quý ông trẻ tuổi lịch '
            'lãm và sang trọng  các bạn hãy chú ý đường may trên cổ tay và '
            'phần viền trước ngực của áo rất thẳng và đều không bị lỗi đè chỉ '
            'hay xiêu vẹo trên đường may đây chính là cách nhận biết hàng được '
            'may rất cẩn thận và cũng là điều dùng để phân biệt với hàng sơ mi '
            'trắng kém chất lượng hàng chợ màu đen chất liệu nến lụa cao cấp '
            'sờ cực sang 

            'du lịch xuất xứ việt nam kích cỡ size xs cho bạn có cân nặng từ '
            'dưới 40kg tùy chiều cao size s cho bạn có cân nặng từ dưới 41 '
            '50kg tùy chiều cao size m cho bạn có cân nặng từ 51 60kg tùy '
            'chiều cao size l cho bạn có cân nặng từ 61 65kg tùy chiều cao '
            'size xl cho bạn có cân nặng từ 66 70kg tùy chiều cao lưu ý nên '
            'giặt bằng tay nếu giặt bằng máy bạn nên giặt với nhiệt độ nước '
            'không quá 40 độ c để đảm bảo sản phẩm giữ được màu sắc ban đầu ',
  'price': '71000',
  'title': 'yêu thích áo sơ mi nam dài tay caro cô bẻ phối thân hai màu in '
           'hình kiêu dang trẻ trung mixxstoreno so mi nam 100005'},
 {'Chất liệu': 'chất liệu lụa',
  'Cách giặt': 'cách giặt giặt máy',
  'Cổ áo': 'cổ áo cổ bẻ',
  'Danh Mục': 'danh mục thời trang nam áo sơ mi dài tay',
  'Dáng áo': 'dáng áo dáng vừa',
  'Gửi từ': 'gửi từ quận gò vấp tp hồ chí minh',
  'Kho hàng': 'kho hàng 2957',
  'Tay áo': 'tay áo tay 