In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Run this cell if the pymssql library is not installed yet

In [None]:
# !pip install pymssql

In [None]:
import pymssql

import os

# Connection settings can be overridden through environment variables so the
# notebook is not tied to one machine and credentials never have to be typed
# into the notebook. The defaults reproduce the values that were hard-coded here.
conn = pymssql.connect(
    server=os.environ.get('E_MARKET_DB_SERVER', 'THANH'),
    user=os.environ.get('E_MARKET_DB_USER', ''),
    password=os.environ.get('E_MARKET_DB_PASSWORD', ''),
    database=os.environ.get('E_MARKET_DB_NAME', 'E_Market')
)
cursor = conn.cursor()

In [None]:
# Create the Users, Products and Feedbacks tables; each CREATE is skipped when
# OBJECT_ID reports that the table already exists, so the cell can be re-run.
# All free-text columns are NVARCHAR because the scraped shop names, product
# names and reviews are Vietnamese; review bodies get NVARCHAR(MAX) since a
# review can be longer than 255 characters.
cursor.execute('''
    IF OBJECT_ID('Users', 'U') IS NULL
    CREATE TABLE Users (
        id INTEGER PRIMARY KEY,
        name NVARCHAR(255),
        follower INTEGER
    );

    IF OBJECT_ID('Products', 'U') IS NULL
    CREATE TABLE Products (
        id INTEGER PRIMARY KEY,
        name NVARCHAR(255),
        id_shop INTEGER,
        price FLOAT,
        FOREIGN KEY (id_shop) REFERENCES Users(id)
    );

    IF OBJECT_ID('Feedbacks', 'U') IS NULL
    CREATE TABLE Feedbacks (
        id INTEGER PRIMARY KEY,
        id_product INTEGER,
        rate INTEGER,
        content NVARCHAR(MAX),
        FOREIGN KEY (id_product) REFERENCES Products(id)
    );
''')

conn.commit()

# Setup Selenium and Web Driver

Run this cell if the selenium and webdriver-manager libraries are not installed yet

In [None]:
# !pip install selenium
# !pip install webdriver-manager

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import time

# Start the single Chrome session that the scraping cells below share.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)
driver.maximize_window()

# Scrape Data from Websites

## Lazada website

In [96]:
df_Laz_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Voucher', 'Sold', 'NumRate'])
df_Laz_shop = pd.DataFrame(columns = ['Url', 'Name', 'Followers'])
df_Laz_feedback = pd.DataFrame(columns = ['Product_Url', 'Content', 'Rate'])

Extract information from 120 products on Lazada website

In [97]:
# Open Lazada and search for the keyword.
url = 'https://www.lazada.vn/#?'
driver.get(url)
search_box = driver.find_element(By.CSS_SELECTOR, "input.search-box__input--O34g")
search_box.send_keys("Quần áo nữ")
search_button = driver.find_element(By.CSS_SELECTOR, "a.search-box__button--1oH7")
search_button.click()

# Number of result pages to walk through.
LAZ_RESULT_PAGES = 3


def _texts_on_page(css_selector):
    """Return the text of every element on the current page that matches css_selector."""
    return [e.text for e in driver.find_elements(By.CSS_SELECTOR, css_selector)]


def _fit_to_length(values, length, filler):
    """Return values padded with filler, or cut, so that it holds exactly length items."""
    if len(values) > length:
        # A longer column would make the DataFrame constructor raise; report it instead.
        print(f'Warning: dropped {len(values) - length} surplus value(s)')
    return values[:length] + [filler] * (length - len(values))


for i in range(LAZ_RESULT_PAGES):
    time.sleep(2)

    # find_elements returns an empty list when nothing matches, so no
    # NoSuchElementException handling is needed around these lookups.
    name_elements = driver.find_elements(By.CSS_SELECTOR, '.RfADt a')
    urls = [e.get_attribute('href') for e in name_elements]
    default_length = len(urls)

    # NOTE(review): every column is collected page-wide and matched by position,
    # so a product that lacks a voucher/sold/rating element shifts the values of
    # the products after it. Confirm against the page before trusting per-row data.
    df = pd.DataFrame({
            'Url': urls,
            'Name': [e.text for e in name_elements],
            # Fillers follow each column's own format: plain "0" for a price,
            # "0%" for a voucher such as "Voucher save 61%".
            'Price': _fit_to_length(_texts_on_page('.aBrP0'), default_length, "0"),
            'Voucher': _fit_to_length(_texts_on_page('.WNoq3'), default_length, "0%"),
            'Sold': _fit_to_length(_texts_on_page('span._1cEkb'), default_length, "0"),
            'NumRate': _fit_to_length(_texts_on_page('span.qzqFw'), default_length, "0")
        })

    df_Laz_product = pd.concat([df_Laz_product, df], ignore_index = True)

    if i == LAZ_RESULT_PAGES - 1:
        # Last requested page has been read; no need to load another one.
        break

    # Look the button up again on every page: a reference obtained on an
    # earlier page becomes stale once the result list is re-rendered.
    next_buttons = driver.find_elements(By.CSS_SELECTOR, 'li[title="Next Page"] button')
    if not next_buttons:
        print('No next-page button found; stopping early.')
        break
    next_buttons[0].click()

In [98]:
df_Laz_product

Unnamed: 0,Url,Name,Price,Voucher,Sold,NumRate
0,https://www.lazada.vn/products/do-bo-nu-di-cho...,Đồ Bộ Nữ Đi Chơi Chất Thun Mát Size M - dưới 5...,"₫39,000",Voucher save 61%,3.9K sold,(1370)
1,https://www.lazada.vn/products/set-ao-phong-ta...,Sét áo phông tay lỡ phối kẻ + quần kẻ dài mẫu mới,"₫49,000",Voucher save 62%,23 sold,(11)
2,https://www.lazada.vn/products/bo-pijama-do-bo...,"Bộ Pijama, Đồ Bộ Mặc Nhà Nữ Dáng Cộc xinh xắn","₫19,000",Voucher save 46%,458 sold,(131)
3,https://www.lazada.vn/products/re-vo-dich-tong...,[ Rẻ Vô Địch ] Tổng hợp các sét quần hoạt hình...,"₫79,200",Voucher save 28%,265 sold,(91)
4,https://www.lazada.vn/products/set-bo-ao-phong...,Set Bộ áo phông BđBcR Nữ Cotton Mix Quần dài K...,"₫49,000",Voucher save 29%,171 sold,(60)
...,...,...,...,...,...,...
115,https://www.lazada.vn/products/set-quan-short-...,Set quần short đen + áo len sọc xanh QC,"₫52,000",,18 sold,(5)
116,https://www.lazada.vn/products/ph-set-bo-3-ao-...,(PH) Set Bộ 3 Áo Ống Kèm Áo Lưới Tay Dài Croto...,"₫105,000",Voucher save 30%,7 sold,(1)
117,https://www.lazada.vn/products/quan-dui-nu-mac...,"Quần đùi nữ mặc nhà ỐNG RỘNG, VẢI XƯỢC, DÂY KÉ...","₫35,800",Voucher save 47%,15 sold,(6)
118,https://www.lazada.vn/products/do-bo-dui-co-tr...,Đồ Bộ Đùi Cổ Tròn Tay Cộc Chất Thun Cotton Su ...,"₫79,000",Voucher save 50%,0,0


In [99]:
product_urls = list(df_Laz_product['Url'])

In [100]:
print(len(product_urls))
for url in product_urls:
    print(url)

120
https://www.lazada.vn/products/do-bo-nu-di-choi-chat-thun-mat-size-m-duoi-53kg-size-l-tu-54kg-den-65kg-size-xl-tu-65kg-den-72kg-i1072394871.html
https://www.lazada.vn/products/set-ao-phong-tay-lo-phoi-ke-quan-ke-dai-mau-moi-i2598385570.html
https://www.lazada.vn/products/bo-pijama-do-bo-mac-nha-nu-dang-coc-xinh-xan-i2379278926.html
https://www.lazada.vn/products/re-vo-dich-tong-hop-cac-set-quan-hoat-hinh-mix-ao-phong-form-rong-unisex-i2655851219.html
https://www.lazada.vn/products/set-bo-ao-phong-bdbcr-nu-cotton-mix-quan-dai-ke-tho-cap-chun-i2633567179.html
https://www.lazada.vn/products/set-bo-hottrend-ao-thun-babytee-gan-no-co-no-tay-quan-suong-kaki-4-soc-that-no-i2724942439.html
https://www.lazada.vn/products/set-bo-ao-thun-croptop-mix-quan-xuong-hoat-hinh-i2711435293.html
https://www.lazada.vn/products/quan-ao-nu-set-ao-3-chu-gau-kem-quan-ong-rong-i2044014686.html
https://www.lazada.vn/products/set-bo-do-nu-ao-thun-phoi-tay-bbr-mix-quan-xuong-ke-dang-rong-i2633561733.html
https

Store the product URLs in a text file as a backup

In [101]:
# Back up the product URLs, one per line.
with open('Laz_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

Retrieve the list of product URLs from the backup file

In [102]:
# Reload the backed-up URLs, one per line, with surrounding whitespace removed.
with open('Laz_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [103]:
Laz_product_urls = links

Store the product dataframe in a csv file as a backup

In [104]:
df_Laz_product.to_csv('lazada_products.csv', index=False)

Retrieve the product dataframe from the backup file

In [105]:
df_Laz_product = pd.read_csv('lazada_products.csv')
df_Laz_product

Unnamed: 0,Url,Name,Price,Voucher,Sold,NumRate
0,https://www.lazada.vn/products/do-bo-nu-di-cho...,Đồ Bộ Nữ Đi Chơi Chất Thun Mát Size M - dưới 5...,"₫39,000",Voucher save 61%,3.9K sold,(1370)
1,https://www.lazada.vn/products/set-ao-phong-ta...,Sét áo phông tay lỡ phối kẻ + quần kẻ dài mẫu mới,"₫49,000",Voucher save 62%,23 sold,(11)
2,https://www.lazada.vn/products/bo-pijama-do-bo...,"Bộ Pijama, Đồ Bộ Mặc Nhà Nữ Dáng Cộc xinh xắn","₫19,000",Voucher save 46%,458 sold,(131)
3,https://www.lazada.vn/products/re-vo-dich-tong...,[ Rẻ Vô Địch ] Tổng hợp các sét quần hoạt hình...,"₫79,200",Voucher save 28%,265 sold,(91)
4,https://www.lazada.vn/products/set-bo-ao-phong...,Set Bộ áo phông BđBcR Nữ Cotton Mix Quần dài K...,"₫49,000",Voucher save 29%,171 sold,(60)
...,...,...,...,...,...,...
115,https://www.lazada.vn/products/set-quan-short-...,Set quần short đen + áo len sọc xanh QC,"₫52,000",,18 sold,(5)
116,https://www.lazada.vn/products/ph-set-bo-3-ao-...,(PH) Set Bộ 3 Áo Ống Kèm Áo Lưới Tay Dài Croto...,"₫105,000",Voucher save 30%,7 sold,(1)
117,https://www.lazada.vn/products/quan-dui-nu-mac...,"Quần đùi nữ mặc nhà ỐNG RỘNG, VẢI XƯỢC, DÂY KÉ...","₫35,800",Voucher save 47%,15 sold,(6)
118,https://www.lazada.vn/products/do-bo-dui-co-tr...,Đồ Bộ Đùi Cổ Tròn Tay Cộc Chất Thun Cotton Su ...,"₫79,000",Voucher save 50%,0,0


Retrieve the shop urls on Lazada website

In [31]:
def count_rate_star(containerStar_element):
    """Count the gold-star images inside one star container.

    Args:
        containerStar_element: element exposing ``find_elements``; its ``img``
            descendants are inspected.

    Returns:
        int: number of ``img`` elements whose ``src`` equals the gold-star image URL.
    """
    goldStar_url = 'https://img.lazcdn.com/g/tps/tfs/TB19ZvEgfDH8KJjy1XcXXcpdXXa-64-64.png'
    # Search inside the element that was passed in, not a global variable.
    img_elements = containerStar_element.find_elements(By.TAG_NAME, 'img')
    return sum(1 for img in img_elements if img.get_attribute('src') == goldStar_url)

In [19]:
def scroll_to_feedbacks(driver):
    """Scroll the element matching ``.mod-review`` into view, then wait two seconds.

    Prints a notice instead of raising when no such element is found.
    """
    review_selector = '.mod-review'
    scroll_script = "arguments[0].scrollIntoView();"
    try:
        review_block = driver.find_element(By.CSS_SELECTOR, review_selector)
        driver.execute_script(scroll_script, review_block)
        time.sleep(2)
    except NoSuchElementException:
        print("No reviews section found.")

def scroll_to_next_page(driver):
    """Scroll the review pager's "next" button into view, then wait two seconds.

    Prints a notice instead of raising when the button is not found.
    """
    pager_selector = 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next'
    scroll_script = "arguments[0].scrollIntoView();"
    try:
        pager_button = driver.find_element(By.CSS_SELECTOR, pager_selector)
        driver.execute_script(scroll_script, pager_button)
        time.sleep(2)
    except NoSuchElementException:
        print("No next button found.")

In [None]:
# "Next page" button of the review pager.
NEXT_REVIEW_PAGE_SELECTOR = 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next'
# An empty CSS selector is invalid, so review text is located with the selector
# that the single-product experiment cell in this notebook uses.
REVIEW_CONTENT_SELECTOR = '.item div:nth-child(3) .content'

Laz_shop_urls = []
for url in Laz_product_urls:
    driver.get(url)
    time.sleep(5)

    try:
        shop_url = driver.find_element(By.CSS_SELECTOR, 'div.seller-link a').get_attribute('href')
        Laz_shop_urls.append(shop_url)
    except NoSuchElementException:
        shop_url = ''
        print('NoSuchElementException')

    # Bring the review section into view before reading it.
    scroll_to_feedbacks(driver)

    # Get the feedback information, one review page at a time.
    while True:
        # find_elements returns [] when nothing matches, so it needs no try/except.
        content_elements = driver.find_elements(By.CSS_SELECTOR, REVIEW_CONTENT_SELECTOR)
        contents = [e.text for e in content_elements]

        containerStar_elements = driver.find_elements(By.CSS_SELECTOR, '.container-star.starCtn.left')
        rates = []
        for container in containerStar_elements:
            rates.append(count_rate_star(container))

        # The star containers define how many reviews are on the page; the text
        # column is padded or cut to that length so the DataFrame can be built.
        default_length = len(rates)
        contents = contents[:default_length] + ["0"] * (default_length - len(contents))

        df = pd.DataFrame({
                'Product_Url': [url] * default_length,
                'Content': contents,
                'Rate': rates
            })
        df_Laz_feedback = pd.concat([df_Laz_feedback, df], ignore_index = True)

        scroll_to_next_page(driver)
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, NEXT_REVIEW_PAGE_SELECTOR)
        except NoSuchElementException:
            # No pager on this product page: move on to the next product
            # instead of aborting the whole crawl.
            print("No more pages to scrape.")
            break

        if next_button.get_attribute('disabled'):
            print("No more pages to scrape.")
            break

        next_button.click()
        time.sleep(5)

In [51]:
####################################################################
# Experiment: scrape the reviews of one product page into df1.
df1 = pd.DataFrame(columns = ['Product_Url', 'Content', 'Rate'])
url = 'https://www.lazada.vn/products/set-ao-thun-cuu-quan-karo-ao-phong-from-rong-chat-vai-mat-thiet-ke-phong-cach-ca-tinh-leeda-i2325148429.html'
driver.maximize_window()
driver.get(url)
time.sleep(5)

try:
    shop_url = driver.find_element(By.CSS_SELECTOR, 'div.seller-link a').get_attribute('href')
    # Laz_shop_urls.append(shop_url)
except NoSuchElementException:
    shop_url = ''
    print('NoSuchElementException')

# Get the feedback information
driver.execute_script("window.scrollTo(0, 1550);")
time.sleep(5)

while True:
    # Full path of the review text, for reference:
    #module_product_review > div > div > div:nth-child(3) > div.mod-reviews > div:nth-child(3) > div.item-content > div.content
    content_elements = driver.find_elements(By.CSS_SELECTOR, '.item div:nth-child(3) .content')
    contents = [e.text for e in content_elements]

    containerStar_elements = driver.find_elements(By.CSS_SELECTOR, '.container-star.starCtn.left')
    # Store the star count of each review rather than the WebElement itself.
    rates = []
    for container in containerStar_elements:
        rates.append(count_rate_star(container))

    # Pad or cut the text column to the number of star containers.
    default_length = len(rates)
    contents = contents[:default_length] + ["0"] * (default_length - len(contents))

    df = pd.DataFrame({
            'Product_Url': [url] * default_length,
            'Content': contents,
            'Rate': rates
        })
    df1 = pd.concat([df1, df], ignore_index = True)

    driver.execute_script("window.scrollTo(0, 2280);")
    time.sleep(5)
    try:
        next_button = driver.find_element(By.CSS_SELECTOR, 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next')
    except NoSuchElementException:
        # Page without a review pager.
        print("No more pages to scrape.")
        break

    if next_button.get_attribute('disabled'):
        print("No more pages to scrape.")
        break

    next_button.click()
    time.sleep(5)

# The browser session is left open on purpose: the cells further down keep
# calling driver.get(), and the session is closed once after the Tiki scraping.

No more pages to scrape.


['sp m nhận được ko giống ảnh shop đăng lắm, áo mỏng nhẹ nhưng có vẻ nóng, quần cũng đc, shop mô tả phom rộng nhưng áo m mua sz L dài 62, quần dài 88, cao 1m62 nặng 50 kg mặc cộc, về tay 100k.',
 'Hoàn hảo cho vẻ ngoài giản dị và hợp thời trang, Thiết kế độc đáo và thời trang,',
 'Chất liệu thoáng mát và thoải mái,',
 'áo quá mỏng sản phẩm rất tốt',
 'Đồ đẹp, shop giao hàng nhanh, nhân viên nhiệt tình. Lần sau mình sẽ tiếp tục ủng hộ shop.']

In [57]:
len(Laz_shop_urls)

NameError: name 'Laz_shop_urls' is not defined

In [None]:
Laz_shop_urls = list(set(Laz_shop_urls))

Store the shop URLs in a text file as a backup

In [None]:
# Back up the shop URLs, one per line.
with open('Laz_shop_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in Laz_shop_urls)

Retrieve the list of shop URLs from the backup file

In [None]:
# Reload the backed-up shop URLs, one per line, with surrounding whitespace removed.
with open('Laz_shop_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Laz_shop_urls = links

Retrieve each shop's information from the shop URLs

In [None]:
# Both shop fields sit under the same header node; only the tail of the path differs.
LAZ_SHOP_HEADER = '#pi-component-container > div > div:nth-child(2) > div > div > div > div:nth-child(2) > div:nth-child(1) > div > div > div > div:nth-child(2)'
LAZ_SHOP_NAME_SELECTOR = LAZ_SHOP_HEADER + ' > div:nth-child(1) > h1'
LAZ_SHOP_FOLLOWERS_SELECTOR = LAZ_SHOP_HEADER + ' > div:nth-child(2) > div'

for url in Laz_shop_urls:
    driver.get(url)
    time.sleep(2)

    try:
        name = driver.find_element(By.CSS_SELECTOR, LAZ_SHOP_NAME_SELECTOR).text
        print(name)

        followers = driver.find_element(By.CSS_SELECTOR, LAZ_SHOP_FOLLOWERS_SELECTOR).text
        print(followers)
    except NoSuchElementException:
        # A shop page missing either field is skipped.
        print('NoSuchElementException')
        continue

    shop_row = pd.DataFrame([[url, name, followers]], columns = ['Url', 'Name', 'Followers'])
    df_Laz_shop = pd.concat([df_Laz_shop, shop_row], ignore_index = True)
    print(df_Laz_shop[df_Laz_shop['Url'] == url])

In [None]:
df_Laz_shop

Store the shop dataframe in a csv file as a backup

In [None]:
df_Laz_shop.to_csv('lazada_shops.csv', index=False)

Retrieve the shop dataframe from the backup file

In [None]:
df_Laz_shop = pd.read_csv('lazada_shops.csv')
df_Laz_shop

## Shein website

In [None]:
df_Shein_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Shop_url'])
df_Shein_shop = pd.DataFrame(columns = ['Url', 'Name', 'Followers'])

In [None]:
# Home page of the Shein website, kept for reference:
#url = 'https://www.shein.com.vn/?_gl=1*1gloy8n*_up*MQ..&gclid=EAIaIQobChMIgbqoh-mDiAMV9tIWBR3KgQ6KEAAYASAAEgKgHPD_BwE'

# Search-results page obtained after submitting 'Quần áo nữ' in the search input.
url = 'https://www.shein.com.vn/pdsearch/Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF/?ici=s1`EditSearch`Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF`_fb`d0`PageSearchResult&search_source=1&search_type=all&source=search&src_identifier=st%3D2%60sc%3DQu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF%60sr%3D0%60ps%3D1&src_identifier_pre_search=&src_module=search&src_tab_page_id=page_search1724166755052'
driver.get(url)
time.sleep(5)

dr_products = driver.find_elements(By.CSS_SELECTOR, 'div.product-card__goods-title-container a.goods-title-link')
print(len(dr_products))

# Read every product link, echo it, then drop duplicates (order is not kept).
product_urls = [product.get_attribute('href') for product in dr_products]
for url in product_urls:
    print(url)

product_urls = list(set(product_urls))

In [None]:
# Back up the Shein product URLs, one per line.
with open('Shein_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

In [None]:
# Reload the backed-up Shein product URLs, one per line, with surrounding whitespace removed.
with open('Shein_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Shein_product_urls = links

In [None]:
Shein_shop_urls = []
for url in Shein_product_urls:
    driver.get(url)
    time.sleep(2)

    try:
        name = driver.find_element(By.CSS_SELECTOR, 'h1.product-intro__head-name').text
    except NoSuchElementException:
        # Skip pages without a product title instead of aborting the crawl.
        print('NoSuchElementException')
        continue
    print(name)

    # find_element raises when nothing matches (it never returns a falsy value),
    # so the presence check uses find_elements, which returns an empty list.
    price_elements = driver.find_elements(By.CSS_SELECTOR, 'div.ProductIntroHeadPrice__head-mainprice .original span')
    price = price_elements[0].text if price_elements else 0
    print(price)

    # TODO: shop URL extraction is not implemented yet, so Shein_shop_urls stays
    # empty. An earlier attempt checked '.shop-entry__contentBox', clicked
    # '.product-intro__brand-head' and read the href of 'div.seller-link a'.

# The browser session is left open on purpose: the cells further down keep
# calling driver.get(), and the session is closed once after the Tiki scraping.

In [None]:
# /html/body/div[1]/div[1]/div/div[1]/div/div[2]/div[2]/div/div[5]/div[3]/div[1]/span

In [None]:


# Product page used to try out the Shein selectors.
url = "https://www.shein.com.vn/SHEIN-ICON-Women-s-Summer-Leopard-Print-Slim-Fit-Cropped-Camisole-Fashion-Cami-Top-p-36628205.html?src_identifier=st%3D2%60sc%3DQu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF%60sr%3D0%60ps%3D1"

driver.get(url)
time.sleep(2)  # give the page time to load


def _print_optional_text(css_selector, label):
    """Print and return the matched element's text; print a notice and return "N/A" when it is absent."""
    try:
        value = driver.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        print(f"{label} not found")
        return "N/A"
    print(f"{label}: {value}")
    return value


try:
    # Name and price are required: if either is missing the probe stops here.
    product_name = driver.find_element(By.CSS_SELECTOR, 'h1.product-intro__head-name').text
    print(f"Product Name: {product_name}")

    price = driver.find_element(By.CSS_SELECTOR, 'div.ProductIntroHeadPrice__head-mainprice .original span').text
    print(f"Price: {price}")

    # Shop name and follower count are optional.
    shop_name = _print_optional_text('a.store-name', "Shop Name")
    followers = _print_optional_text('span.follow-number', "Followers")

except NoSuchElementException as e:
    print(f"An error occurred: {e}")


## Tiki website

In [None]:
df_Tiki_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Shop_url'])
df_Tiki_shop = pd.DataFrame(columns = ['Url', 'Name', 'Followers'])

In [None]:
# Home page of the Tiki website, kept for reference:
#url = 'https://tiki.vn/'

# Search-results page obtained after submitting 'Quần áo nữ' in the search input.
url = 'https://tiki.vn/search?q=Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF'
driver.get(url)
time.sleep(5)

# Number of result pages to walk through.
TIKI_RESULT_PAGES = 3

product_urls = []

for i in range(TIKI_RESULT_PAGES):
    time.sleep(2)

    dr_products = driver.find_elements(By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')
    print(len(dr_products))

    for product in dr_products:
        url = product.get_attribute('href')
        product_urls.append(url)
        print(url)

    product_urls = list(set(product_urls))

    if i == TIKI_RESULT_PAGES - 1:
        # Last requested page has been read; no need to load another one.
        break

    # Look the link up again on every page: a reference obtained on an earlier
    # page becomes stale once the page content is replaced.
    try:
        button_nextPage = driver.find_element(By.CSS_SELECTOR, '.pagination-block div:nth-child(3) a')
    except NoSuchElementException:
        print('No next-page link found; stopping early.')
        break
    button_nextPage.click()

In [None]:
# Back up the Tiki product URLs, one per line.
with open('Tiki_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

In [None]:
# Reload the backed-up Tiki product URLs, one per line, with surrounding whitespace removed.
with open('Tiki_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Tiki_product_urls = links

In [None]:
Tiki_shop_urls = []
for url in Tiki_product_urls:
    driver.get(url)
    time.sleep(2)

    try:
        name = driver.find_element(By.CSS_SELECTOR, 'h1.Title__TitledStyled-sc-c64ni5-0').text
    except NoSuchElementException:
        # A page without a product title is skipped.
        print('NoSuchElementException')
        continue

    # find_element raises when nothing matches (it never returns a falsy value),
    # so the optional fields are probed with find_elements, which returns [].
    price_elements = driver.find_elements(By.CSS_SELECTOR, '.product-price__current-price')
    price = price_elements[0].text if price_elements else 0

    shop_links = driver.find_elements(By.CSS_SELECTOR, 'div.SellerHeader__SellerHeaderStyled-sc-la7c6v-0 a')
    if shop_links:
        shop_url = shop_links[0].get_attribute('href')
        Tiki_shop_urls.append(shop_url)
    else:
        shop_url = ''

    product_row = pd.DataFrame([[url, name, price, shop_url]], columns = ['Url', 'Name', 'Price', 'Shop_url'])
    df_Tiki_product = pd.concat([df_Tiki_product, product_row], ignore_index = True)
    print(df_Tiki_product[df_Tiki_product['Url'] == url])

In [None]:
df_Tiki_product

In [None]:
df_Tiki_product.to_csv('tiki_products.csv', index=False)

In [None]:
df_Tiki_product = pd.read_csv('tiki_products.csv')
df_Tiki_product

In [None]:
Tiki_shop_urls = list(set(Tiki_shop_urls))

In [None]:
# Back up the Tiki shop URLs, one per line.
with open('Tiki_shop_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in Tiki_shop_urls)

In [None]:
# Reload the backed-up Tiki shop URLs, one per line, with surrounding whitespace removed.
with open('Tiki_shop_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Tiki_shop_urls = links

In [None]:
TIKI_SHOP_NAME_SELECTOR = 'h1.Header__SellerName-sc-vgcg69-2'
TIKI_SHOP_FOLLOWERS_SELECTOR = '#seller-info-wrapper > div.Header__SellerNameWrapper-sc-vgcg69-5.jZYoaR > div.Header__BadgeWrapper-sc-vgcg69-11.czboZb > div:nth-child(3) > div.Header__SubInfo-sc-vgcg69-12.iUmIsA'

for url in Tiki_shop_urls:
    driver.get(url)
    time.sleep(2)

    try:
        name = driver.find_element(By.CSS_SELECTOR, TIKI_SHOP_NAME_SELECTOR).text
        followers = driver.find_element(By.CSS_SELECTOR, TIKI_SHOP_FOLLOWERS_SELECTOR).text
    except NoSuchElementException:
        # A shop page missing either field is skipped.
        print('NoSuchElementException')
        continue

    shop_row = pd.DataFrame([[url, name, followers]], columns = ['Url', 'Name', 'Followers'])
    df_Tiki_shop = pd.concat([df_Tiki_shop, shop_row], ignore_index = True)
    print(df_Tiki_shop[df_Tiki_shop['Url'] == url])

In [None]:
df_Tiki_shop

In [None]:
df_Tiki_shop.to_csv('tiki_shops.csv', index=False)

In [None]:
df_Tiki_shop = pd.read_csv('tiki_shops.csv')
df_Tiki_shop

In [None]:
driver.quit()

# Pre-processing data

## Lazada

In [None]:
df_Laz_product = pd.read_csv('lazada_products.csv')

In [None]:
df_Laz_product

In [None]:
df_Laz_product.info()

Convert column `['Price']` to float

In [None]:
# Prices are scraped as text such as "₫39,000". The currency sign and the
# thousands separators are removed literally (regex=False, so '.' is not treated
# as a wildcard); anything that still is not a number becomes NaN.
# The dtype guard makes the cell safe to run more than once.
if not pd.api.types.is_numeric_dtype(df_Laz_product['Price']):
    price_text = df_Laz_product['Price'].str.replace('₫', '', regex=False)
    for separator in (',', '.'):
        price_text = price_text.str.replace(separator, '', regex=False)
    df_Laz_product['Price'] = pd.to_numeric(price_text.str.strip(), errors='coerce').astype(float)
# df_Laz_product['Price']

Convert column `['NumRate']` to int

In [None]:
# Rating counts are scraped as text such as "(1370)", or "0" when absent.
# The brackets are removed literally (regex=False, since a bare '(' is not a
# valid regular expression); values that are missing or not numeric count as 0.
# The dtype guard makes the cell safe to run more than once.
if not pd.api.types.is_numeric_dtype(df_Laz_product['NumRate']):
    rate_text = df_Laz_product['NumRate']
    for bracket in ('(', ')'):
        rate_text = rate_text.str.replace(bracket, '', regex=False)
    df_Laz_product['NumRate'] = (
        pd.to_numeric(rate_text.str.strip(), errors='coerce').fillna(0).astype(int)
    )
df_Laz_product['NumRate']

Run this cell if the nltk library is not installed yet

In [None]:
# !pip install nltk

In [None]:
# Maps each product category to the keywords (single words or two-word phrases)
# that identify it. classify_product walks this dict in insertion order and
# returns the first category with a matching keyword, so the order of the
# entries is significant: "đồ lót" must stay ahead of "váy/quần" and "áo" for
# "quần lót" / "áo lót" to be classified as underwear.
# NOTE(review): "váy ngủ" under "đầm" can never win, because any name containing
# it also contains the word "váy", which matches "váy/quần" earlier — confirm
# whether "đầm" should be listed before "váy/quần".
categories_keywords = {
    "đồ bộ": ["đồ bộ", "set", "bộ", "quần áo", "sét"],
    "đồ lót": ["đồ lót", "underwear", "quần lót", "áo lót", "áo ngực", "bra", "panty", "boxer", "quần chíp"],
    "váy/quần": ["váy", "quần", "skirt", "pants", "jeans"],
    "áo": ["áo", "t-shirt", "shirt", "croptop", "yếm", "khoác", "áo khoác", "vest", "blazer"],
    "đầm": ["đầm", "dress", "váy ngủ"], 
    "nón": ["nón", "mũ", "hat", "helmet"],
    "vớ": ["vớ", "tất", "socks"],
    "giày/dép": ["giày", "dép", "bata", "cao gót", "shoes", "slipper", "guốc", "boots", "xăng đan", "sandals", "sneakers"],
    "trang sức": ["trang sức", "nhẫn", "vòng", "dây chuyền", "earrings", "necklace"]
}

Classify products using n-gram techniques

In [None]:
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def generate_ngrams(string, n):
    """Return the lower-cased word n-grams of ``string``, in order of appearance.

    The text is NFC-normalized and tokenized with ``word_tokenize``; each n-gram
    is ``n`` consecutive tokens joined by a single space. The result is empty
    when there are fewer than ``n`` tokens.
    """
    normalized = unicodedata.normalize('NFC', string)
    words = [word.lower() for word in word_tokenize(normalized, language='english')]
    windows = (words[start:start + n] for start in range(len(words) - n + 1))
    return [' '.join(window) for window in windows]
    
def classify_product(name, categories_keywords):
    """Return the first category whose keyword list matches the product name.

    A keyword matches when it equals one of the name's 2-grams or 1-grams as
    produced by ``generate_ngrams``. Categories are tried in the mapping's
    iteration order; "khác" is returned when none matches.
    """
    grams = set(generate_ngrams(name, 2))
    grams.update(generate_ngrams(name, 1))

    for category, keywords in categories_keywords.items():
        if any(keyword in grams for keyword in keywords):
            return category

    return "khác"

Add a column `['Category']` to the `df_Laz_product` DataFrame using the `classify_product` function provided above

In [None]:
df_Laz_product['Category'] = df_Laz_product['Name'].apply(lambda name: classify_product(name, categories_keywords))

Re-check the new column

In [None]:
Laz_category_counts = df_Laz_product['Category'].value_counts()
print(Laz_category_counts)

Identify the outliers and manually replace them with the correct values

In [None]:
df_Laz_product[df_Laz_product['Category'] == 'khác']

In [None]:
df_Laz_product.loc[50, 'Category'] = 'đồ bộ'

Check if the dataframe has NaN value

In [None]:
Laz_nan_summary = df_Laz_product.isna().sum()
print(Laz_nan_summary)

## Shein

## Tiki

In [None]:
df_Tiki_product = pd.read_csv('tiki_products.csv')

In [None]:
df_Tiki_product.head()

In [None]:
df_Tiki_product.info()

Convert column `['Price']` to float

In [None]:
# Remove the thousands separator and the currency sign literally (regex=False,
# so '.' is not treated as a wildcard that matches every character); anything
# that still is not a number becomes NaN.
# The dtype guard makes the cell safe to run more than once.
if not pd.api.types.is_numeric_dtype(df_Tiki_product['Price']):
    price_text = df_Tiki_product['Price']
    for symbol in ('.', '₫'):
        price_text = price_text.str.replace(symbol, '', regex=False)
    df_Tiki_product['Price'] = pd.to_numeric(price_text.str.strip(), errors='coerce').astype(float)
df_Tiki_product['Price']

Add a column `['Category']` to the `df_Tiki_product` DataFrame using the `classify_product` function provided above

In [None]:
df_Tiki_product['Category'] = df_Tiki_product['Name'].apply(lambda name: classify_product(name, categories_keywords))

Re-check the new column

In [None]:
Tiki_category_counts = df_Tiki_product['Category'].value_counts()
print(Tiki_category_counts)

Identify the outliers and manually replace them with the correct values

In [None]:
df_Tiki_product[df_Tiki_product['Category'] == 'khác']

In [None]:
# Manual corrections: set the category of these row labels to 'đồ bộ'.
for row_label in (3, 29, 76, 83, 103):
    df_Tiki_product.loc[row_label, 'Category'] = 'đồ bộ'

# Analysis on each e-market platform

## Lazada

In [None]:
df_Laz_product.info()

In [None]:
df_Laz_product.describe()

In [None]:
Laz_category_counts = df_Laz_product['Category'].value_counts()
print(Laz_category_counts)

In [None]:
# Bar chart: number of Lazada products per category.
fig, ax = plt.subplots(figsize = (10, 6))
sns.barplot(x = Laz_category_counts.index, y = Laz_category_counts.values, palette = 'pastel', ax = ax)
ax.set_title('Phân bố các giá trị trong cột Category', fontsize = 16)
ax.set_xlabel('Loại', fontsize = 14)
ax.set_ylabel('Số lượng', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

In [None]:
df_Laz_product.to_csv('lazada_products_.csv', index=False)

In [None]:
Laz_avgPrice_per_category = df_Laz_product.groupby('Category')['Price'].mean().round()

In [None]:
print(Laz_avgPrice_per_category)

In [None]:
# Bar chart: average Lazada price per category.
fig, ax = plt.subplots(figsize = (10, 6))
sns.barplot(x = Laz_avgPrice_per_category.index, y = Laz_avgPrice_per_category.values, palette = 'pastel', ax = ax)
ax.set_title('Giá tiền trung bình mỗi loại', fontsize = 16)
ax.set_xlabel('Loại', fontsize = 14)
ax.set_ylabel('Giá trung bình', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

## Shein

## Tiki

In [None]:
df_Tiki_product.info()

In [None]:
df_Tiki_product.describe()

In [None]:
Tiki_nan_summary = df_Tiki_product.isna().sum()
print(Tiki_nan_summary)

In [None]:
Tiki_category_counts = df_Tiki_product['Category'].value_counts()
print(Tiki_category_counts)

In [None]:
# Bar chart: number of Tiki products per category.
fig, ax = plt.subplots(figsize = (10, 6))
sns.barplot(x = Tiki_category_counts.index, y = Tiki_category_counts.values, palette = 'pastel', ax = ax)
ax.set_title('Phân bố các giá trị trong cột Category', fontsize = 16)
ax.set_xlabel('Loại', fontsize = 14)
ax.set_ylabel('Số lượng', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

In [None]:
df_Tiki_product.to_csv('tiki_products_.csv', index=False)

In [None]:
Tiki_avgPrice_per_category = df_Tiki_product.groupby('Category')['Price'].mean().round()

In [None]:
print(Tiki_avgPrice_per_category)

In [None]:
# Bar chart: average Tiki price per category.
fig, ax = plt.subplots(figsize = (10, 6))
sns.barplot(x = Tiki_avgPrice_per_category.index, y = Tiki_avgPrice_per_category.values, palette = 'pastel', ax = ax)
ax.set_title('Giá tiền trung bình mỗi loại', fontsize = 16)
ax.set_xlabel('Loại', fontsize = 14)
ax.set_ylabel('Giá trung bình', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

# Comprehensive analysis across all three datasets