In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Run this cell if the pymssql library is not installed on this computer yet.

In [None]:
# !pip install pymssql

In [None]:
import pymssql

import os

# Connection settings are read from environment variables so that host names and
# credentials do not have to be edited into (and committed with) the notebook.
# The defaults reproduce the previously hard-coded values, so the cell behaves
# exactly as before when no variable is set.
conn = pymssql.connect(
    server=os.environ.get('E_MARKET_DB_SERVER', 'THANH'),
    user=os.environ.get('E_MARKET_DB_USER', ''),
    password=os.environ.get('E_MARKET_DB_PASSWORD', ''),
    database=os.environ.get('E_MARKET_DB_NAME', 'E_Market')
)
# Shared cursor used by the schema-creation cell that follows.
cursor = conn.cursor()

In [None]:
# Create the Users, Products and Feedbacks tables if they do not exist yet.
# Every free-text column is NVARCHAR: shop names and feedback text scraped from
# Vietnamese sites contain characters that a plain VARCHAR column can silently
# degrade (Products.name already used NVARCHAR; the other two now match it).
# Note: the IF OBJECT_ID guards mean tables that already exist are left as-is.
cursor.execute('''
    IF OBJECT_ID('Users', 'U') IS NULL
    CREATE TABLE Users (
        id INTEGER PRIMARY KEY,
        name NVARCHAR(255),
        follower INTEGER
    );

    IF OBJECT_ID('Products', 'U') IS NULL
    CREATE TABLE Products (
        id INTEGER PRIMARY KEY,
        name NVARCHAR(255),
        id_shop INTEGER,
        price FLOAT,
        FOREIGN KEY (id_shop) REFERENCES Users(id)
    );

    IF OBJECT_ID('Feedbacks', 'U') IS NULL
    CREATE TABLE Feedbacks (
        id INTEGER PRIMARY KEY,
        id_product INTEGER,
        rate INTEGER,
        content NVARCHAR(255),
        FOREIGN KEY (id_product) REFERENCES Products(id)
    );
''')

conn.commit()

# Setup Selenium and Web Driver

Run this cell if the selenium and webdriver-manager libraries are not installed on this computer yet.

In [None]:
# !pip install selenium
# !pip install webdriver-manager

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import time

# Resolve a ChromeDriver executable through webdriver-manager and start a Chrome
# session with it. `driver` is the single browser handle shared by all scraping
# cells below.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Scrape Data from Websites

## Lazada website

In [None]:
# Empty result tables for the Lazada scrape (one row per product / per shop).
_laz_product_columns = ['Url', 'Name', 'Price', 'Shop_url']
_laz_shop_columns = ['Url', 'Name', 'Followers']

df_Laz_product = pd.DataFrame(columns=_laz_product_columns)
df_Laz_shop = pd.DataFrame(columns=_laz_shop_columns)

In [None]:
# Open the Lazada home page and search for "Quần áo nữ" (women's clothing).
url = 'https://www.lazada.vn/#?'
driver.get(url)
search_box = driver.find_element(By.CSS_SELECTOR, "input.search-box__input--O34g")
search_box.send_keys("Quần áo nữ")
search_button = driver.find_element(By.CSS_SELECTOR, "a.search-box__button--1oH7")
search_button.click()

# Number of result pages to walk through.
n_pages = 3
# A set removes duplicate links as they are collected.
collected_urls = set()

for page in range(n_pages):
    time.sleep(2)  # crude wait for the result list to render

    dr_products = driver.find_elements(By.CSS_SELECTOR, 'div[data-tracking="product-card"] a')
    print(len(dr_products))

    for product in dr_products:
        href = product.get_attribute('href')
        # Anchors without an href yield None, which would break the later file write.
        if href:
            collected_urls.add(href)
            print(href)

    # Nothing to click after the last page we want.
    if page == n_pages - 1:
        break

    # Look the button up again on every page: a reference obtained before the
    # results (re-)rendered may be missing or detached from the DOM by now.
    try:
        driver.find_element(By.CSS_SELECTOR, 'li[title="Next Page"] button').click()
    except NoSuchElementException:
        print('Next Page button not found - stopping pagination early')
        break

# Sorted so the saved URL file is reproducible between runs.
product_urls = sorted(collected_urls)

In [None]:
print(len(product_urls))
for url in product_urls:
    print(url)

In [None]:
# Persist the collected product links, one URL per line.
with open('Laz_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

In [None]:
# Reload the saved product links, stripping the newline from each line.
with open('Laz_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Laz_product_urls = links

In [None]:
# Visit every Lazada product page and record its name, price and seller link.
# Re-running this cell rebuilds df_Laz_product from scratch instead of appending
# duplicate rows to it.
Laz_shop_urls = []
laz_product_rows = []

for url in Laz_product_urls:
    driver.get(url)
    time.sleep(2)  # crude wait for the product page to render

    try:
        # The title is mandatory: find_element raises when it is missing and
        # the product is skipped.
        name = driver.find_element(By.CSS_SELECTOR, 'h1.pdp-mod-product-badge-title').text
    except NoSuchElementException:
        print('NoSuchElementException')
        continue

    # find_elements returns an empty list instead of raising, so the fallback
    # values below are actually reachable (find_element never returns a falsy
    # value - it raises NoSuchElementException).
    price_nodes = driver.find_elements(By.CSS_SELECTOR, 'span.pdp-price')
    price = price_nodes[0].text if price_nodes else 0

    seller_links = driver.find_elements(By.CSS_SELECTOR, 'div.seller-link a')
    if seller_links:
        shop_url = seller_links[0].get_attribute('href')
        Laz_shop_urls.append(shop_url)
    else:
        shop_url = ''

    laz_product_rows.append([url, name, price, shop_url])
    print(laz_product_rows[-1])

# Build the frame once; concatenating row by row inside the loop is quadratic.
df_Laz_product = pd.DataFrame(laz_product_rows, columns=['Url', 'Name', 'Price', 'Shop_url'])

In [None]:
df_Laz_product

In [None]:
df_Laz_product.to_csv('lazada_products.csv', index=False)

In [None]:
df_Laz_product = pd.read_csv('lazada_products.csv')
df_Laz_product

In [None]:
Laz_shop_urls = list(set(Laz_shop_urls))

In [None]:
# Persist the de-duplicated shop links, one URL per line.
with open('Laz_shop_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in Laz_shop_urls)

In [None]:
# Reload the saved shop links, stripping the newline from each line.
with open('Laz_shop_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Laz_shop_urls = links

In [None]:
# Example shop page, handy for testing the selectors by hand:
# driver.get('https://www.lazada.vn/shop/an-store-chuyen-bo-me-va-be/?itemId=2635424282&channelSource=pdp')

# Both the shop name and the follower count live under the same header
# container; only the last two path steps differ.
_laz_shop_header = '#pi-component-container > div > div:nth-child(2) > div > div > div > div:nth-child(2) > div:nth-child(1) > div > div > div > div:nth-child(2)'
_laz_shop_name_css = _laz_shop_header + ' > div:nth-child(1) > h1'
_laz_shop_followers_css = _laz_shop_header + ' > div:nth-child(2) > div'

# Re-running this cell rebuilds df_Laz_shop instead of appending duplicates.
laz_shop_rows = []

for url in Laz_shop_urls:
    driver.get(url)
    time.sleep(2)  # crude wait for the shop page to render

    try:
        name = driver.find_element(By.CSS_SELECTOR, _laz_shop_name_css).text
        print(name)

        followers = driver.find_element(By.CSS_SELECTOR, _laz_shop_followers_css).text
        print(followers)
    except NoSuchElementException:
        # Shops whose header does not match the selectors are skipped.
        print('NoSuchElementException')
        continue

    laz_shop_rows.append([url, name, followers])

# Build the frame once; concatenating row by row inside the loop is quadratic.
df_Laz_shop = pd.DataFrame(laz_shop_rows, columns=['Url', 'Name', 'Followers'])

In [None]:
df_Laz_shop

In [None]:
df_Laz_shop.to_csv('lazada_shops.csv', index=False)

In [None]:
df_Laz_shop = pd.read_csv('lazada_shops.csv')
df_Laz_shop

## Shein website

In [None]:
# Empty result tables for the Shein scrape (one row per product / per shop).
_shein_product_columns = ['Url', 'Name', 'Price', 'Shop_url']
_shein_shop_columns = ['Url', 'Name', 'Followers']

df_Shein_product = pd.DataFrame(columns=_shein_product_columns)
df_Shein_shop = pd.DataFrame(columns=_shein_shop_columns)

In [None]:
# Original link to the Shein home page, kept for reference:
# url = 'https://www.shein.com.vn/?_gl=1*1gloy8n*_up*MQ..&gclid=EAIaIQobChMIgbqoh-mDiAMV9tIWBR3KgQ6KEAAYASAAEgKgHPD_BwE'

# Shein search-result page obtained after typing "Quần áo nữ" into the search box.
url = 'https://www.shein.com.vn/pdsearch/Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF/?ici=s1`EditSearch`Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF`_fb`d0`PageSearchResult&search_source=1&search_type=all&source=search&src_identifier=st%3D2%60sc%3DQu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF%60sr%3D0%60ps%3D1&src_identifier_pre_search=&src_module=search&src_tab_page_id=page_search1724166755052'
driver.get(url)
time.sleep(5)  # crude wait for the result grid to render

dr_products = driver.find_elements(By.CSS_SELECTOR, 'div.product-card__goods-title-container a.goods-title-link')
print(len(dr_products))

# A set removes duplicate links as they are collected.
collected_urls = set()
for product in dr_products:
    href = product.get_attribute('href')
    # Anchors without an href yield None, which would break the later file write.
    if href:
        collected_urls.add(href)
        print(href)

# Sorted so the saved URL file is reproducible between runs.
product_urls = sorted(collected_urls)

In [None]:
# Persist the collected Shein product links, one URL per line.
with open('Shein_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

In [None]:
# Reload the saved Shein product links, stripping the newline from each line.
with open('Shein_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Shein_product_urls = links

In [None]:
# Visit every Shein product page and print its name and price.
# Shop-URL extraction is not implemented yet (see the TODO below), so
# Shein_shop_urls stays empty for now.
Shein_shop_urls = []
for url in Shein_product_urls:
    driver.get(url)
    time.sleep(2)  # crude wait for the product page to render

    try:
        # The title is mandatory: skip the page when it cannot be found,
        # matching the behaviour of the Lazada and Tiki loops.
        name = driver.find_element(By.CSS_SELECTOR, 'h1.product-intro__head-name').text
    except NoSuchElementException:
        print('NoSuchElementException')
        continue
    print(name)

    # find_elements returns an empty list instead of raising, so the price
    # fallback is actually reachable.
    price_nodes = driver.find_elements(By.CSS_SELECTOR, 'div.ProductIntroHeadPrice__head-mainprice .original span')
    price = price_nodes[0].text if price_nodes else 0
    print(price)

    # TODO: extract the shop URL. Earlier attempt kept for reference:
    # shop_content_box = driver.find_element(By. CSS_SELECTOR, '.shop-entry__contentBox').get_attribute('style')
    # if driver.find_element(By. CSS_SELECTOR, '.shop-entry__contentBox').get_attribute('display') == 'display: none;':
    #     driver.find_element(By.CSS_SELECTOR, '.product-intro__brand-head').click()
    #     time.sleep(2)
    #     shop_url = driver.find_element(By.CSS_SELECTOR, 'div.seller-link a').get_attribute('href')
    #     Shein_shop_urls.append(shop_url)
    # else:
    #     shop_url = ''
    # print(shop_url)

# The browser is deliberately NOT closed here: the cells below still use
# `driver`, and it is shut down by the driver.quit() cell after the Tiki section.
# (The previous `drive.quit()` line was a typo that raised NameError.)

In [None]:


# Probe a single Shein product page to check the selectors by hand.
url = "https://www.shein.com.vn/SHEIN-ICON-Women-s-Summer-Leopard-Print-Slim-Fit-Cropped-Camisole-Fashion-Cami-Top-p-36628205.html?src_identifier=st%3D2%60sc%3DQu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF%60sr%3D0%60ps%3D1"

driver.get(url)
time.sleep(2)  # let the page load


def _optional_text(selector, label):
    """Print and return the text of an optional element, or "N/A" when it is absent."""
    try:
        value = driver.find_element(By.CSS_SELECTOR, selector).text
    except NoSuchElementException:
        print(f"{label} not found")
        return "N/A"
    print(f"{label}: {value}")
    return value


try:
    # Name and price are required; a miss on either is reported by the handler below.
    product_name = driver.find_element(By.CSS_SELECTOR, 'h1.product-intro__head-name').text
    print(f"Product Name: {product_name}")

    price = driver.find_element(By.CSS_SELECTOR, 'div.ProductIntroHeadPrice__head-mainprice .original span').text
    print(f"Price: {price}")

    # Shop name and follower count are optional on a product page.
    shop_name = _optional_text('a.store-name', "Shop Name")
    followers = _optional_text('span.follow-number', "Followers")
except NoSuchElementException as e:
    print(f"An error occurred: {e}")


## Tiki website

In [None]:
# Empty result tables for the Tiki scrape (one row per product / per shop).
_tiki_product_columns = ['Url', 'Name', 'Price', 'Shop_url']
_tiki_shop_columns = ['Url', 'Name', 'Followers']

df_Tiki_product = pd.DataFrame(columns=_tiki_product_columns)
df_Tiki_shop = pd.DataFrame(columns=_tiki_shop_columns)

In [None]:
# Original link to the Tiki home page, kept for reference:
# url = 'https://tiki.vn/'

# Tiki search-result page obtained after typing "Quần áo nữ" into the search box.
url = 'https://tiki.vn/search?q=Qu%E1%BA%A7n%20%C3%A1o%20n%E1%BB%AF'
driver.get(url)
time.sleep(5)  # crude wait for the first result page to render

# Number of result pages to walk through.
n_pages = 3
# A set removes duplicate links as they are collected.
collected_urls = set()

for page in range(n_pages):
    time.sleep(2)  # crude wait for the result list to render

    dr_products = driver.find_elements(By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')
    print(len(dr_products))

    for product in dr_products:
        href = product.get_attribute('href')
        # Anchors without an href yield None, which would break the later file write.
        if href:
            collected_urls.add(href)
            print(href)

    # Nothing to click after the last page we want.
    if page == n_pages - 1:
        break

    # Look the link up again on every page: a reference obtained on an earlier
    # page may be detached from the DOM once the next page has loaded.
    try:
        driver.find_element(By.CSS_SELECTOR, '.pagination-block div:nth-child(3) a').click()
    except NoSuchElementException:
        print('Next-page link not found - stopping pagination early')
        break

# Sorted so the saved URL file is reproducible between runs.
product_urls = sorted(collected_urls)

In [None]:
# Persist the collected Tiki product links, one URL per line.
with open('Tiki_product_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in product_urls)

In [None]:
# Reload the saved Tiki product links, stripping the newline from each line.
with open('Tiki_product_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Tiki_product_urls = links

In [None]:
# Visit every Tiki product page and record its name, price and seller link.
# Re-running this cell rebuilds df_Tiki_product from scratch instead of
# appending duplicate rows to it.
Tiki_shop_urls = []
tiki_product_rows = []

for url in Tiki_product_urls:
    driver.get(url)
    time.sleep(2)  # crude wait for the product page to render

    try:
        # The title is mandatory: find_element raises when it is missing and
        # the product is skipped.
        name = driver.find_element(By.CSS_SELECTOR, 'h1.Title__TitledStyled-sc-c64ni5-0').text
    except NoSuchElementException:
        print('NoSuchElementException')
        continue

    # find_elements returns an empty list instead of raising, so the price
    # fallback is actually reachable (the seller lookup already worked this way).
    price_nodes = driver.find_elements(By.CSS_SELECTOR, '.product-price__current-price')
    price = price_nodes[0].text if price_nodes else 0

    seller_links = driver.find_elements(By.CSS_SELECTOR, 'div.SellerHeader__SellerHeaderStyled-sc-la7c6v-0 a')
    if seller_links:
        shop_url = seller_links[0].get_attribute('href')
        Tiki_shop_urls.append(shop_url)
    else:
        shop_url = ''

    tiki_product_rows.append([url, name, price, shop_url])
    print(tiki_product_rows[-1])

# Build the frame once; concatenating row by row inside the loop is quadratic.
df_Tiki_product = pd.DataFrame(tiki_product_rows, columns=['Url', 'Name', 'Price', 'Shop_url'])

In [None]:
df_Tiki_product

In [None]:
df_Tiki_product.to_csv('tiki_products.csv', index=False)

In [None]:
df_Tiki_product = pd.read_csv('tiki_products.csv')
df_Tiki_product

In [None]:
Tiki_shop_urls = list(set(Tiki_shop_urls))

In [None]:
# Persist the de-duplicated Tiki shop links, one URL per line.
with open('Tiki_shop_urls.txt', mode='w') as file:
    file.writelines(row + "\n" for row in Tiki_shop_urls)

In [None]:
# Reload the saved Tiki shop links, stripping the newline from each line.
with open('Tiki_shop_urls.txt', mode='r') as file:
    links = [line.strip() for line in file]

In [None]:
Tiki_shop_urls = links

In [None]:
# Visit every Tiki shop page and record its name and follower text.
# Re-running this cell rebuilds df_Tiki_shop instead of appending duplicates.
tiki_shop_rows = []

for url in Tiki_shop_urls:
    driver.get(url)
    time.sleep(2)  # crude wait for the shop page to render

    try:
        name = driver.find_element(By.CSS_SELECTOR, 'h1.Header__SellerName-sc-vgcg69-2').text
        followers = driver.find_element(By.CSS_SELECTOR, '#seller-info-wrapper > div.Header__SellerNameWrapper-sc-vgcg69-5.jZYoaR > div.Header__BadgeWrapper-sc-vgcg69-11.czboZb > div:nth-child(3) > div.Header__SubInfo-sc-vgcg69-12.iUmIsA').text
    except NoSuchElementException:
        # Shops whose header does not match the selectors are skipped.
        print('NoSuchElementException')
        continue

    tiki_shop_rows.append([url, name, followers])
    print(tiki_shop_rows[-1])

# Build the frame once; concatenating row by row inside the loop is quadratic.
df_Tiki_shop = pd.DataFrame(tiki_shop_rows, columns=['Url', 'Name', 'Followers'])

In [None]:
df_Tiki_shop

In [None]:
df_Tiki_shop.to_csv('tiki_shops.csv', index=False)

In [None]:
df_Tiki_shop = pd.read_csv('tiki_shops.csv')
df_Tiki_shop

In [None]:
driver.quit()

# Pre-processing data

## Lazada

In [None]:
df_Laz_product = pd.read_csv('lazada_products.csv')

In [None]:
df_Laz_product.info()

In [None]:
# Convert Lazada price text such as "123.000 ₫" into a float.
# - regex=False makes both replacements literal on every pandas version
#   (as a regular expression, '.' would match any character).
# - The dtype guard keeps the cell re-runnable: once the column is numeric
#   (after a first run, or when the CSV held only numbers) the .str accessor
#   would raise, so the text clean-up is skipped.
price_col = df_Laz_product['Price']
if not pd.api.types.is_numeric_dtype(price_col):
    price_col = (
        price_col
        .str.replace('.', '', regex=False)   # thousands separator
        .str.replace(' ₫', '', regex=False)  # currency suffix
    )
df_Laz_product['Price'] = price_col.astype(float)
df_Laz_product['Price']

Run this cell if the nltk library is not installed on this computer yet.

In [None]:
# !pip install nltk

In [None]:
# Category label -> keywords (single words or two-word phrases) that assign a
# product name to that category. classify_product walks the categories in the
# order written here and returns the first one with a matching keyword, so
# earlier entries take precedence over later ones.
# NOTE(review): because of that first-match rule, "váy ngủ" under "đầm" looks
# shadowed by "váy" under "váy/quần" - confirm whether this order is intended.
categories_keywords = {
    "đồ bộ": ["đồ bộ", "set", "bộ", "quần áo"],
    "đồ lót": ["đồ lót", "underwear", "quần lót", "áo lót", "áo ngực", "bra", "panty", "boxer", "quần chíp"],
    "váy/quần": ["váy", "quần", "skirt", "pants", "jeans"],
    "áo": ["áo", "t-shirt", "shirt", "croptop", "yếm", "khoác", "áo khoác", "vest", "blazer"],
    "đầm": ["đầm", "dress", "váy ngủ"], 
    "nón": ["nón", "mũ", "hat", "helmet"],
    "vớ": ["vớ", "tất", "socks"],
    "giày/dép": ["giày", "dép", "bata", "cao gót", "shoes", "slipper", "guốc", "boots", "xăng đan", "sandals", "sneakers"],
    "trang sức": ["trang sức", "nhẫn", "vòng", "dây chuyền", "earrings", "necklace"]
}

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def generate_ngrams(string, n):
    """Return the lower-cased word n-grams of ``string``.

    The text is NFC-normalised first so that composed and decomposed forms of
    the same accented character compare equal, then split into tokens with
    NLTK's ``word_tokenize``.

    Args:
        string: Text to split. Non-string values (for example NaN coming from
            a pandas column) produce an empty list.
        n: Number of consecutive tokens per gram.

    Returns:
        The space-joined n-grams in order of appearance; an empty list when
        the text has fewer than ``n`` tokens.
    """
    # Imported locally because none of the visible notebook cells imports
    # unicodedata; without it the normalize call below raised NameError.
    import unicodedata

    if not isinstance(string, str):
        return []

    string = unicodedata.normalize('NFC', string)
    tokens = [token.lower() for token in word_tokenize(string, language='english')]
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
    
def classify_product(name, categories_keywords):
    """Assign a product name to the first category with a matching keyword.

    The name is split into 1-grams and 2-grams with ``generate_ngrams``; the
    categories are tried in the mapping's iteration order and the first one
    owning a keyword equal to any of those grams wins.

    Args:
        name: Product title. Non-string values (for example NaN from a pandas
            column) are classified as "khác".
        categories_keywords: Mapping of category label to a list of keywords.

    Returns:
        The matching category label, or "khác" when nothing matches.
    """
    import unicodedata

    if not isinstance(name, str):
        return "khác"

    # Build the lookup once instead of concatenating both lists for every
    # keyword; a set also makes each membership test constant-time.
    grams = set(generate_ngrams(name, 2)) | set(generate_ngrams(name, 1))

    for category, keywords in categories_keywords.items():
        for keyword in keywords:
            # Normalise keywords the same way names are normalised so that a
            # keyword typed in a decomposed Unicode form or upper case still matches.
            if unicodedata.normalize('NFC', keyword).lower() in grams:
                return category

    return "khác"

In [None]:
# Label every Lazada product with a category derived from its name.
df_Laz_product['Category'] = df_Laz_product['Name'].apply(classify_product, args=(categories_keywords,))

## Shein

## Tiki

# Analysis on each e-market platform

## Lazada

In [None]:
# Bar chart of how many Lazada products fall into each category.
category_counts = df_Laz_product['Category'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, palette='pastel', ax=ax)
ax.set_title('Phân bố các giá trị trong cột Category', fontsize=16)
ax.set_xlabel('Loại', fontsize=14)
ax.set_ylabel('Số lượng', fontsize=14)
# Tilt the category labels so long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

## Shein

## Tiki