Run this cell if the numpy, pandas, matplotlib and seaborn libraries are not yet installed on this computer

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install matplotlib
# !pip install seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Setup Selenium and Web Driver

Run this cell if the selenium and webdriver-manager libraries are not yet installed on this computer

In [None]:
# !pip install selenium
# !pip install webdriver-manager

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
import time

# Let webdriver-manager resolve the ChromeDriver executable and start a Chrome session with it.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)
# All scraping cells below share this single maximized browser window.
driver.maximize_window()

# Scrape Data from Websites

## Lazada website

In [None]:
# Empty accumulators for the Lazada scrape: one row per product, and one row per review text.
df_Laz_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Sold'])
df_Laz_feedback = pd.DataFrame(columns = ['Product_Url', 'Content'])

Extract information from 120 products on Lazada website

In [None]:
# Open the Lazada home page and submit a search for "Đầm" (dress).
url = 'https://www.lazada.vn/#?'
driver.get(url)
search_box = driver.find_element(By.CSS_SELECTOR, "input.search-box__input--O34g")
search_box.send_keys("Đầm")
search_button = driver.find_element(By.CSS_SELECTOR, "a.search-box__button--1oH7")
search_button.click()

n_pages = 3  # number of result pages to scrape

for i in range(n_pages):
    time.sleep(2)  # give the result grid time to render

    # find_elements() returns an empty list when nothing matches, so there is
    # no NoSuchElementException to catch here.
    name_elements = driver.find_elements(By.CSS_SELECTOR, '.RfADt a')
    names = [e.text for e in name_elements]
    urls = [e.get_attribute('href') for e in name_elements]
    prices = [e.text for e in driver.find_elements(By.CSS_SELECTOR, '.aBrP0')]
    solds = [e.text for e in driver.find_elements(By.CSS_SELECTOR, 'span._1cEkb')]

    # Names and URLs come from the same elements, so they always have equal length.
    default_length = len(urls)

    # NOTE: prices and sold counters are matched to products purely by position.
    # When a product card lacks one of them, the values after it shift up by one,
    # so report every mismatch instead of hiding it.
    if len(prices) != default_length or len(solds) != default_length:
        print(f'Page {i + 1}: {default_length} products, {len(prices)} prices, {len(solds)} sold counters')

    # Pad short lists and trim long ones so the DataFrame constructor cannot fail.
    # Prices are padded with "0" (not "0%") because the pre-processing step parses
    # this column with astype(float).
    prices = (prices + ["0"] * default_length)[:default_length]
    solds = (solds + ["0"] * default_length)[:default_length]

    df = pd.DataFrame({
            'Url': urls,
            'Name': names,
            'Price': prices,
            'Sold': solds
        })

    # Concatenate page by page so partial results survive a failure on a later page.
    df_Laz_product = pd.concat([df_Laz_product, df], ignore_index = True)

    # Look the button up again on every page (a reference taken before navigating
    # may be stale), and do not click past the last page we want.
    if i < n_pages - 1:
        button_nextPage = driver.find_element(By.CSS_SELECTOR, 'li[title="Next Page"] button')
        button_nextPage.click()

In [None]:
df_Laz_product

In [None]:
product_urls = list(df_Laz_product['Url'])

In [None]:
print(len(product_urls))
for url in product_urls:
    print(url)

Store the product URLs in a text file as a backup

In [None]:
# Overwrite the backup instead of appending: in append mode every re-run of this
# cell stored all URLs again, so each product was later scraped more than once.
with open('Laz_product_urls.txt', mode='w') as file:
    for row in product_urls:
        file.write(row + "\n")

Retrieve the list of product URLs from the backup file

In [None]:
# Read the backed-up URLs, one per line, dropping surrounding whitespace/newlines.
with open('Laz_product_urls.txt', mode='r') as file:
    raw_lines = file.readlines()
links = []
for raw_line in raw_lines:
    links.append(raw_line.strip())

Store the product dataframe in a csv file as a backup

In [None]:
# Write (not append) the backup: with mode='a' every re-run added a second header
# line plus duplicate rows, and the header line is then read back as a data row.
df_Laz_product.to_csv('lazada_products.csv', index=False)

Retrieve the product dataframe from the backup file

In [None]:
df_Laz_product = pd.read_csv('lazada_products.csv')
df_Laz_product

Extract feedbacks from 120 products on Lazada website

In [None]:
len(product_urls)

In [None]:
# Visit every product page and collect the review texts from all review pages.
for url in links:
    driver.get(url)
    time.sleep(5)

    # Scroll to a fixed offset (presumably where the review section starts) and wait.
    driver.execute_script("window.scrollTo(0, 1550);")
    time.sleep(5)

    while True:
        # find_elements() returns [] when nothing matches; it never raises
        # NoSuchElementException, so no handler is needed.
        content_elements = driver.find_elements(By.CSS_SELECTOR, '.item div:nth-child(3) .content')
        contents = [e.text for e in content_elements]

        df = pd.DataFrame({
                'Product_Url': [url] * len(contents),
                'Content': contents
            })
        df_Laz_feedback = pd.concat([df_Laz_feedback, df], ignore_index = True)

        driver.execute_script("window.scrollTo(0, 2280);")
        time.sleep(5)

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next')
        except NoSuchElementException:
            # No pagination control on this page: report it, then stop.
            # (The message used to sit after `break` and was never printed.)
            print('NoSuchElementException')
            break

        if next_button.get_attribute('disabled'):
            print("No more pages to scrape.")
            break

        next_button.click()
        time.sleep(5)

In [None]:
df_Laz_feedback

In [None]:
unique_values = df_Laz_feedback['Product_Url'].unique()
print(len(unique_values))

Store the feedback dataframe in a csv file as a backup

In [None]:
# Write (not append) the backup: mode='a' re-emits the header line on every run,
# which is then read back as a data row together with duplicated feedback.
df_Laz_feedback.to_csv('lazada_feedbacks.csv', index=False)

Retrieve the feedback dataframe from the backup file

In [None]:
df_Laz_feedback = pd.read_csv('lazada_feedbacks.csv')
df_Laz_feedback

In [None]:
df_Laz_feedback = df_Laz_feedback.drop_duplicates()

## Tiki website

In [None]:
# Empty accumulators for the Tiki scrape: one row per product, and one row per review text.
df_Tiki_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Sold'])
df_Tiki_feedback = pd.DataFrame(columns = ['Product_Url', 'Content'])

Extract information from 120 products on Tiki website

In [None]:
# The commented URL below is the Tiki home page, kept for reference only.
#url = 'https://tiki.vn/'

# Search-results URL for the keyword "đầm" (URL-encoded); "&page=N" is appended per page below.
url_sample = 'https://tiki.vn/search?q=%C4%91%E1%BA%A7m'
driver.get(url_sample)
time.sleep(5)

# Pages are reached through the URL, so no pagination element is looked up
# (the previous unused lookup could abort the cell with NoSuchElementException).
for i in range (1, 4):
    url = f'{url_sample}&page={i}'
    driver.get(url)
    time.sleep(2)

    # find_elements() returns an empty list when nothing matches, so there is
    # no NoSuchElementException to catch here.
    urls = [e.get_attribute('href') for e in driver.find_elements(By.CSS_SELECTOR, 'a[data-view-id="product_list_item"]')]
    names = [e.text for e in driver.find_elements(By.CSS_SELECTOR, 'h3.style__NameStyled-sc-139nb47-8')]
    prices = [e.text for e in driver.find_elements(By.CSS_SELECTOR, '.price-discount__price')]
    solds = [e.text for e in driver.find_elements(By.CSS_SELECTOR, 'span.quantity')]

    default_length = len(urls)

    # NOTE: the four lists are matched purely by position. When a product card
    # lacks one of the fields, the values after it shift up by one, so report
    # every mismatch instead of hiding it.
    if not (len(names) == len(prices) == len(solds) == default_length):
        print(f'Page {i}: {default_length} urls, {len(names)} names, {len(prices)} prices, {len(solds)} sold counters')

    # Pad short lists and trim long ones so the DataFrame constructor cannot fail.
    # Prices are padded with "0" (not "0%") because the pre-processing step parses
    # this column with astype(float).
    names = (names + ["0"] * default_length)[:default_length]
    prices = (prices + ["0"] * default_length)[:default_length]
    solds = (solds + ["0"] * default_length)[:default_length]

    df = pd.DataFrame({
            'Url': urls,
            'Name': names,
            'Price': prices,
            'Sold': solds
        })

    df_Tiki_product = pd.concat([df_Tiki_product, df], ignore_index = True)
    print(len(df))

In [None]:
product_urls = list(df_Tiki_product['Url'])

In [None]:
df_Tiki_product

Store the product URLs in a text file as a backup

In [None]:
# Overwrite the backup instead of appending: in append mode every re-run of this
# cell stored all URLs again, so each product was later scraped more than once.
with open('Tiki_product_urls.txt', mode='w') as file:
    for row in product_urls:
        file.write(row + "\n")

Retrieve the list of product URLs from the backup file

In [None]:
# Read the backed-up URLs, one per line, dropping surrounding whitespace/newlines.
with open('Tiki_product_urls.txt', mode='r') as file:
    raw_lines = file.readlines()
links = []
for raw_line in raw_lines:
    links.append(raw_line.strip())

In [None]:
Tiki_product_urls = links

Store the product dataframe in a csv file as a backup

In [None]:
df_Tiki_product.to_csv('tiki_products.csv', index=False)

Retrieve the product dataframe from the backup file

In [None]:
df_Tiki_product = pd.read_csv('tiki_products.csv')
df_Tiki_product

In [None]:
print(len(Tiki_product_urls))

In [None]:
# Visit every Tiki product page and collect the review texts from all review pages.
for url in Tiki_product_urls:
    driver.get(url)
    time.sleep(5)

    # Scroll to a fixed offset (presumably where the review section starts) and wait.
    driver.execute_script("window.scrollTo(0, 1550);")
    time.sleep(5)

    while True:
        # find_elements() returns [] when nothing matches; it never raises
        # NoSuchElementException, so no handler is needed.
        content_elements = driver.find_elements(By.CSS_SELECTOR, '.review-comment__content')
        contents = [e.text for e in content_elements]

        df = pd.DataFrame({
                'Product_Url': [url] * len(contents),
                'Content': contents
            })
        df_Tiki_feedback = pd.concat([df_Tiki_feedback, df], ignore_index = True)

        driver.execute_script("window.scrollTo(0, 2280);")
        time.sleep(5)

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'a.next')
        except NoSuchElementException:
            # No pagination control on this page: report it, then stop.
            # (The message used to sit after `break` and was never printed.)
            print('NoSuchElementException')
            break

        # NOTE(review): this only ends the loop if the <a class="next"> element itself
        # carries a `color` attribute equal to "#C4C4CF" on the last page. If the grey
        # colour is applied through CSS or on a child element, get_attribute() returns
        # None and the loop keeps re-scraping the last page -- confirm against the live
        # markup.
        if next_button.get_attribute('color') == "#C4C4CF":
            print("No more pages to scrape.")
            break

        next_button.click()
        time.sleep(5)

In [None]:
df_Tiki_feedback

In [None]:
unique_urls = df_Tiki_feedback['Product_Url'].nunique()
print(f"Số lượng giá trị không trùng nhau trong cột 'Url': {unique_urls}")

Store the feedback dataframe in a csv file as a backup

In [None]:
df_Tiki_feedback.to_csv('tiki_feedbacks.csv', index=False)

Retrieve the feedback dataframe from the backup file

In [None]:
df_Tiki_feedback = pd.read_csv('tiki_feedbacks.csv')
df_Tiki_feedback

In [None]:
df_Tiki_feedback = df_Tiki_feedback.drop_duplicates()

In [None]:
df_Tiki_product = df_Tiki_product.drop_duplicates()

In [None]:
# Close the browser session used for Lazada and Tiki, then start a fresh one:
# the Sendo and Shopee cells below still call driver.get(), which fails on a
# session that has been quit.
driver.quit()
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

## Sendo website

In [None]:
# Empty accumulators for the Sendo scrape: one row per product, and one row per review text.
df_Sen_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Sold'])
df_Sen_feedback = pd.DataFrame(columns = ['Product_Url', 'Content'])

Extract information from 120 products on Sendo website

In [None]:
# Load the Sendo listing page for the keyword "đầm" directly by URL
# (the search box is not used).
url = 'https://www.sendo.vn/ao-nu?q=%C4%91%E1%BA%A7m'
driver.get(url)

time.sleep(10)  # wait for the product grid to render

# find_elements() returns an empty list when nothing matches, so there is
# no NoSuchElementException to catch here.
url_elements = driver.find_elements(By.CSS_SELECTOR, '.d7ed-d4keTB a')
urls = [e.get_attribute('href') for e in url_elements]

name_elements = driver.find_elements(By.CSS_SELECTOR, 'span.d7ed-Vp2Ugh._0032-Zwkt7j.undefined.d7ed-KXpuoS.d7ed-mzOLVa')
names = [e.text for e in name_elements]

price_elements = driver.find_elements(By.CSS_SELECTOR, 'span._0032-GpBMYp._0032-npoTU_.d7ed-CLUDGW.d7ed-AHa8cD.d7ed-giDKVr')
prices = [e.text for e in price_elements]

default_length = len(urls)

# The sold counter is not scraped for Sendo; every product gets the placeholder "0".
solds = ["0"] * default_length

# NOTE: names and prices are matched to URLs purely by position; a product card
# without one of them shifts the values that follow it.
if len(names) < default_length:
    names.extend(["0"] * (default_length - len(names)))
if len(prices) < default_length:
    prices.extend(["0%"] * (default_length - len(prices)))

df = pd.DataFrame({
        'Url': urls,
        'Name': names,
        'Price': prices,
        'Sold': solds
    })

df_Sen_product = pd.concat([df_Sen_product, df], ignore_index = True)

In [None]:
df_Sen_product

In [None]:
product_urls = list(df_Sen_product['Url'])

In [None]:
print(len(product_urls))
for url in product_urls:
    print(url)

Store the product URLs in a text file as a backup

In [None]:
# Overwrite the backup instead of appending: in append mode every re-run of this
# cell stored all URLs again, so each product was later scraped more than once.
with open('Sen_product_urls.txt', mode='w') as file:
    for row in product_urls:
        file.write(row + "\n")

Retrieve the list of product URLs from the backup file

In [None]:
# Read the backed-up URLs, one per line, dropping surrounding whitespace/newlines.
with open('Sen_product_urls.txt', mode='r') as file:
    raw_lines = file.readlines()
links = []
for raw_line in raw_lines:
    links.append(raw_line.strip())

Store the product dataframe in a csv file as a backup

In [None]:
df_Sen_product.to_csv('sendo_products.csv', index=False)

Retrieve the product dataframe from the backup file

In [None]:
df_Sen_product = pd.read_csv('sendo_products.csv')
df_Sen_product

Extract feedbacks from 120 products on Sendo website

In [None]:
len(product_urls)

In [None]:
# Visit every Sendo product page and collect the review texts from all review pages.
# NOTE(review): the review and pagination selectors below are identical to the ones
# in the Lazada feedback cell -- confirm that they match Sendo's markup.
for url in links:
    driver.get(url)
    time.sleep(5)

    # Scroll to a fixed offset (presumably where the review section starts) and wait.
    driver.execute_script("window.scrollTo(0, 1550);")
    time.sleep(5)

    while True:
        # find_elements() returns [] when nothing matches; it never raises
        # NoSuchElementException, so no handler is needed.
        content_elements = driver.find_elements(By.CSS_SELECTOR, '.item div:nth-child(3) .content')
        contents = [e.text for e in content_elements]

        df = pd.DataFrame({
                'Product_Url': [url] * len(contents),
                'Content': contents
            })
        df_Sen_feedback = pd.concat([df_Sen_feedback, df], ignore_index = True)

        driver.execute_script("window.scrollTo(0, 2280);")
        time.sleep(5)

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next')
        except NoSuchElementException:
            # No pagination control on this page: report it, then stop.
            # (The message used to sit after `break` and was never printed.)
            print('NoSuchElementException')
            break

        if next_button.get_attribute('disabled'):
            print("No more pages to scrape.")
            break

        next_button.click()
        time.sleep(5)

In [None]:
df_Sen_feedback

In [None]:
unique_values = df_Sen_feedback['Product_Url'].unique()
print(len(unique_values))

Store the feedback dataframe in a csv file as a backup

In [None]:
# Write (not append) the backup: mode='a' re-emits the header line on every run,
# which is then read back as a data row together with duplicated feedback.
df_Sen_feedback.to_csv('sendo_feedbacks.csv', index=False)

Retrieve the feedback dataframe from the backup file

In [None]:
df_Sen_feedback = pd.read_csv('sendo_feedbacks.csv')
df_Sen_feedback

In [None]:
df_Sen_feedback = df_Sen_feedback.drop_duplicates()

In [None]:
df_Sen_product = df_Sen_product.drop_duplicates()

## Shopee website

In [None]:
# Empty accumulators for the Shopee scrape: one row per product, and one row per review text.
df_Shop_product = pd.DataFrame(columns = ['Url', 'Name', 'Price', 'Sold'])
df_Shop_feedback = pd.DataFrame(columns = ['Product_Url', 'Content'])

Extract information from 120 products on Shopee website

In [None]:
# TODO: fill in the Shopee search-results URL; it is still empty.
url = ''
if not url:
    # driver.get('') fails with an opaque WebDriver error, so stop with a clear message.
    raise ValueError("Set `url` to the Shopee search-results page before running this cell")
driver.get(url)

time.sleep(10)  # wait for the product grid to render

# NOTE(review): the three selectors below are identical to the ones in the Sendo
# cell -- confirm that they match Shopee's markup.
# find_elements() returns an empty list when nothing matches, so there is
# no NoSuchElementException to catch here.
url_elements = driver.find_elements(By.CSS_SELECTOR, '.d7ed-d4keTB a')
urls = [e.get_attribute('href') for e in url_elements]

name_elements = driver.find_elements(By.CSS_SELECTOR, 'span.d7ed-Vp2Ugh._0032-Zwkt7j.undefined.d7ed-KXpuoS.d7ed-mzOLVa')
names = [e.text for e in name_elements]

price_elements = driver.find_elements(By.CSS_SELECTOR, 'span._0032-GpBMYp._0032-npoTU_.d7ed-CLUDGW.d7ed-AHa8cD.d7ed-giDKVr')
prices = [e.text for e in price_elements]

default_length = len(urls)

# The sold counter is not scraped here; every product gets the placeholder "0".
solds = ["0"] * default_length

# NOTE: names and prices are matched to URLs purely by position; a product card
# without one of them shifts the values that follow it.
if len(names) < default_length:
    names.extend(["0"] * (default_length - len(names)))
if len(prices) < default_length:
    prices.extend(["0%"] * (default_length - len(prices)))

df = pd.DataFrame({
        'Url': urls,
        'Name': names,
        'Price': prices,
        'Sold': solds
    })

# Accumulate into the Shopee frame (this used to extend df_Sen_product, leaving
# df_Shop_product empty and polluting the Sendo data).
df_Shop_product = pd.concat([df_Shop_product, df], ignore_index = True)

In [None]:
df_Shop_product

In [None]:
# Take the URLs from the Shopee frame (this used to read the Sendo frame by mistake).
product_urls = list(df_Shop_product['Url'])

In [None]:
print(len(product_urls))
for url in product_urls:
    print(url)

Store the product URLs in a text file as a backup

In [None]:
# Overwrite the backup instead of appending: in append mode every re-run of this
# cell stored all URLs again, so each product was later scraped more than once.
with open('Shop_product_urls.txt', mode='w') as file:
    for row in product_urls:
        file.write(row + "\n")

Retrieve the list of product URLs from the backup file

In [None]:
# Read the backed-up URLs, one per line, dropping surrounding whitespace/newlines.
with open('Shop_product_urls.txt', mode='r') as file:
    raw_lines = file.readlines()
links = []
for raw_line in raw_lines:
    links.append(raw_line.strip())

Store the product dataframe in a csv file as a backup

In [None]:
df_Shop_product.to_csv('shopee_products.csv', index=False)

Retrieve the product dataframe from the backup file

In [None]:
df_Shop_product = pd.read_csv('shopee_products.csv')
df_Shop_product

Extract feedbacks from 120 products on Shopee website

In [None]:
len(product_urls)

In [None]:
# Visit every Shopee product page and collect the review texts from all review pages.
# NOTE(review): the review and pagination selectors below are identical to the ones
# in the Lazada feedback cell -- confirm that they match Shopee's markup.
for url in links:
    driver.get(url)
    time.sleep(5)

    # Scroll to a fixed offset (presumably where the review section starts) and wait.
    driver.execute_script("window.scrollTo(0, 1550);")
    time.sleep(5)

    while True:
        # find_elements() returns [] when nothing matches; it never raises
        # NoSuchElementException, so no handler is needed.
        content_elements = driver.find_elements(By.CSS_SELECTOR, '.item div:nth-child(3) .content')
        contents = [e.text for e in content_elements]

        df = pd.DataFrame({
                'Product_Url': [url] * len(contents),
                'Content': contents
            })
        df_Shop_feedback = pd.concat([df_Shop_feedback, df], ignore_index = True)

        driver.execute_script("window.scrollTo(0, 2280);")
        time.sleep(5)

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, 'button.next-btn.next-btn-normal.next-btn-medium.next-pagination-item.next')
        except NoSuchElementException:
            # No pagination control on this page: report it, then stop.
            # (The message used to sit after `break` and was never printed.)
            print('NoSuchElementException')
            break

        if next_button.get_attribute('disabled'):
            print("No more pages to scrape.")
            break

        next_button.click()
        time.sleep(5)

In [None]:
df_Shop_feedback

In [None]:
unique_values = df_Shop_feedback['Product_Url'].unique()
print(len(unique_values))

Store the feedback dataframe in a csv file as a backup

In [None]:
# Write (not append) the backup: mode='a' re-emits the header line on every run,
# which is then read back as a data row together with duplicated feedback.
df_Shop_feedback.to_csv('shopee_feedbacks.csv', index=False)

Retrieve the feedback dataframe from the backup file

In [None]:
df_Shop_feedback = pd.read_csv('shopee_feedbacks.csv')
df_Shop_feedback

In [None]:
df_Shop_feedback = df_Shop_feedback.drop_duplicates()

In [None]:
df_Shop_product = df_Shop_product.drop_duplicates()

# Pre-processing data

## Lazada

### df_Laz_product

In [None]:
df_Laz_product = pd.read_csv('lazada_products.csv')

In [None]:
df_Laz_product

In [None]:
df_Laz_product.info()

Convert column `['Price']` to float

In [None]:
# Remove the thousands separators and the currency sign, then parse the prices as floats.
price_text = df_Laz_product['Price'].str.replace(',', '')
price_text = price_text.str.replace('₫', '')
df_Laz_product['Price'] = price_text.astype(float)
df_Laz_product['Price']

Convert `['Sold']` to int

In [None]:
def convert_sold_to_int(sold_value):
    """Convert a scraped "sold" counter into an integer.

    Accepts plain counts ("37"), thousands with a "K"/"k" suffix ("1.2K" -> 1200),
    optional thousands separators ("1,234") and a trailing "+" ("3K+").
    Missing values (None, NaN, empty string) are counted as 0, matching the "0"
    placeholder the scraper uses for products without a sold counter.

    Raises:
        ValueError: if the text is not a number in one of the formats above.
    """
    # NaN is the only float that differs from itself.
    if sold_value is None or (isinstance(sold_value, float) and sold_value != sold_value):
        return 0
    # Values that are already numeric are passed through.
    if isinstance(sold_value, (int, float)):
        return int(sold_value)

    text = str(sold_value).strip().replace(',', '').rstrip('+').strip()
    if not text:
        return 0
    if text[-1] in ('K', 'k'):
        # round() before int(): float products such as 1.005 * 1000 land just
        # below the intended integer and plain int() would truncate them.
        return int(round(float(text[:-1]) * 1000))
    return int(text)

In [None]:
# Drop the " sold" suffix, then convert values such as "1.2K" into integers.
sold_text = df_Laz_product['Sold'].str.replace(' sold', '')
df_Laz_product['Sold'] = sold_text.apply(convert_sold_to_int)
df_Laz_product['Sold']

Run this cell if the computer hasn't installed the nltk library yet

In [None]:
# !pip install nltk

In [None]:
# Keyword lists (Vietnamese and English) per product category.
# classify_product() walks this dict in insertion order and returns the first
# category that has a keyword equal to a 1- or 2-word n-gram of the product name,
# so categories listed earlier take precedence over later ones.
# NOTE(review): "váy" (váy/quần) is checked before "váy ngủ" (đầm), so a name
# containing "váy ngủ" can never be classified as "đầm" through that keyword.
categories_keywords = {
    "đồ bộ": ["đồ bộ", "set", "bộ", "quần áo", "sét"],
    "đồ lót": ["đồ lót", "underwear", "quần lót", "áo lót", "áo ngực", "bra", "panty", "boxer", "quần chíp"],
    "váy/quần": ["váy", "quần", "skirt", "pants", "jeans"],
    "áo": ["áo", "t-shirt", "shirt", "croptop", "yếm", "khoác", "áo khoác", "vest", "blazer"],
    "đầm": ["đầm", "dress", "váy ngủ"], 
    "nón": ["nón", "mũ", "hat", "helmet"],
    "vớ": ["vớ", "tất", "socks"],
    "giày/dép": ["giày", "dép", "bata", "cao gót", "shoes", "slipper", "guốc", "boots", "xăng đan", "sandals", "sneakers"],
    "trang sức": ["trang sức", "nhẫn", "vòng", "dây chuyền", "earrings", "necklace"]
}

In [None]:
# Show each category next to its keyword list.
for category_name, keyword_list in categories_keywords.items():
    print(f'{category_name}: {keyword_list}')

Classify products using n-grams techniques

In [None]:
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def generate_ngrams(string, n):
    """Return the lower-cased word n-grams of `string`, in order of appearance.

    The text is NFC-normalized, tokenized with NLTK's word_tokenize and
    lower-cased; each n-gram is `n` consecutive tokens joined by a single space.
    A text with fewer than `n` tokens yields an empty list.
    """
    normalized = unicodedata.normalize('NFC', string)
    words = [word.lower() for word in word_tokenize(normalized, language='english')]

    ngrams = []
    for start in range(len(words) - n + 1):
        ngrams.append(' '.join(words[start:start + n]))
    return ngrams
    
def classify_product(name, categories_keywords):
    """Return the first category whose keyword list matches the product name.

    Args:
        name: product name; anything that is not a string (e.g. NaN from a CSV)
            is classified as "khác".
        categories_keywords: mapping of category -> list of keywords. Categories
            are tried in the mapping's iteration order, so earlier entries win.

    Returns:
        The matching category, or "khác" ("other") when no keyword equals a
        1-word or 2-word n-gram of the name.
    """
    if not isinstance(name, str):
        return "khác"

    # Build the lookup set once instead of concatenating both n-gram lists
    # again for every single keyword.
    ngrams = set(generate_ngrams(name, 2)) | set(generate_ngrams(name, 1))

    for category, keywords in categories_keywords.items():
        if any(keyword in ngrams for keyword in keywords):
            return category

    return "khác"

Add a column `['Category']` to the `df_Laz_product` DataFrame using the `classify_product` function provided above

In [None]:
# Classify every product name; the keyword mapping is forwarded as an extra positional argument.
df_Laz_product['Category'] = df_Laz_product['Name'].apply(classify_product, args=(categories_keywords,))

Re-check the new column

In [None]:
Laz_category_counts = df_Laz_product['Category'].value_counts()
print(Laz_category_counts)

Identify the products that could not be classified (category 'khác') and manually assign the correct category

In [None]:
df_Laz_product[df_Laz_product['Category'] == 'khác']

In [None]:
df_Laz_product.loc[119, 'Url']

In [None]:
# Manual corrections, keyed by row label, for products the keyword classifier left as 'khác'.
manual_categories = {
    17: 'đồ bộ',
    38: 'đồ bộ',
    39: 'áo',
    66: 'đồ bộ',
    111: 'đồ bộ',
    119: 'đồ bộ',
}
for row_label, category in manual_categories.items():
    df_Laz_product.loc[row_label, 'Category'] = category

Check if the dataframe has NaN value

In [None]:
Laz_nan_summary = df_Laz_product.isna().sum()
print(Laz_nan_summary)

Check if the dataframe has duplicated value

In [None]:
print(len(df_Laz_product[df_Laz_product.duplicated()]))

### df_Laz_feedback

Overview the feedback dataframe

In [None]:
df_Laz_feedback = pd.read_csv('lazada_feedbacks.csv')

In [None]:
df_Laz_feedback

In [None]:
df_Laz_feedback.info()

Check if the dataframe has NaN value

In [None]:
print(df_Laz_feedback.isna().sum())

Delete all records whose `Content` is NaN

In [None]:
df_Laz_feedback = df_Laz_feedback.dropna(subset=['Content'])

Check if the dataframe has duplicated value

In [None]:
print(len(df_Laz_feedback[df_Laz_feedback.duplicated()]))

Delete all duplicated records

In [None]:
df_Laz_feedback = df_Laz_feedback.drop_duplicates()

## Tiki

### df_Tiki_product

In [None]:
df_Tiki_product = pd.read_csv('tiki_products.csv')

In [None]:
df_Tiki_product.info()

In [None]:
df_Tiki_product

Convert `['Price']` to float

In [None]:
# Remove the "." thousands separators and the currency sign, then parse as float.
# regex=False is required for the dot: on pandas versions where str.replace treats
# the pattern as a regular expression by default, '.' matches every character and
# would wipe the whole price.
df_Tiki_product['Price'] = (
    df_Tiki_product['Price']
    .str.replace('.', '', regex=False)
    .str.replace('₫', '', regex=False)
    .astype(float)
)
df_Tiki_product['Price']

Convert `['Sold']` to int

In [None]:
# Drop the "Đã bán " ("sold") prefix, then convert the remaining counter into an integer.
sold_text = df_Tiki_product['Sold'].str.replace('Đã bán ', '')
df_Tiki_product['Sold'] = sold_text.apply(convert_sold_to_int)
df_Tiki_product['Sold']

Run this cell if the computer hasn't installed the nltk library yet

In [None]:
# !pip install nltk

In [None]:
# Keyword lists (Vietnamese and English) per product category; this repeats the
# mapping already defined in the Lazada pre-processing section.
# classify_product() walks this dict in insertion order and returns the first
# category that has a keyword equal to a 1- or 2-word n-gram of the product name,
# so categories listed earlier take precedence over later ones.
categories_keywords = {
    "đồ bộ": ["đồ bộ", "set", "bộ", "quần áo", "sét"],
    "đồ lót": ["đồ lót", "underwear", "quần lót", "áo lót", "áo ngực", "bra", "panty", "boxer", "quần chíp"],
    "váy/quần": ["váy", "quần", "skirt", "pants", "jeans"],
    "áo": ["áo", "t-shirt", "shirt", "croptop", "yếm", "khoác", "áo khoác", "vest", "blazer"],
    "đầm": ["đầm", "dress", "váy ngủ"], 
    "nón": ["nón", "mũ", "hat", "helmet"],
    "vớ": ["vớ", "tất", "socks"],
    "giày/dép": ["giày", "dép", "bata", "cao gót", "shoes", "slipper", "guốc", "boots", "xăng đan", "sandals", "sneakers"],
    "trang sức": ["trang sức", "nhẫn", "vòng", "dây chuyền", "earrings", "necklace"]
}

In [None]:
# Show each category next to its keyword list.
for category_name, keyword_list in categories_keywords.items():
    print(f'{category_name}: {keyword_list}')

Classify products using n-grams techniques

In [None]:
import unicodedata
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

def generate_ngrams(string, n):
    """Split `string` into lower-cased word n-grams.

    The input is NFC-normalized and tokenized with NLTK's word_tokenize; every
    window of `n` consecutive tokens is joined with one space. Returns an empty
    list when the text has fewer than `n` tokens.
    """
    text = unicodedata.normalize('NFC', string)
    lowered = []
    for token in word_tokenize(text, language='english'):
        lowered.append(token.lower())

    window_count = len(lowered) - n + 1
    return [' '.join(lowered[pos:pos + n]) for pos in range(window_count)]
    
def classify_product(name, categories_keywords):
    """Return the first category whose keyword list matches the product name.

    Args:
        name: product name; anything that is not a string (e.g. NaN from a CSV)
            is classified as "khác".
        categories_keywords: mapping of category -> list of keywords. Categories
            are tried in the mapping's iteration order, so earlier entries win.

    Returns:
        The matching category, or "khác" ("other") when no keyword equals a
        1-word or 2-word n-gram of the name.
    """
    if not isinstance(name, str):
        return "khác"

    # Build the lookup set once instead of concatenating both n-gram lists
    # again for every single keyword.
    ngrams = set(generate_ngrams(name, 2)) | set(generate_ngrams(name, 1))

    for category, keywords in categories_keywords.items():
        if any(keyword in ngrams for keyword in keywords):
            return category

    return "khác"

Add a column `['Category']` to the `df_Tiki_product` DataFrame using the `classify_product` function provided above

In [None]:
# Classify every product name; the keyword mapping is forwarded as an extra positional argument.
df_Tiki_product['Category'] = df_Tiki_product['Name'].apply(classify_product, args=(categories_keywords,))

Re-check the new column

In [None]:
Tiki_category_counts = df_Tiki_product['Category'].value_counts()
print(Tiki_category_counts)

Identify the products that could not be classified (category 'khác') and manually assign the correct category

In [None]:
df_Tiki_product[df_Tiki_product['Category'] == 'khác']

In [None]:
df_Tiki_product.loc[104, 'Category'] = 'đồ bộ'

Check if the dataframe has NaN value

In [None]:
Tiki_nan_summary = df_Tiki_product.isna().sum()
print(Tiki_nan_summary)

Check if the dataframe has duplicated value

In [None]:
print(len(df_Tiki_product[df_Tiki_product.duplicated()]))

### df_Tiki_feedback

In [None]:
df_Tiki_feedback = pd.read_csv('tiki_feedbacks.csv')

In [None]:
df_Tiki_feedback

In [None]:
df_Tiki_feedback.info()

Check if the dataframe has NaN value

In [None]:
print(df_Tiki_feedback.isna().sum())

Delete all the record containing NaN value

In [None]:
df_Tiki_feedback = df_Tiki_feedback.dropna(subset=['Content'])

Check if the dataframe has duplicated value

In [None]:
print(len(df_Tiki_feedback[df_Tiki_feedback.duplicated()]))

Delete all the record containing duplicated value

In [None]:
df_Tiki_feedback = df_Tiki_feedback.drop_duplicates()

# Analysis on each e-market platform

## Lazada

### df_Laz_product

In [None]:
df_Laz_product.info()

In [None]:
df_Laz_product.describe()

In [None]:
Laz_category_counts = df_Laz_product['Category'].value_counts()
print(Laz_category_counts)

In [None]:
# Bar chart: number of Lazada products per category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=Laz_category_counts.index, y=Laz_category_counts.values, palette='pastel', ax=ax)
ax.set_title('Phân bố các giá trị trong cột Category', fontsize=16)
ax.set_xlabel('Loại', fontsize=14)
ax.set_ylabel('Số lượng', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
df_Laz_product.to_csv('lazada_products_.csv', index=False)

In [None]:
Laz_avgPrice_per_category = df_Laz_product.groupby('Category')['Price'].mean().round()

In [None]:
print(Laz_avgPrice_per_category)

In [None]:
# Bar chart: rounded mean price of each Lazada category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=Laz_avgPrice_per_category.index, y=Laz_avgPrice_per_category.values, palette='pastel', ax=ax)
ax.set_title('Giá tiền trung bình mỗi loại', fontsize=16)
ax.set_xlabel('Loại', fontsize=14)
ax.set_ylabel('Giá trung bình', fontsize=14)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
df_Laz_product[df_Laz_product['Category'] == 'áo'][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[df_Laz_product['Price'] == df_Laz_product['Price'].max()][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[df_Laz_product['Price'] == df_Laz_product['Price'].min()][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[df_Laz_product['Price'] > 150000][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[(df_Laz_product['Category'] == 'đồ bộ') & (df_Laz_product['Price'] > 150000)][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[(df_Laz_product['Category'] == 'đồ bộ') & (df_Laz_product['Price'] <= 150000)][['Name', 'Price', 'Sold', 'Category']]

In [None]:
df_Laz_product[df_Laz_product['Price'] > 150000]['Category'].value_counts()

In [None]:
df_Laz_product[df_Laz_product['Price'] <= 150000]['Category'].value_counts()

In [None]:
# Independent copy of the 'đồ bộ' products, flagged 1 when the price is above 150000 and 0 otherwise.
is_dobo = df_Laz_product['Category'] == 'đồ bộ'
df_Laz_product_dobo = df_Laz_product.loc[is_dobo].copy(deep = True)
df_Laz_product_dobo['Phân khúc'] = (df_Laz_product_dobo['Price'] > 150000).astype('int64')

In [None]:
df_Laz_product_dobo

In [None]:
# Count the 'đồ bộ' products in each price segment and plot the counts.
segment_counts = df_Laz_product_dobo['Phân khúc'].value_counts()
count_by_segment = segment_counts.reset_index()
count_by_segment.columns = ['Phân khúc', 'Số lượng sản phẩm']

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=count_by_segment, x='Phân khúc', y='Số lượng sản phẩm', palette='pastel', ax=ax)
ax.set_title('Số lượng sản phẩm theo Phân khúc')
ax.set_xlabel('Phân khúc')
ax.set_ylabel('Số lượng sản phẩm')
plt.show()

In [None]:
df_Laz_product[df_Laz_product['Category'] == 'đồ bộ']['Price'].describe()

In [None]:
df_Laz_product[df_Laz_product['Category'] == 'đồ bộ'][['Name', 'Price', 'Sold', 'Category']]

In [None]:
# Box plot of the price distribution over all Lazada products.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_Laz_product, x='Price', palette='Set2', ax=ax)
ax.set_title(' Biểu đồ Box Plot thể hiện sự phân phối của giá sản phẩm quần áo nữ trên Lazada (VNĐ)')
ax.set_xlabel('Giá tiền')
ax.set_ylabel('')
plt.show()

In [None]:
# Summary statistics of the Lazada prices, drawn as one bar per statistic.
price_column = df_Laz_product['Price']
stats = {
    label: getattr(price_column, label.lower())()
    for label in ('Mean', 'Median', 'Min', 'Max', 'Std')
}

colors = ['#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']
plt.bar(stats.keys(), stats.values(), color = colors)
plt.title('Thống kê giá tiền các sản phẩm quần áo nữ trên Lazada (VNĐ)')
plt.show()

Run this cell if the scikit-learn library is not yet installed on this computer

In [None]:
# !pip install scikit-learn

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

cluster_price = df_Laz_product.copy(deep = True)

prices = cluster_price[['Price']].values
scaler = StandardScaler()
prices_scaled = scaler.fit_transform(prices)

dbscan = DBSCAN(eps = 0.5, min_samples = 5)
cluster_price['Cluster_DBSCAN'] = dbscan.fit_predict(prices_scaled)

print(cluster_price)

In [None]:
cluster_price[cluster_price['Cluster_DBSCAN'] == 0][['Price', 'Cluster_DBSCAN']].describe()

In [None]:
cluster_price[cluster_price['Cluster_DBSCAN'] != 0][['Price', 'Cluster_DBSCAN']].describe()

In [None]:
plt.scatter(cluster_price['Price'], np.zeros_like(cluster_price['Price']),
            c=cluster_price['Cluster_DBSCAN'], cmap='viridis', marker='o')
plt.xlabel('Price')
plt.title('DBSCAN Clustering')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
df_Laz_product_pk = df_Laz_product.copy(deep = True)
df_Laz_product_pk['Phân khúc'] = df_Laz_product_pk['Price'].apply(lambda x: 1 if x > 150000 else 0)

count_by_segment = df_Laz_product_pk['Phân khúc'].value_counts().reset_index()
count_by_segment.columns = ['Phân khúc', 'Số lượng sản phẩm']

plt.figure(figsize=(10, 6))
sns.barplot(x='Phân khúc', y='Số lượng sản phẩm', data=count_by_segment, palette='pastel')
plt.title('Số lượng sản phẩm theo Phân khúc')
plt.xlabel('Phân khúc')
plt.ylabel('Số lượng sản phẩm')
plt.show()

In [None]:
df_Laz_product['Sold'].sum()

In [None]:
df_Laz_product['Sold'].describe()

In [None]:
stats = {
    'Mean': df_Laz_product['Sold'].mean(),
    'Median': df_Laz_product['Sold'].median(),
    'Min': df_Laz_product['Sold'].min(),
    'Max': df_Laz_product['Sold'].max(),
    'Std': df_Laz_product['Sold'].std()
}

colors = ['#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']
plt.bar(stats.keys(), stats.values(), color = colors)
plt.title('Thống kê số lượng bán các sản phẩm quần áo nữ trên Lazada')
plt.show()

In [None]:
df_Laz_product[df_Laz_product['Sold'] == 0]

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df_Laz_product['Sold'], bins=10, kde=False, color='skyblue')
plt.title('Độ phân bố số lượng bán')
plt.xlabel('Số lượng bán')
plt.ylabel('Số sản phẩm')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df_Laz_product[df_Laz_product['Sold'] > 2000]['Sold'], bins=5, kde=False, color='skyblue')
plt.title('Độ phân bố số lượng bán')
plt.xlabel('Số lượng bán')
plt.ylabel('Số sản phẩm')
plt.show()

In [None]:
df_Laz_product[df_Laz_product['Sold'] > 4000]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(df_Laz_product['Price'], df_Laz_product['Sold'], alpha=0.5)
plt.title('Mối tương quan giữa Giá và Số lượng bán')
plt.xlabel('Giá')
plt.ylabel('Số lượng bán')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Choose the number of KMeans clusters for the Lazada products using the
# standardized (Price, Sold) features.
# NOTE: StandardScaler is not imported in this cell; it relies on the import
# done in the earlier DBSCAN cell.
df_Laz_product_kmeans = df_Laz_product[['Price', 'Sold']].copy(deep = True)
X = df_Laz_product_kmeans

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit
sse = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    sse.append(kmeans.inertia_)  # inertia_ is the sum of squared distances from each point to its closest cluster centre

# Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Số lượng cụm (k)')
plt.ylabel('Tổng khoảng cách (SSE)')
plt.title('Phương pháp Elbow để tìm số cụm hợp lý')
plt.show()

# Silhouette method: the score needs at least two clusters, hence k = 2..10
silhouette_scores = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, labels))

# Plot the silhouette score for each k
plt.figure(figsize=(8, 5))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Số lượng cụm (k)')
plt.ylabel('Chỉ số Silhouette')
plt.title('Phương pháp Silhouette để tìm số cụm hợp lý')
plt.show()

In [None]:
from sklearn.cluster import KMeans

df_Laz_product_kmeans = df_Laz_product[['Price', 'Sold']].copy(deep = True)
X = df_Laz_product_kmeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=4, random_state=42)
df_Laz_product_kmeans['Cluster'] = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(df_Laz_product_kmeans['Price'], df_Laz_product_kmeans['Sold'], c=df_Laz_product_kmeans['Cluster'], cmap='viridis', alpha=0.6, s=100)
plt.title('Phân cụm KMeans dựa trên Giá và Số lượng bán')
plt.xlabel('Giá tiền')
plt.ylabel('Số lượng bán')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
df_Laz_product_kmeans[df_Laz_product_kmeans['Cluster'] == 1]

In [None]:
df_Laz_product['Product_Object'] = df_Laz_product_kmeans['Cluster']

In [None]:
# Re-label cluster 1 as object 3.
# Target the 'Product_Object' column explicitly with .loc: assigning through a
# bare boolean mask (df[mask] = 3) overwrites EVERY column of the matching rows
# (Url, Name, Price, Sold, ...) with 3 and corrupts the data saved to CSV below.
df_Laz_product.loc[df_Laz_product['Product_Object'] == 1, 'Product_Object'] = 3

In [None]:
df_Laz_product['Product_Object'] = df_Laz_product['Product_Object'].apply(lambda x: 1 if x == 2 else 2 if x == 0 else 3 if x == 3 else x)

In [None]:
df_Laz_product.to_csv('cleaned_lazada_products.csv', index=False)

### df_Laz_feedback

In [None]:
df_Laz_feedback.info()

In [None]:
set(list(df_Laz_feedback['Product_Url']))

In [None]:
len(set(list(df_Laz_feedback['Product_Url'])))

In [None]:
feedbacks = df_Laz_feedback['Content']

In [None]:
import re
def text_preprocess(text):
    """Normalize a review and split it into lowercase tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, then the text is split on runs of whitespace.

    Args:
        text: The raw review. Non-string values (e.g. NaN/None for a missing
            review) are treated as an empty review instead of raising.

    Returns:
        list[str]: The tokens, possibly empty.
    """
    if not isinstance(text, str):
        # Missing content: return an empty token list so the result stays
        # aligned row-for-row with the source column.
        return []
    # A single substitution is enough: the punctuation listed explicitly in the
    # former first pass (: , . / ? # ( )) is already covered by [^\w\s].
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # str.split() with no argument collapses whitespace runs and trims the ends.
    return text.split()

In [None]:
feedbacks = [text_preprocess(fb) for fb in feedbacks]

Run this cell if the computer hasn't installed the gensim library yet

In [None]:
# !pip install gensim

In [None]:
from gensim.models import Word2Vec
model = Word2Vec(feedbacks, vector_size=100, window=10, min_count=1, workers=4)

In [None]:
similar_words = model.wv.most_similar('tốt', topn=20)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('đẹp', topn=20)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('tệ', topn=10)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('xấu', topn=10)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('ổn', topn=5)
print(similar_words)

In [None]:
from nltk import ngrams

positive_words = ['tuyệt vời', 'hài lòng', 'thích', 'đẹp', 'mát', 'xinh', 'mát', 'dày dặn', 'ưng ý', 'lý tưởng', 'hấp dẫn', 'hoàn hảo', 'thoải mái', 'mê', 'tốt', 'ưng', 'bền', 'vừa ý', 'dễ thương', 'cute', 'đáng mua', 'nên mua']
negative_words = ['không hài lòng', 'thất vọng', 'kém', 'xấu', 'tệ', 'gớm', 'chán', 'nóng', 'lỗi', 'khó chịu', 'kì', 'cứng', 'thấy ghê', 'hơi', 'mắc', 'đắt', 'không đúng', 'ko đúng', 'vớ vẩn', 'ni lông', 'nilon', 'không thích', 'ko thích', 'ko ưng', 'phải chi', 'thô']

# Phrase -> polarity lookup (+1 positive, -1 negative). Positive entries are
# written last so they win if a phrase ever appears in both lists, matching the
# positive-first order of the original checks.
_phrase_polarity = {phrase: -1 for phrase in negative_words}
_phrase_polarity.update({phrase: 1 for phrase in positive_words})
# Longest lexicon phrase, in tokens (3 for 'không hài lòng').
_max_phrase_len = max(len(phrase.split()) for phrase in _phrase_polarity)

def calculate_satisfaction_score(review):
    """Score a tokenized review against the sentiment lexicon.

    The tokens are scanned left to right and, at each position, the LONGEST
    lexicon phrase starting there is matched and its tokens are consumed.
    Each token therefore contributes to at most one phrase, which fixes three
    problems of scoring unigrams and bigrams independently:

    * three-word phrases such as 'không hài lòng' could never match (and the
      review was scored +1 through the bigram 'hài lòng');
    * negated phrases such as 'không thích' cancelled out to 0 because the
      unigram 'thích' was counted as well;
    * phrases such as 'ưng ý' were counted twice ('ưng' and 'ưng ý').

    Args:
        review: Iterable of tokens (as produced by ``text_preprocess``).

    Returns:
        int: Number of positive phrases minus number of negative phrases.
    """
    tokens = list(review)
    n_tokens = len(tokens)
    score = 0
    pos = 0
    while pos < n_tokens:
        # Try the longest candidate first so 'không thích' beats 'thích'.
        for size in range(min(_max_phrase_len, n_tokens - pos), 0, -1):
            polarity = _phrase_polarity.get(' '.join(tokens[pos:pos + size]))
            if polarity is not None:
                score += polarity
                pos += size
                break
        else:
            # No lexicon phrase starts at this token.
            pos += 1

    return score

satisfaction_scores = [calculate_satisfaction_score(fb) for fb in feedbacks]
df_Laz_feedback['SentScore'] = satisfaction_scores

In [None]:
df_Laz_feedback[['Content', 'SentScore']]

In [None]:
def get_review_vector(review):
    """Return the mean embedding of a tokenized review.

    Only tokens present in ``model.wv`` contribute. When none of the tokens is
    known, a zero vector of length ``model.vector_size`` is returned.
    """
    embeddings = []
    for token in review:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

In [None]:
def classify_score(score):
    """Turn a sentiment score into a class id.

    Returns 0 for a negative score, 1 for a score equal to zero and 2 for
    anything else.
    """
    if score < 0:
        return 0
    return 1 if score == 0 else 2

In [None]:
labels = df_Laz_feedback['SentScore'].apply(classify_score)

In [None]:
df_Laz_feedback['Label'] = labels

In [None]:
df_Laz_feedback['Label'].value_counts()

In [None]:
# value_counts() orders classes by frequency, but the tick labels below assume
# the fixed order 0 (negative), 1 (neutral), 2 (positive). Reindex so each bar
# sits under its own label; a class with no reviews is drawn as an empty bar.
label_counts_by_class = df_Laz_feedback['Label'].value_counts().reindex([0, 1, 2], fill_value=0)
label_counts_by_class.plot(kind='bar', color='skyblue')
plt.title('Số lượng đánh giá theo mức độ hài lòng')
plt.xlabel('Mức độ hài lòng')
plt.ylabel('Số lượng đánh giá')
plt.xticks(ticks=[0, 1, 2], labels=['Tiêu cực', 'Trung tính', 'Tích cực'], rotation=0)
plt.show()

Run this cell if the computer hasn't installed the wordcloud library yet

In [None]:
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud

positive_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 2]['Content'])
neutral_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 1]['Content'])
negative_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 0]['Content'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_comments)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud cho Đánh Giá Tích Cực')
plt.show()

In [None]:
from wordcloud import WordCloud

positive_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 2]['Content'])
neutral_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 1]['Content'])
negative_comments = ' '.join(df_Laz_feedback[df_Laz_feedback['Label'] == 0]['Content'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(negative_comments)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud cho Đánh Giá Tiêu Cực')
plt.show()

In [None]:
df_Laz_feedback['Length'] = df_Laz_feedback['Content'].str.split().str.len()
mean_length = df_Laz_feedback.groupby('Label')['Length'].mean()

colors = ['#FF6F61', '#FFCC6A', '#6D9DC5']
mean_length.plot(kind='bar', color=colors)

plt.title('Độ dài trung bình của đánh giá theo mức độ hài lòng')
plt.xlabel('Mức độ hài lòng')
plt.ylabel('Độ dài trung bình')
plt.xticks(ticks=[0, 1, 2], labels=['Tiêu cực', 'Trung tính', 'Tích cực'], rotation=0)
plt.show()

In [None]:
df_Laz_feedback['Length'].mean()

In [None]:
mean_length

In [None]:
df_merged = pd.merge(df_Laz_feedback, df_Laz_product, left_on='Product_Url', right_on='Url')

In [None]:
df_merged['Category'].value_counts()

In [None]:
# Share of each satisfaction class among the merged Lazada feedback.
label_counts = df_merged['Label'].value_counts()

plt.figure(figsize=(8, 8))
wedges, texts, autotexts = plt.pie(label_counts, autopct='%1.1f%%', startangle=90, colors=plt.cm.Pastel1.colors)

# The wedges follow the order of label_counts (most frequent class first), so
# the legend text is looked up per class rather than hard-coded in the order
# 2, 1, 0, which is only correct when the frequencies happen to rank that way.
label_names = {2: 'Tích cực (2)', 1: 'Trung tính (1)', 0: 'Tiêu cực (0)'}
labels = [label_names.get(label, str(label)) for label in label_counts.index]
plt.legend(wedges, labels, title="Mức độ hài lòng", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

plt.title('Phân bố mức độ hài lòng của khách hàng (%)')
plt.axis('equal')
plt.show()


In [None]:
df_merged['Label'].value_counts()

In [None]:
category_feedback = df_merged.groupby('Category')['Label'].mean().reset_index()
category_feedback.columns = ['Category', 'Average Satisfaction']

pastel_colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9', '#BAE1FF', '#FFC3A0', '#FF677D']
category_feedback.plot(kind='bar', x='Category', y='Average Satisfaction', color=pastel_colors[:len(category_feedback)])
plt.title('Mức độ hài lòng trung bình theo loại sản phẩm')
plt.xlabel('Loại sản phẩm')
plt.ylabel('Mức độ hài lòng trung bình')
plt.xticks(rotation=45)
plt.show()

In [None]:
product_satisfaction = df_merged.groupby('Product_Object')['Label'].mean().reset_index()
product_satisfaction.columns = ['Product_Object', 'Average_Satisfaction']

pastel_colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9', '#BAE1FF', '#FF677D', '#FFC3A0']
product_satisfaction.plot(kind='bar', x='Product_Object', y='Average_Satisfaction', color=pastel_colors[:len(product_satisfaction)])
plt.title('Mức độ hài lòng trung bình của từng đối tượng sản phẩm')
plt.xlabel('Đối tượng khách hàng')
plt.ylabel('Mức độ hài lòng trung bình')
plt.xticks(ticks=[0, 1, 2], labels=['Đối tượng 1', 'Đối tượng 2', 'Đối tượng 3'], rotation=0)
plt.show()

In [None]:
df_merged['Label'].mean()

In [None]:
product_satisfaction

In [None]:
# Feedback count per (product object, satisfaction class). All three class
# columns are forced to exist, in the order 0, 1, 2, so the pie labels below
# always line up even when a class never occurs in the data.
label_distribution = (
    df_merged.groupby(['Product_Object', 'Label']).size()
    .unstack()
    .fillna(0)
    .reindex(columns=[0, 1, 2], fill_value=0)
)

labels = ['Tiêu cực', 'Trung tính', 'Tích cực']
colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9']

# One pie per product object actually present (instead of assuming exactly 3);
# squeeze=False keeps `axes` indexable when there is a single object.
n_objects = max(len(label_distribution.index), 1)
fig, axes = plt.subplots(1, n_objects, figsize=(6 * n_objects, 6), squeeze=False)
axes = axes.ravel()

for i, Product_Object in enumerate(label_distribution.index):
    axes[i].pie(label_distribution.loc[Product_Object], 
                labels=labels, 
                autopct='%1.1f%%', 
                startangle=90, 
                colors=colors)
    axes[i].set_title(f'Đối tượng {Product_Object}')

plt.suptitle('Phân bố mức độ hài lòng của từng đối tượng sản phẩm')
plt.tight_layout()
plt.show()

In [None]:
df_Laz_feedback

In [None]:
df_Laz_feedback.to_csv('cleaned_lazada_feedbacks.csv', index=False)

## Tiki

### df_Tiki_product

In [None]:
df_Tiki_product.info()

In [None]:
df_Tiki_product.describe()

In [None]:
Tiki_category_counts = df_Tiki_product['Category'].value_counts()
print(Tiki_category_counts)

In [None]:
plt.figure(figsize = (10, 6))
sns.barplot(x = Tiki_category_counts.index, y = Tiki_category_counts.values, palette = 'pastel')
plt.title('Phân bố các giá trị trong cột Category', fontsize = 16)
plt.xlabel('Loại', fontsize = 14)
plt.ylabel('Số lượng', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

In [None]:
Tiki_avgPrice_per_category = df_Tiki_product.groupby('Category')['Price'].mean().round()

In [None]:
print(Tiki_avgPrice_per_category)

In [None]:
plt.figure(figsize = (10, 6))
sns.barplot(x = Tiki_avgPrice_per_category.index, y = Tiki_avgPrice_per_category.values, palette = 'pastel')
plt.title('Giá tiền trung bình mỗi loại', fontsize = 16)
plt.xlabel('Loại', fontsize = 14)
plt.ylabel('Giá trung bình', fontsize = 14)
plt.xticks(rotation = 45, ha = 'right')
plt.show()

In [None]:
df_Tiki_product[df_Tiki_product['Category'] == 'đồ lót'][['Name', 'Price', 'Category']]

In [None]:
df_Tiki_product[df_Tiki_product['Category'] == 'váy/quần'][['Name', 'Price', 'Category']]

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='Price', data=df_Tiki_product, palette='Set2')
plt.title(' Biểu đồ Box Plot thể hiện sự phân phối của giá sản phẩm quần áo nữ trên Tiki (VNĐ)')
plt.xlabel('Giá tiền')
plt.ylabel('')
plt.show()

In [None]:
stats = {
    'Mean': df_Tiki_product['Price'].mean(),
    'Median': df_Tiki_product['Price'].median(),
    'Min': df_Tiki_product['Price'].min(),
    'Max': df_Tiki_product['Price'].max(),
    'Std': df_Tiki_product['Price'].std()
}

colors = ['#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']
plt.bar(stats.keys(), stats.values(), color = colors)
plt.title('Thống kê giá tiền các sản phẩm quần áo nữ trên Tiki (VNĐ)')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

cluster_price = df_Tiki_product.copy(deep = True)

prices = cluster_price[['Price']].values
scaler = StandardScaler()
prices_scaled = scaler.fit_transform(prices)

dbscan = DBSCAN(eps = 0.5, min_samples = 20)
cluster_price['Cluster_DBSCAN'] = dbscan.fit_predict(prices_scaled)

print(cluster_price)

In [None]:
cluster_price[cluster_price['Cluster_DBSCAN'] == 0][['Price', 'Cluster_DBSCAN']].describe()

In [None]:
cluster_price[cluster_price['Cluster_DBSCAN'] != 0][['Price', 'Cluster_DBSCAN']].describe()

In [None]:
plt.scatter(cluster_price['Price'], np.zeros_like(cluster_price['Price']),
            c=cluster_price['Cluster_DBSCAN'], cmap='viridis', marker='o')
plt.xlabel('Price')
plt.title('DBSCAN Clustering')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
df_Tiki_product_pk = df_Tiki_product.copy(deep = True)
df_Tiki_product_pk['Phân khúc'] = df_Tiki_product_pk['Price'].apply(lambda x: 1 if x > 310000 else 0)

count_by_segment = df_Tiki_product_pk['Phân khúc'].value_counts().reset_index()
count_by_segment.columns = ['Phân khúc', 'Số lượng sản phẩm']

plt.figure(figsize=(10, 6))
sns.barplot(x='Phân khúc', y='Số lượng sản phẩm', data=count_by_segment, palette='pastel')
plt.title('Số lượng sản phẩm theo Phân khúc')
plt.xlabel('Phân khúc')
plt.ylabel('Số lượng sản phẩm')
plt.show()

In [None]:
df_Tiki_product_pk[df_Tiki_product_pk['Phân khúc'] == 0].describe()

In [None]:
df_Tiki_product_pk[df_Tiki_product_pk['Phân khúc'] == 1].describe()

In [None]:
# Share of low / high priced products (threshold 310,000 VND) per category.
# Missing categories are dropped: they match no rows and cannot be capitalized.
categories = df_Tiki_product['Category'].dropna().unique()

# Size the grid from the number of categories instead of a fixed 2x2, which
# raised IndexError as soon as there were more than four categories.
n_cols = 2
n_rows = max(1, -(-len(categories) // n_cols))  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 10 * n_rows), squeeze=False)
colors = ['#ffb3ba', '#ffdfba']
for i, category in enumerate(categories):
    # Keep only the products of this category
    category_data = df_Tiki_product[df_Tiki_product['Category'] == category]

    low_price_count = (category_data['Price'] <= 310000).sum()
    high_price_count = (category_data['Price'] > 310000).sum()

    sizes = [low_price_count, high_price_count]
    labels = ['Giá thấp (≤ 310.000 VNĐ)', 'Giá cao (> 310.000 VNĐ)']

    ax = axs[i // n_cols, i % n_cols]
    ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90, textprops={'fontsize': 16})
    ax.set_title(f'Phân bố giá của {category.capitalize()}', fontsize = 18)

# Hide grid cells that have no category to show.
for unused_ax in axs.ravel()[len(categories):]:
    unused_ax.axis('off')

# tight_layout() recomputes the subplot spacing, so the former
# subplots_adjust(hspace=0, wspace=0) call had no effect and was dropped.
plt.tight_layout()
plt.show()

In [None]:
df_Tiki_product['Sold'].sum()

In [None]:
df_Tiki_product['Sold'].describe()

In [None]:
stats = {
    'Mean': df_Tiki_product['Sold'].mean(),
    'Median': df_Tiki_product['Sold'].median(),
    'Min': df_Tiki_product['Sold'].min(),
    'Max': df_Tiki_product['Sold'].max(),
    'Std': df_Tiki_product['Sold'].std()
}

colors = ['#FF9999', '#66B3FF', '#99FF99', '#FFCC99', '#FFD700']
plt.bar(stats.keys(), stats.values(), color = colors)
plt.title('Thống kê số lượng bán các sản phẩm quần áo nữ trên Tiki')
plt.show()

In [None]:
len(df_Tiki_product[df_Tiki_product['Sold'] == 0])

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df_Tiki_product['Sold'], bins=10, kde=False, color='skyblue')
plt.title('Độ phân bố số lượng bán')
plt.xlabel('Số lượng bán')
plt.ylabel('Số sản phẩm')
plt.show()

In [None]:
df_Tiki_product[df_Tiki_product['Sold'] > 150]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(df_Tiki_product['Price'], df_Tiki_product['Sold'], alpha=0.5)
plt.title('Mối tương quan giữa Giá và Số lượng bán')
plt.xlabel('Giá')
plt.ylabel('Số lượng bán')
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Choose the number of KMeans clusters for the Tiki products using the
# standardized (Price, Sold) features.
# NOTE: StandardScaler is not imported in this cell; it relies on the import
# done in the earlier DBSCAN cell.
df_Tiki_product_kmeans = df_Tiki_product[['Price', 'Sold']].copy(deep = True)
X = df_Tiki_product_kmeans

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit
sse = []
k_range = range(1, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    sse.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_range, sse, marker='o')
plt.xlabel('Số lượng cụm (k)')
plt.ylabel('Tổng khoảng cách (SSE)')
plt.title('Phương pháp Elbow để tìm số cụm hợp lý')
plt.show()

# Silhouette method: the score needs at least two clusters, hence k = 2..10
silhouette_scores = []

for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, labels))

# Plot the silhouette score for each k
plt.figure(figsize=(8, 5))
plt.plot(range(2, 11), silhouette_scores, marker='o')
plt.xlabel('Số lượng cụm (k)')
plt.ylabel('Chỉ số Silhouette')
plt.title('Phương pháp Silhouette để tìm số cụm hợp lý')
plt.show()

In [None]:
from sklearn.cluster import KMeans

df_Tiki_product_kmeans = df_Tiki_product[['Price', 'Sold']].copy(deep = True)
X = df_Tiki_product_kmeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=2, random_state=42)
df_Tiki_product_kmeans['Cluster'] = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(df_Tiki_product_kmeans['Price'], df_Tiki_product_kmeans['Sold'], c=df_Tiki_product_kmeans['Cluster'], cmap='viridis', alpha=0.6, s=100)
plt.title('Phân cụm KMeans dựa trên Giá và Số lượng bán')
plt.xlabel('Giá tiền')
plt.ylabel('Số lượng bán')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
from sklearn.cluster import KMeans

df_Tiki_product_kmeans = df_Tiki_product[['Price', 'Sold']].copy(deep = True)
X = df_Tiki_product_kmeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=3, random_state=42)
df_Tiki_product_kmeans['Cluster'] = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(df_Tiki_product_kmeans['Price'], df_Tiki_product_kmeans['Sold'], c=df_Tiki_product_kmeans['Cluster'], cmap='viridis', alpha=0.6, s=100)
plt.title('Phân cụm KMeans dựa trên Giá và Số lượng bán')
plt.xlabel('Giá tiền')
plt.ylabel('Số lượng bán')
plt.colorbar(label='Cluster')
plt.show()

In [None]:
df_Tiki_product_kmeans[df_Tiki_product_kmeans['Cluster'] == 0].describe()

In [None]:
df_Tiki_product_kmeans[df_Tiki_product_kmeans['Cluster'] == 1].describe()

In [None]:
df_Tiki_product_kmeans[df_Tiki_product_kmeans['Cluster'] == 2]

In [None]:
df_Tiki_product['Product_Object'] = df_Tiki_product_kmeans['Cluster']

In [None]:
df_Tiki_product['Product_Object'] = df_Tiki_product['Product_Object'].apply(lambda x: 1 if x == 0 else 2 if x == 1 else 3 if x == 2 else x)

In [None]:
df_Tiki_product.to_csv('cleaned_tiki_products.csv', index=False)

### df_Tiki_feedback

In [None]:
df_Tiki_feedback.info()

In [None]:
set(list(df_Tiki_feedback['Product_Url']))

In [None]:
len(set(list(df_Tiki_feedback['Product_Url'])))

In [None]:
feedbacks = df_Tiki_feedback['Content']

In [None]:
import re
def text_preprocess(text):
    """Normalize a review and split it into lowercase tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, then the text is split on runs of whitespace.

    Args:
        text: The raw review. Non-string values (e.g. NaN/None for a missing
            review) are treated as an empty review instead of raising.

    Returns:
        list[str]: The tokens, possibly empty.
    """
    if not isinstance(text, str):
        # Missing content: return an empty token list so the result stays
        # aligned row-for-row with the source column.
        return []
    # A single substitution is enough: the punctuation listed explicitly in the
    # former first pass (: , . / ? # ( )) is already covered by [^\w\s].
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # str.split() with no argument collapses whitespace runs and trims the ends.
    return text.split()

In [None]:
feedbacks = [text_preprocess(fb) for fb in feedbacks]

Run this cell if the computer hasn't installed the gensim library yet

In [None]:
# !pip install gensim

In [None]:
from gensim.models import Word2Vec
model = Word2Vec(feedbacks, vector_size=100, window=10, min_count=1, workers=4)

In [None]:
similar_words = model.wv.most_similar('tốt', topn=20)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('đẹp', topn=20)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('tệ', topn=10)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('xấu', topn=10)
print(similar_words)

In [None]:
similar_words = model.wv.most_similar('ổn', topn=5)
print(similar_words)

In [None]:
from nltk import ngrams

positive_words = ['tuyệt vời', 'hài lòng', 'thích', 'đẹp', 'mát', 'xinh', 'mát', 'dày dặn', 'ưng ý', 'lý tưởng', 'hấp dẫn', 'hoàn hảo', 'thoải mái', 'mê', 'tốt', 'ưng', 'bền', 'vừa ý', 'dễ thương', 'cute', 'đáng mua', 'nên mua']
negative_words = ['không hài lòng', 'thất vọng', 'kém', 'xấu', 'tệ', 'gớm', 'chán', 'nóng', 'lỗi', 'khó chịu', 'kì', 'cứng', 'thấy ghê', 'hơi', 'mắc', 'đắt', 'không đúng', 'ko đúng', 'vớ vẩn', 'ni lông', 'nilon', 'không thích', 'ko thích', 'ko ưng', 'phải chi', 'thô']

# Phrase -> polarity lookup (+1 positive, -1 negative). Positive entries are
# written last so they win if a phrase ever appears in both lists, matching the
# positive-first order of the original checks.
_phrase_polarity = {phrase: -1 for phrase in negative_words}
_phrase_polarity.update({phrase: 1 for phrase in positive_words})
# Longest lexicon phrase, in tokens (3 for 'không hài lòng').
_max_phrase_len = max(len(phrase.split()) for phrase in _phrase_polarity)

def calculate_satisfaction_score(review):
    """Score a tokenized review against the sentiment lexicon.

    The tokens are scanned left to right and, at each position, the LONGEST
    lexicon phrase starting there is matched and its tokens are consumed.
    Each token therefore contributes to at most one phrase, which fixes three
    problems of scoring unigrams and bigrams independently:

    * three-word phrases such as 'không hài lòng' could never match (and the
      review was scored +1 through the bigram 'hài lòng');
    * negated phrases such as 'không thích' cancelled out to 0 because the
      unigram 'thích' was counted as well;
    * phrases such as 'ưng ý' were counted twice ('ưng' and 'ưng ý').

    Args:
        review: Iterable of tokens (as produced by ``text_preprocess``).

    Returns:
        int: Number of positive phrases minus number of negative phrases.
    """
    tokens = list(review)
    n_tokens = len(tokens)
    score = 0
    pos = 0
    while pos < n_tokens:
        # Try the longest candidate first so 'không thích' beats 'thích'.
        for size in range(min(_max_phrase_len, n_tokens - pos), 0, -1):
            polarity = _phrase_polarity.get(' '.join(tokens[pos:pos + size]))
            if polarity is not None:
                score += polarity
                pos += size
                break
        else:
            # No lexicon phrase starts at this token.
            pos += 1

    return score

satisfaction_scores = [calculate_satisfaction_score(fb) for fb in feedbacks]
df_Tiki_feedback['SentScore'] = satisfaction_scores

In [None]:
df_Tiki_feedback[['Content', 'SentScore']]

In [None]:
def get_review_vector(review):
    """Return the mean embedding of a tokenized review.

    Only tokens present in ``model.wv`` contribute. When none of the tokens is
    known, a zero vector of length ``model.vector_size`` is returned.
    """
    embeddings = []
    for token in review:
        if token in model.wv:
            embeddings.append(model.wv[token])
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

In [None]:
def classify_score(score):
    """Turn a sentiment score into a class id.

    Returns 0 for a negative score, 1 for a score equal to zero and 2 for
    anything else.
    """
    if score < 0:
        return 0
    return 1 if score == 0 else 2

In [None]:
labels = df_Tiki_feedback['SentScore'].apply(classify_score)

In [None]:
df_Tiki_feedback['Label'] = labels

In [None]:
df_Tiki_feedback['Label'].value_counts()

In [None]:
# value_counts() orders classes by frequency, but the tick labels below assume
# the fixed order 0 (negative), 1 (neutral), 2 (positive). Reindex so each bar
# sits under its own label; a class with no reviews is drawn as an empty bar.
label_counts_by_class = df_Tiki_feedback['Label'].value_counts().reindex([0, 1, 2], fill_value=0)
label_counts_by_class.plot(kind='bar', color='skyblue')
plt.title('Số lượng đánh giá theo mức độ hài lòng')
plt.xlabel('Mức độ hài lòng')
plt.ylabel('Số lượng đánh giá')
plt.xticks(ticks=[0, 1, 2], labels=['Tiêu cực', 'Trung tính', 'Tích cực'], rotation=0)
plt.show()

Run this cell if the computer hasn't installed the wordcloud library yet

In [None]:
# !pip install wordcloud

In [None]:
from wordcloud import WordCloud

positive_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 2]['Content'])
neutral_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 1]['Content'])
negative_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 0]['Content'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_comments)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud cho Đánh Giá Tích Cực')
plt.show()

In [None]:
from wordcloud import WordCloud

positive_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 2]['Content'])
neutral_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 1]['Content'])
negative_comments = ' '.join(df_Tiki_feedback[df_Tiki_feedback['Label'] == 0]['Content'])

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(negative_comments)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud cho Đánh Giá Tiêu Cực')
plt.show()

In [None]:
df_Tiki_feedback['Length'] = df_Tiki_feedback['Content'].str.split().str.len()
mean_length = df_Tiki_feedback.groupby('Label')['Length'].mean()

colors = ['#FF6F61', '#FFCC6A', '#6D9DC5']
mean_length.plot(kind='bar', color=colors)

plt.title('Độ dài trung bình của đánh giá theo mức độ hài lòng')
plt.xlabel('Mức độ hài lòng')
plt.ylabel('Độ dài trung bình')
plt.xticks(ticks=[0, 1, 2], labels=['Tiêu cực', 'Trung tính', 'Tích cực'], rotation=0)
plt.show()

In [None]:
df_Tiki_feedback['Length'].mean()

In [None]:
mean_length

In [None]:
df_merged = pd.merge(df_Tiki_feedback, df_Tiki_product, left_on='Product_Url', right_on='Url')

In [None]:
df_merged['Category'].value_counts()

In [None]:
# Share of each satisfaction class among the merged Tiki feedback.
label_counts = df_merged['Label'].value_counts()

plt.figure(figsize=(8, 8))
wedges, texts, autotexts = plt.pie(label_counts, autopct='%1.1f%%', startangle=90, colors=plt.cm.Pastel1.colors)

# The wedges follow the order of label_counts (most frequent class first), so
# the legend text is looked up per class rather than hard-coded in the order
# 2, 1, 0, which is only correct when the frequencies happen to rank that way.
label_names = {2: 'Tích cực (2)', 1: 'Trung tính (1)', 0: 'Tiêu cực (0)'}
labels = [label_names.get(label, str(label)) for label in label_counts.index]
plt.legend(wedges, labels, title="Mức độ hài lòng", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

plt.title('Phân bố mức độ hài lòng của khách hàng (%)')
plt.axis('equal')
plt.show()

In [None]:
df_merged['Label'].value_counts()

In [None]:
category_feedback = df_merged.groupby('Category')['Label'].mean().reset_index()
category_feedback.columns = ['Category', 'Average Satisfaction']

pastel_colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9', '#BAE1FF', '#FFC3A0', '#FF677D']
category_feedback.plot(kind='bar', x='Category', y='Average Satisfaction', color=pastel_colors[:len(category_feedback)])
plt.title('Mức độ hài lòng trung bình theo loại sản phẩm')
plt.xlabel('Loại sản phẩm')
plt.ylabel('Mức độ hài lòng trung bình')
plt.xticks(rotation=45)
plt.show()

In [None]:
df_merged.groupby('Category')['Label'].mean()

In [None]:
product_satisfaction = df_merged.groupby('Product_Object')['Label'].mean().reset_index()
product_satisfaction.columns = ['Product_Object', 'Average_Satisfaction']

pastel_colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9', '#BAE1FF', '#FF677D', '#FFC3A0']
product_satisfaction.plot(kind='bar', x='Product_Object', y='Average_Satisfaction', color=pastel_colors[:len(product_satisfaction)])
plt.title('Mức độ hài lòng trung bình của từng đối tượng sản phẩm')
plt.xlabel('Đối tượng khách hàng')
plt.ylabel('Mức độ hài lòng trung bình')
plt.xticks(ticks=[0, 1, 2], labels=['Đối tượng 1', 'Đối tượng 2', 'Đối tượng 3'], rotation=0)
plt.show()

In [None]:
df_merged['Label'].mean()

In [None]:
product_satisfaction

In [None]:
# Feedback count per (product object, satisfaction class). All three class
# columns are forced to exist, in the order 0, 1, 2, so the pie labels below
# always line up even when a class never occurs in the data.
label_distribution = (
    df_merged.groupby(['Product_Object', 'Label']).size()
    .unstack()
    .fillna(0)
    .reindex(columns=[0, 1, 2], fill_value=0)
)

labels = ['Tiêu cực', 'Trung tính', 'Tích cực']
colors = ['#FFB3BA', '#FFDFBA', '#BAFFC9']

# One pie per product object actually present (instead of assuming exactly 3);
# squeeze=False keeps `axes` indexable when there is a single object.
n_objects = max(len(label_distribution.index), 1)
fig, axes = plt.subplots(1, n_objects, figsize=(6 * n_objects, 6), squeeze=False)
axes = axes.ravel()

for i, Product_Object in enumerate(label_distribution.index):
    axes[i].pie(label_distribution.loc[Product_Object], 
                labels=labels, 
                autopct='%1.1f%%', 
                startangle=90, 
                colors=colors)
    axes[i].set_title(f'Đối tượng {Product_Object}')

plt.suptitle('Phân bố mức độ hài lòng của từng đối tượng sản phẩm')
plt.tight_layout()
plt.show()

In [None]:
label_distribution

In [None]:
len(df_merged[df_merged['Product_Object'] == 1])

In [None]:
len(df_merged[df_merged['Product_Object'] == 2])

In [None]:
len(df_merged[df_merged['Product_Object'] == 3])

In [None]:
df_Tiki_feedback

In [None]:
df_Tiki_feedback.to_csv('cleaned_tiki_feedbacks.csv', index=False)

# Comprehensive analysis across both datasets (Lazada and Tiki)

In [None]:
df_Laz_product = pd.read_csv('cleaned_lazada_products.csv')
df_Laz_feedback = pd.read_csv('cleaned_lazada_feedbacks.csv')
df_Tiki_product = pd.read_csv('cleaned_tiki_products.csv')
df_Tiki_feedback = pd.read_csv('cleaned_tiki_feedbacks.csv')

In [None]:
df_Laz_product

In [None]:
df_Laz_feedback

In [None]:
df_Tiki_product

In [None]:
df_Tiki_feedback

In [None]:
# Tag each product with its source platform and stack both catalogues.
df_Laz_product['Platform'] = 'Lazada'
df_Tiki_product['Platform'] = 'Tiki'
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# both inputs keep their own 0-based labels and the result has duplicate row
# labels, which breaks label-based selection and alignment.
df_product = pd.concat([df_Laz_product, df_Tiki_product], ignore_index=True)

In [None]:
# Tag each review with its source platform and stack both feedback sets.
df_Laz_feedback['Platform'] = 'Lazada'
df_Tiki_feedback['Platform'] = 'Tiki'
# ignore_index=True gives the combined frame a fresh 0..n-1 index; otherwise
# both inputs keep their own 0-based labels and the result has duplicate row
# labels, which breaks label-based selection and alignment.
df_feedback = pd.concat([df_Laz_feedback, df_Tiki_feedback], ignore_index=True)

In [None]:
df_feedback

In [None]:
df_product

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.boxplot(x='Platform', y='Sold', data=df_product, palette=colors)
plt.title('Phân phối số lượng bán giữa Lazada và Tiki')
plt.ylabel('Số lượng bán')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.boxplot(x='Platform', y='Price', data=df_product, palette=colors)
plt.title('Phân phối giá sản phẩm giữa Lazada và Tiki')
plt.ylabel('Giá sản phẩm (VNĐ)')
plt.show()

In [None]:
# Mean satisfaction class (Label: 0 negative, 1 neutral, 2 positive) per platform.
avg_label = df_feedback.groupby('Platform')['Label'].mean().reset_index()

plt.figure(figsize=(8, 5))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.barplot(x='Platform', y='Label', data=avg_label, palette=colors)
plt.title('Mức độ hài lòng trung bình giữa Lazada và Tiki')
# The plotted value is the mean of 'Label', not of 'SentScore', so the axis is
# labelled accordingly.
plt.ylabel('Label trung bình (0: Tiêu cực, 1: Trung tính, 2: Tích cực)')
plt.show()

In [None]:
avg_label

In [None]:
df_feedback['Label'].mean()

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.countplot(x='Label', hue='Platform', data=df_feedback, palette=colors)
plt.title('Phân phối mức độ hài lòng (Label) giữa Lazada và Tiki')
plt.xlabel('Label (0: Tiêu cực, 1: Trung tính, 2: Tích cực)')
plt.ylabel('Số lượng đánh giá')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.scatterplot(x='Price', y='Sold', hue='Platform_y', data=df_product.merge(df_feedback, left_on='Url', right_on='Product_Url'), palette=colors)
plt.title('Mối quan hệ giữa số lượng bán và giá sản phẩm')
plt.xlabel('Giá sản phẩm')
plt.ylabel('Số lượng bán')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.scatterplot(x='Sold', y='SentScore', hue='Platform_y', data=df_product.merge(df_feedback, left_on='Url', right_on='Product_Url'), palette=colors)
plt.title('Mối quan hệ giữa số lượng bán và điểm đánh giá')
plt.xlabel('Số lượng bán')
plt.ylabel('SentScore')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
colors = {'Lazada': '#A3C1F0', 'Tiki': '#F7B799'}
sns.scatterplot(x='Length', y='SentScore', hue='Platform', data=df_feedback, palette=colors)
plt.title('Mối quan hệ giữa độ dài đánh giá và điểm SentScore')
plt.xlabel('Độ dài đánh giá (Length)')
plt.ylabel('SentScore')
plt.show()

In [None]:
from scipy.stats import skew

# Skewness of the SentScore distribution on each platform.
skewness_lazada = skew(df_Laz_feedback['SentScore'])
skewness_tiki = skew(df_Tiki_feedback['SentScore'])

# Overlaid kernel density estimates; the legend carries each platform's skewness.
# `fill=True` is the supported spelling of the deprecated `shade=True` argument.
plt.figure(figsize=(10, 6))
sns.kdeplot(df_Laz_feedback['SentScore'], label=f'Lazada (Skewness: {skewness_lazada:.2f})', fill=True, color='blue')
sns.kdeplot(df_Tiki_feedback['SentScore'], label=f'Tiki (Skewness: {skewness_tiki:.2f})', fill=True, color='orange')
plt.title('Phân phối SentScore giữa Lazada và Tiki')
plt.xlabel('SentScore')
plt.ylabel('Mật độ')
plt.legend()
plt.show()

In [None]:
# Display the skewness of the Lazada SentScore values.
skewness_lazada

In [None]:
# Display the skewness of the Tiki SentScore values.
skewness_tiki

In [None]:
# Inner join of products and reviews on the product URL: one row per (product, review) pair;
# products without reviews and reviews without a matching product are left out.
df_merged = df_product.merge(df_feedback, left_on='Url', right_on='Product_Url', how='inner')

In [None]:
# Drop the listed columns from the merged frame; errors='ignore' skips any name that is
# not present, so re-running this cell does not fail.
df_merged = df_merged.drop(columns=['Url', 'Product_Url', 'Product_Object', 'Platform_x'], errors='ignore')

In [None]:
# Display the merged product/review frame.
df_merged

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Independent copy of the two clustering features, so df_merged is not modified below.
df_kmeans = df_merged.loc[:, ['Price', 'Sold']].copy(deep=True)
X = df_kmeans

# Standardise the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two clusters with a fixed seed; the assigned cluster id is stored next to the raw features.
kmeans = KMeans(n_clusters=2, random_state=42)
df_kmeans['Cluster'] = kmeans.fit_predict(X_scaled)

# Scatter in the original (unscaled) units, coloured by cluster id.
plt.figure(figsize=(10, 6))
plt.scatter(
    df_kmeans['Price'],
    df_kmeans['Sold'],
    c=df_kmeans['Cluster'],
    cmap='viridis',
    alpha=0.6,
    s=100,
)
plt.gca().set(
    title='Phân cụm KMeans dựa trên Giá và Số lượng bán',
    xlabel='Giá tiền',
    ylabel='Số lượng bán',
)
plt.colorbar(label='Cluster')
plt.show()

In [None]:
# Summary statistics of the rows assigned to cluster 1.
df_kmeans.loc[df_kmeans['Cluster'] == 1].describe()

In [None]:
# Summary statistics of the rows assigned to cluster 0.
df_kmeans.loc[df_kmeans['Cluster'] == 0].describe()

In [None]:
import re
import unicodedata

def text_preprocess(text):
    """Normalise a piece of text for token-based analysis.

    The text is lower-cased, composed to Unicode NFC, stripped of every
    character that is neither a word character nor whitespace (each one is
    replaced by a space), and has runs of whitespace collapsed to a single
    space with leading/trailing whitespace removed.

    Parameters
    ----------
    text : str or None or float
        The raw text. ``None`` and float NaN (missing values) yield ``''``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing values would otherwise fail on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Compose to NFC so that decomposed accents/tone marks (base letter followed by a
    # combining mark) become single letters; combining marks are not matched by `\w`
    # and would otherwise be replaced by spaces, splitting the word.
    text = unicodedata.normalize('NFC', str(text).lower())
    # Any character that is not a word character or whitespace becomes a space
    # (this also covers the punctuation : , . / ? # ( ) ).
    text = re.sub(r'[^\w\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Clean every review text in place with text_preprocess.
df_feedback['Content'] = list(map(text_preprocess, df_feedback['Content']))

In [None]:
# Display the Content column of df_feedback.
df_feedback['Content']

In [None]:
from collections import Counter

# Tally every whitespace-separated token across all review texts.
word_counts = Counter(
    df_feedback['Content']
    .str.split(expand=True)  # one column per token position
    .stack()                 # flatten into a single Series of tokens
)
print(word_counts.most_common(10))

In [None]:
# Bar chart of the 10 most frequent tokens.
sns.set_style("white")

word_counts_df = pd.DataFrame(
    word_counts.most_common(10),
    columns=['Word', 'Frequency'],
)

sns.barplot(data=word_counts_df, x='Word', y='Frequency', palette='pastel')
plt.xticks(rotation=45)
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix over unigrams and bigrams of the review texts;
# row k of X corresponds to the k-th row (by position) of df_feedback['Content'].
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df_feedback['Content'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first review (row 0 of X) and every review, row 0 included.
cosine_similarities = cosine_similarity(X[0], X).flatten()
# Row positions of the 5 highest similarities, most similar first.
top_indices = cosine_similarities.argsort()[-5:][::-1]

for i in top_indices:
    # top_indices are row positions in X, so the text is fetched by position (.iloc);
    # a plain [i] is a label lookup and picks the wrong row (or raises) whenever the
    # DataFrame index is not 0..n-1.
    print(df_feedback['Content'].iloc[i])

In [None]:
# Clean every product name in place with text_preprocess.
df_product['Name'] = list(map(text_preprocess, df_product['Name']))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF matrix over unigrams and bigrams of the product names;
# row k of X corresponds to the k-th row (by position) of df_product['Name'].
# Note: this rebinds `vectorizer` and `X`, replacing the review-text versions.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(df_product['Name'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the first product name (row 0 of X) and every name, row 0 included.
cosine_similarities = cosine_similarity(X[0], X).flatten()
# Row positions of the 5 highest similarities, most similar first.
top_indices = cosine_similarities.argsort()[-5:][::-1]

for i in top_indices:
    # top_indices are row positions in X, so the name is fetched by position (.iloc);
    # a plain [i] is a label lookup and picks the wrong row (or raises) whenever the
    # DataFrame index is not 0..n-1.
    print(df_product['Name'].iloc[i])

In [None]:
# Display the row positions of the 5 most similar product names.
top_indices

In [None]:
from wordcloud import WordCloud

# Join every review into one string and render it as a word cloud.
text = " ".join(df_feedback['Content'])
wordcloud = WordCloud(
    width=1200,
    height=800,
    background_color='white',
    colormap='twilight_shifted',
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes around the image
plt.show()