In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import unicodedata
from IPython.display import Image, display
from wordcloud import WordCloud

# DATA EXTRACTION AND PRE-PROCESSING

## Product dataframe 

In [None]:
df_product = pd.read_csv(r'C:\Users\ASUS\Desktop\T\ĐAN_KLTN\getdata\combined_data.csv')
df_product

In [None]:
df_product.info()

In [None]:
df_product.duplicated().sum()

In [None]:
df_product = df_product.drop_duplicates()

In [None]:
df_product = df_product.drop(columns=['countReviews'])
df_product

In [None]:
df_product.loc[:, 'discounts'] = df_product['discounts'].apply(lambda x: unicodedata.normalize('NFC', str(x)) if pd.notna(x) else x)

In [None]:
df_product['discounts'] = df_product['discounts'].str.extract(r'(\d+)').astype(float) / 100
df_product

In [None]:
df_product.loc[:, 'discounts'] = df_product['discounts'].fillna(0)

In [None]:
df_product['discounts'].median()

In [None]:
df_product.loc[df_product['discounts'] >= 1, 'discounts'] = df_product['discounts'].median()
df_product['discounts'].describe()

In [None]:
df_product.loc[:, 'img'] = 'C:/Users/ASUS/Desktop/T/ĐAN_KLTN/getImages/' + df_product.index.astype(str) + '.jpg'
df_product

In [None]:
for path in df_product.loc[2790:2795, 'img']:
    display(Image(filename=path))

In [None]:
df_product.loc[:, 'titles'] = df_product['titles'].apply(lambda x: unicodedata.normalize('NFC', x))

In [None]:
len(df_product[df_product['prices'] <= 0])

In [None]:
df_product[df_product['prices'] <= 0]

In [None]:
# Remove every row whose price is zero or negative by condition rather than by
# a hard-coded index label: the filter stays correct if the CSV is re-exported
# and the cell can be re-run (dropping a label that is already gone raises
# KeyError). Rows with a missing price are kept, as before.
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df_product = df_product[~(df_product['prices'] <= 0)].copy()

In [None]:
len(df_product[df_product['countSales'] < 0])

### Classifying products from images and titles (attempt failed; the manual keyword approach below is used instead)

In [None]:
csv_path = 'C:/Users/ASUS/Desktop/T/ĐAN_KLTN/kaggle_clothes_train/images.csv'
img_dir = 'C:/Users/ASUS/Desktop/T/ĐAN_KLTN/kaggle_clothes_train/images_compressed'

df_train_img = pd.read_csv(csv_path)
df_train_img

In [None]:
import os
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

IMG_SIZE = (64, 64)

# Build the training set: one flattened RGB pixel vector (resized to IMG_SIZE)
# and one label per image that can be read.
X_train, y_train = [], []
for image_id, label in zip(df_train_img['image'], df_train_img['label']):
    img_path = os.path.join(img_dir, image_id + '.jpg')
    try:
        img = Image.open(img_path).convert('RGB').resize(IMG_SIZE)
        img_array = np.array(img).flatten()
        X_train.append(img_array)
        y_train.append(label)
    except Exception as exc:
        # Catch Exception (not a bare except) so the loop can still be
        # interrupted from the keyboard, and report why the image was skipped.
        print(f"Lỗi đọc ảnh train: {img_path} ({exc!r})")

In [None]:
X_train

In [None]:
y_train

In [None]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train_enc)

X_predict = []
valid_img_names = []
failed_imgs = []

In [None]:
# Start from empty lists so re-running this cell does not append duplicates.
X_predict = []
valid_img_names = []
failed_imgs = []

for img_path in df_product['img']:
    # The 'img' column already stores the full path of each product image, so
    # it is opened as-is. Joining it onto img_dir (the training-image folder)
    # only worked because os.path.join discards earlier segments when a later
    # one is absolute, which for a 'C:/...' path is true on Windows only.
    try:
        img = Image.open(img_path).convert('RGB').resize(IMG_SIZE)
        img_array = np.array(img).flatten()
        X_predict.append(img_array)
        valid_img_names.append(img_path)
    except Exception as exc:
        # Catch Exception (not a bare except) so the loop stays interruptible,
        # and keep the reason next to the failing path.
        print(f"Lỗi đọc ảnh predict: {img_path} ({exc!r})")
        failed_imgs.append(img_path)

In [None]:
y_pred_enc = clf.predict(X_predict)
y_pred_labels = le.inverse_transform(y_pred_enc)

df_pred = pd.DataFrame({'img': valid_img_names, 'category': y_pred_labels})
df_product = df_product.merge(df_pred, on='img', how='left')

print(df_product[['img', 'category']].head())

In [None]:
df_product['category'].value_counts()

In [None]:
df_product.loc[:, 'category'] = df_product['category'].fillna('Not sure')

In [None]:
df_known = df_product[df_product['category'] != 'Not sure']
df_unknown = df_product[df_product['category'] == 'Not sure']

In [None]:
df_known

In [None]:
df_unknown

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000)
X_known = vectorizer.fit_transform(df_known['titles'].astype(str))
X_unknown = vectorizer.transform(df_unknown['titles'].astype(str))

le_nlp = LabelEncoder()
y_known = le_nlp.fit_transform(df_known['category'])

In [None]:
# Train a title-based classifier on the rows that received an image-based
# category, then use it to fill in the rows still marked 'Not sure'.
clf_nlp = RandomForestClassifier(n_estimators=100, random_state=42)
clf_nlp.fit(X_known, y_known)

# Only predict when there is something left to label: scikit-learn estimators
# raise on an input with zero samples.
if X_unknown.shape[0] > 0:
    y_pred_nlp = clf_nlp.predict(X_unknown)
    predicted_labels = le_nlp.inverse_transform(y_pred_nlp)
    # X_unknown was built from the 'Not sure' rows in frame order, so the
    # predictions line up with this mask row for row.
    df_product.loc[df_product['category'] == 'Not sure', 'category'] = predicted_labels

In [None]:
df_product['category'].value_counts()

In [None]:
def show_images_with_categories(df, img_dir, n=12):
    """Plot up to ``n`` product images in a 4-column grid, each titled with its category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an 'img' column (joined onto ``img_dir`` with
        ``os.path.join``) and a 'category' column. The index is reset
        internally, so the first ``n`` rows are shown whatever the caller's
        index looks like.
    img_dir : str
        Directory prefix for the 'img' values.
    n : int, default 12
        Maximum number of images to draw.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. Images that cannot be
        opened are reported on stdout and their grid cell is left empty.
    """
    df = df.reset_index(drop=True)
    total = min(n, len(df))
    if total <= 0:
        # Nothing to draw; do not create a zero-row figure.
        return

    cols = 4
    rows = -(-total // cols)  # ceiling division: enough rows for every image

    plt.figure(figsize=(4 * cols, 4 * rows))

    for i in range(total):
        img_name = df.loc[i, 'img']
        category = df.loc[i, 'category']
        img_path = os.path.join(img_dir, img_name)

        try:
            img = Image.open(img_path).convert('RGB')
            plt.subplot(rows, cols, i + 1)
            plt.imshow(img)
            plt.axis('off')
            plt.title(category, fontsize=10)
        except Exception as exc:
            # Catch Exception (not a bare except) so KeyboardInterrupt still
            # propagates, and show the reason the image was skipped.
            print(f"Lỗi khi mở ảnh: {img_path} ({exc!r})")

    plt.tight_layout()
    plt.show()

In [None]:
show_images_with_categories(df_product, img_dir, n=100)

### Classifying products from titles (manual)

In [None]:
# Keyword lookup used by classify_from_title: category name -> list of
# lower-case keywords (English and Vietnamese).
#
# Matching notes (see classify_from_title):
#   * A keyword matches when it occurs anywhere in the lower-cased title, i.e.
#     it is a substring test, not a whole-word test. Broad keywords such as
#     'áo', 'quần', 'bộ' or 'set' therefore match many titles.
#   * Categories are tried in the order written here and classification stops
#     after three matches, so the order of the entries affects the result.
category_keywords = {
    'T-Shirt': ['t-shirt', 'tee', 'áo thun'],
    'Pants': ['pants', 'quần dài', 'trousers', 'jeans', 'quần'],
    'Longsleeve': ['longsleeve', 'dài tay', 'áo tay dài', 'áo dài'],
    'Shoes': ['giày', 'shoes', 'sneaker', 'loafers', 'boot', 'dép', 'dép lê', 'sục', 'boost', 'bốt', 'high heels', 'guốc', 'cao gót', 'sandal', 'xăng đan'],
    'Dress': ['váy', 'dress', 'đầm', 'đầm xòe'],
    'Shirt': ['shirt', 'sơ mi', 'áo sơ mi', 'áo'],
    'Hoodie': ['hoodie'],
    'Outwear': ['áo khoác', 'khoác', 'jacket', 'coat', 'outerwear'],
    'Shorts': ['short', 'quần short', 'quần đùi'],
    'Body': ['body', 'bodysuit'],
    'Hat': ['hat', 'mũ', 'nón', 'rộng vành', 'lưỡi trai', 'tóc'],
    'Undershirt': ['undershirt', 'áo lót', 'áo trong', 'bra'],
    'Skirt': ['skirt', 'chân váy', 'váy tennis', 'váy xòe', 'xòe'],
    'Blazer': ['blazer'],
    'Set': ['set', 'sét', 'đồ ngủ', 'pijama', 'đồ bộ', 'bộ đồ', 'đồ lam', 'bộ', 'pháp phục', 'jump', 'đồng phục']
}

In [None]:
import re

def _normalize_title_text(text):
    """Return ``text`` lower-cased, with punctuation replaced by spaces, astral-plane characters removed and whitespace collapsed."""
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  # remove emojis
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()


def classify_from_title(title, keywords_by_category=None):
    """Assign up to three categories to a product title by keyword lookup.

    Parameters
    ----------
    title : str
        Product title. Any non-string value (e.g. NaN from a missing title)
        yields an empty list.
    keywords_by_category : dict[str, list[str]], optional
        Mapping of category name to keywords. Defaults to the module-level
        ``category_keywords``.

    Returns
    -------
    list[str]
        Matching category names in the mapping's iteration order, at most three.

    Notes
    -----
    Keywords go through the same normalization as the title. Previously the
    title had its punctuation stripped while the keywords did not, so a
    keyword containing punctuation such as 't-shirt' could never match.
    Matching is still a substring test on the normalized text.
    """
    if keywords_by_category is None:
        keywords_by_category = category_keywords
    if not isinstance(title, str):
        return []

    title_lower = _normalize_title_text(title)
    categories_found = []

    for category, keywords in keywords_by_category.items():
        normalized_keywords = (_normalize_title_text(keyword) for keyword in keywords)
        # Skip keywords that normalize to '' (they would match every title).
        if any(keyword and keyword in title_lower for keyword in normalized_keywords):
            categories_found.append(category)

        if len(categories_found) == 3:  # maximum 3 categories
            break

    return categories_found

In [None]:
# Only the title is needed, so apply on the column instead of building a row
# object per product with DataFrame.apply(axis=1).
df_product['category'] = df_product['titles'].apply(classify_from_title)
# Each cell holds a list, and lists are unhashable, so value_counts() cannot
# count them directly; count the category combinations as tuples instead.
df_product['category'].apply(tuple).value_counts()

In [None]:
df_product[df_product['category'].apply(lambda x: len(x) == 0)]

In [None]:
df_product['category'] = df_product['category'].apply(lambda x: ['Not sure'] if isinstance(x, list) and len(x) == 0 else x)

### Overview

In [None]:
df_product.info()

In [None]:
df_product

## Product comments dataframe

In [None]:
df_cmt = pd.read_csv(r'C:\Users\ASUS\Desktop\T\ĐAN_KLTN\getcomment\combined_data.csv')
df_cmt

In [None]:
df_cmt.info()

In [None]:
# Drop the three unneeded comment columns in a single call.
df_cmt = df_cmt.drop(columns=['like_count', 'stt', 'name_comment'])
df_cmt

In [None]:
df_cmt['star_count'] = pd.to_numeric(df_cmt['star_count'], errors='coerce').fillna(0).astype(int)

In [None]:
df_cmt[(df_cmt['star_count'] < 0) | (df_cmt['star_count'] > 5)]

In [None]:
df_cmt['content_comment'].isna().sum()

In [None]:
df_cmt = df_cmt.dropna(subset=['content_comment'])

In [None]:
df_cmt.loc[:, 'content_comment'] = df_cmt['content_comment'].apply(lambda x: unicodedata.normalize('NFC', x) if isinstance(x, str) else x)
df_cmt

In [None]:
df_cmt.info()

# VISUALIZE ATTRIBUTES IN DATAFRAMES

## Product dataframe 

In [None]:
df_product.describe()

In [None]:
df_product['prices'].describe().apply(lambda x: format(x, ',.0f'))

In [None]:
price_bins = [0, 100000, 500000, 1000000, float('inf')]
price_labels = ['0-100k', '100k-500k', '500k-1M', 'Above 1M']
df_product['price_range'] = pd.cut(df_product['prices'], bins=price_bins, labels=price_labels, right=False)

In [None]:
price_range_count = df_product['price_range'].value_counts()
price_range_count

In [None]:
price_range_percentage = (price_range_count / price_range_count.sum()) * 100
price_range_percentage

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
axes[0].pie(
    price_range_percentage,
    labels=[f'{label} ({pct:.1f}%)' for label, pct in zip(price_range_percentage.index, price_range_percentage)],
    startangle=140,
    colors=plt.cm.Pastel1.colors,
    autopct='%1.1f%%'
)
axes[0].set_title('Product Price Range - Pie Chart')
axes[0].axis('equal')

sns.countplot(ax=axes[1], x='price_range', data=df_product, palette='Pastel1', order=price_labels)
axes[1].set_title('Product Price Range - Bar Chart')
axes[1].set_xlabel('Price Range')
axes[1].set_ylabel('Number of Products')
plt.tight_layout()
plt.show()

In [None]:
df_product['discounts'].describe()

In [None]:
# 'discounts' was divided by 100 during pre-processing, so the axis shows a
# fraction (0-1), not a percentage; the label says so explicitly.
plt.figure(figsize=(8, 4))
sns.boxplot(x=df_product['discounts'], color='skyblue')
plt.title('Boxplot of Product Discounts')
plt.xlabel('Discount (as a fraction, e.g., 0.4 = 40%)')
plt.show()

In [None]:
# 'discounts' was divided by 100 during pre-processing, so the axis shows a
# fraction (0-1), not a percentage; the label says so explicitly.
plt.figure(figsize=(10, 6))
sns.histplot(df_product['discounts'], kde=True, color='lightgreen', bins=30)
plt.title('Discount Distribution')
plt.xlabel('Discount (as a fraction, e.g., 0.4 = 40%)')
plt.ylabel('Frequency')
plt.show()

In [None]:
len(df_product[df_product['discounts'] == 0])

In [None]:
df_product['countSales'].describe()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(df_product['countSales'], kde=True, color='orange', bins=30)
plt.title('Product Sales Distribution')
plt.xlabel('Number of Sales')
plt.ylabel('Frequency')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='prices', y='countSales', hue='price_range', data=df_product, palette='viridis', alpha=0.6)
plt.title('Prices vs. Sales Count', fontsize=16)
plt.xlabel('Price', fontsize=12)
plt.ylabel('Sales Count', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
price_above500k_sales_above500 = df_product[(df_product['prices'] > 500000) & (df_product['countSales'] > 500)]
price_above500k_sales_above500

In [None]:
category_counts = df_product.explode('category')['category'].value_counts()
category_counts

In [None]:
unique_categories = df_product.explode('category')['category'].unique()
unique_categories

In [None]:
plt.figure(figsize=(14, 8))
sns.boxplot(data=df_product.explode('category').reset_index(drop=True), x='category', y='discounts', palette='Set3')
plt.title('Distribution of Discounts by Product Category')
plt.xlabel('Category')
plt.ylabel('Discounts')
plt.tight_layout()
plt.show()

In [None]:
df_product.explode('category').groupby('category')['discounts'].describe()

In [None]:
# Number of discounted products per category. The frame is exploded once and
# filtered through a callable, instead of exploding it a second time just to
# build the boolean mask.
discounted_per_category = (
    df_product.explode('category')
    .loc[lambda exploded: exploded['discounts'] > 0.0]
    .groupby('category')
    .size()
)
discounted_per_category

In [None]:
discount_per_category_ratio = (discounted_per_category / category_counts).fillna(0).sort_values(ascending=False) * 100
discount_per_category_ratio

In [None]:
no_discount_per_category_ratio = 100 - discount_per_category_ratio
stacked_ratio_df = pd.DataFrame({'Discounted (%)': discount_per_category_ratio, 'Not Discounted (%)': no_discount_per_category_ratio}).sort_values('Discounted (%)')
stacked_ratio_df.plot(kind='barh', stacked=True, figsize=(10, 7), color=['#4CAF50', '#c1e3c1'])

plt.xlabel('Percentage of Products')
plt.title('Percentage of Discounted vs Non-Discounted Products by Category')
plt.legend(title='Discount Status', loc='lower right')
plt.tight_layout()
plt.show()

In [None]:
import math

# One count plot of price ranges per category, laid out on a 2-column grid.
n_categories = len(unique_categories)
n_cols = 2
n_rows = math.ceil(n_categories / n_cols)
# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row; otherwise plt.subplots returns a 1-D array and 2-D indexing fails.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
flat_axes = axes.flatten()  # row-major, so panel i sits at (i // n_cols, i % n_cols)

for i, category in enumerate(unique_categories):
    # 'category' cells are lists; keep the products whose list contains this category.
    category_data = df_product[df_product['category'].apply(lambda x: category in x)]
    ax = flat_axes[i]
    sns.countplot(ax=ax, x='price_range', data=category_data, palette='Pastel1')
    ax.set_title(f'Price Range Distribution for {category}')
    ax.set_xlabel('Price Range')
    ax.set_ylabel('Number of Products')

# Remove the trailing panels that have no category to show.
for unused_ax in flat_axes[n_categories:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
price_range_category = df_product.explode('category').groupby(['category', 'price_range']).size().reset_index(name='count')
price_range_category

In [None]:
pivot_df = price_range_category.pivot(index='category', columns='price_range', values='count').fillna(0)
pivot_df = pivot_df.loc[pivot_df.sum(axis=1).sort_values(ascending=False).index]

pivot_df.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='Pastel1', edgecolor='black')
plt.title("Product Distribution by Category and Price Range", fontsize=16)
plt.xlabel("Product Category")
plt.ylabel("Number of Products")
plt.xticks(rotation=45)
plt.legend(title="Price Range")
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_product, x='discounts', y='countSales', alpha=0.6)
plt.title('Relationship between Discounts and Sales Count')
plt.xlabel('Discount (as a fraction, e.g., 0.4 = 40%)')
plt.ylabel('Sales Count')
plt.tight_layout()
plt.show()

In [None]:
# Compare average units sold for discounted vs non-discounted products,
# overall (top panel) and per category (bottom panel).
df_product['discounted'] = df_product['discounts'] > 0
avg_sales_overall = df_product.groupby('discounted')['countSales'].mean()
avg_sales_by_category = df_product.explode('category').groupby(['category', 'discounted'])['countSales'].mean().unstack()

# Labels and colours are looked up from the actual group keys, so they stay
# correct even if only one of the two groups is present (hard-coded tick
# labels would silently mislabel the bar in that case).
discount_labels = {False: 'No Discount', True: 'Discount'}
discount_colors = {False: '#a3c9f1', True: '#2166ac'}

fig, axes = plt.subplots(2, 1, figsize=(12, 10))
avg_sales_overall.rename(index=discount_labels).plot(
    kind='bar',
    color=[discount_colors[flag] for flag in avg_sales_overall.index],
    ax=axes[0],
    rot=0,
)
axes[0].set_title('Average Units Sold: Discounted vs Non-Discounted')
axes[0].set_ylabel('Average Units Sold')

# Renaming the columns makes the legend read 'No Discount' / 'Discount'
# instead of the raw False / True keys.
avg_sales_by_category.rename(columns=discount_labels).plot(
    kind='bar',
    ax=axes[1],
    color=[discount_colors[flag] for flag in avg_sales_by_category.columns],
)
axes[1].set_title('Average Sales with and without Discount')
axes[1].set_ylabel('Average Units Sold')
plt.tight_layout()
plt.show()

In [None]:
from wordcloud import WordCloud
titles = ' '.join(df_product['titles'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(titles)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bicubic')
plt.axis('off')
plt.show()

In [None]:
import unicodedata
import nltk
from nltk.tokenize import word_tokenize

def generate_ngrams(string, n):
    """Return every run of ``n`` consecutive lower-cased tokens of ``string``.

    The text is NFC-normalized, tokenized with ``word_tokenize`` and
    lower-cased; each n-gram is returned as its tokens joined by one space,
    in order of appearance.
    """
    normalized = unicodedata.normalize('NFC', string)
    words = [word.lower() for word in word_tokenize(normalized, language='english')]

    ngrams = []
    last_start = len(words) - n
    for start in range(last_start + 1):
        ngrams.append(' '.join(words[start:start + n]))
    return ngrams

In [None]:
from collections import Counter

_1gram = generate_ngrams(titles, 1)
word_counts = Counter(_1gram)
word_counts.most_common(20)

In [None]:
_2grams = generate_ngrams(titles, 2)
_2words_counts = Counter(_2grams)
_2words_counts.most_common(20)

## Product comments dataframe

In [None]:
df_cmt.info()

In [None]:
df_cmt.describe()

In [None]:
star_counts = df_cmt['star_count'].value_counts()
star_counts

In [None]:
# Bar chart of how many comments gave each star rating, with the count
# printed above every bar.
plt.figure(figsize=(10, 6))
ax = sns.countplot(x='star_count', data=df_cmt, palette='Accent')
for p in ax.patches:
    height = p.get_height()
    # Bar heights may come back as floats; format without decimals so the
    # label reads '123' rather than '123.0'.
    ax.annotate(f'{height:.0f}',
                (p.get_x() + p.get_width() / 2., height),
                ha='center', va='bottom', fontsize=10)

plt.title('Star Rating Distribution in Comments')
plt.xlabel('Star Rating')
plt.ylabel('Number of Comments')
plt.show()

In [None]:
df = pd.merge(df_product, df_cmt, left_on='links', right_on='Link', how='inner')
df

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='star_count', y='countSales', data=df, palette='Set3')
plt.title('Sales Distribution by Star Rating')
plt.xlabel('Star Rating')
plt.ylabel('Number of Sales')
plt.show()

In [None]:
text = ' '.join(df_cmt['content_comment'].dropna())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Comments')
plt.show()