# Tải bộ dữ liệu E-commerce từ Kaggle (benroshan/ecommerce-data)

In [None]:
# Cài đặt Kaggle API nếu chưa có
pip install -q kaggle            

# Upload file kaggle.json
from google.colab import files
files.upload()  # Chọn file kaggle.json từ máy bạn

# Tạo thư mục và cấu hình Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Tải dataset
!kaggle datasets download -d benroshan/ecommerce-data
!unzip ecommerce-data.zip -d ecommerce_data


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/benroshan/ecommerce-data
License(s): CC0-1.0
Archive:  ecommerce-data.zip
  inflating: ecommerce_data/List of Orders.csv  
  inflating: ecommerce_data/Order Details.csv  
  inflating: ecommerce_data/Sales target.csv  


In [None]:
import pandas as pd

# Load the order headers and the per-item order lines from the extracted CSVs.
orders = pd.read_csv("ecommerce_data/List of Orders.csv", encoding='ISO-8859-1')
details = pd.read_csv("ecommerce_data/Order Details.csv", encoding='ISO-8859-1')

# Join the two tables on Order ID (pandas' default inner join), keeping the
# order-detail columns first and the order-header columns after them.
df = details.merge(orders, on='Order ID')


In [None]:
# Preview the first five rows of the merged frame (5 is head()'s default).
df.head()

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category,Order Date,CustomerName,State,City
0,B-25601,1275.0,-1148.0,7,Furniture,Bookcases,01-04-2018,Bharat,Gujarat,Ahmedabad
1,B-25601,66.0,-12.0,5,Clothing,Stole,01-04-2018,Bharat,Gujarat,Ahmedabad
2,B-25601,8.0,-2.0,3,Clothing,Hankerchief,01-04-2018,Bharat,Gujarat,Ahmedabad
3,B-25601,80.0,-56.0,4,Electronics,Electronic Games,01-04-2018,Bharat,Gujarat,Ahmedabad
4,B-25602,168.0,-111.0,2,Electronics,Phones,01-04-2018,Pearl,Maharashtra,Pune


In [None]:
from sklearn.preprocessing import StandardScaler

# Remove every row that contains a missing value (mutates df in place).
df.dropna(inplace=True)

# Numeric columns used as clustering features.
features = ['Amount', 'Profit', 'Quantity']
data = df.loc[:, features].copy()

# Standardise the features: fit the scaler on the data, then transform the
# same data with the fitted statistics.
scaler = StandardScaler()
scaler.fit(data)
X_scaled = scaler.transform(data)


In [None]:
# Inspect the standardised feature matrix (NumPy abbreviates the middle rows when displaying it).
X_scaled

array([[ 2.14219766e+00, -6.88396768e+00,  1.49100194e+00],
       [-4.80949337e-01, -1.65420566e-01,  5.75340566e-01],
       [-6.06790963e-01, -1.06278426e-01, -3.40320812e-01],
       ...,
       [ 1.17234927e+00,  1.26581922e+00, -7.98151501e-01],
       [-5.50379199e-01, -3.53078576e-02, -7.98151501e-01],
       [-4.67931238e-01,  1.77426420e-04, -7.98151501e-01]])

In [None]:
!pip install torch torchvision scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Mô hình, thuật toán và đánh giá

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from numpy.linalg import eigh

# Input data: X_scaled from the preprocessing step, converted to a float32 tensor.
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# Fully connected autoencoder that produces the latent embedding used for clustering
class AutoEncoder(nn.Module):
    """MLP autoencoder with one ReLU hidden layer on each side of the bottleneck.

    Args:
        input_dim: Number of input (and reconstructed output) features.
        hidden_dim: Width of the hidden layer in both encoder and decoder.
        latent_dim: Size of the bottleneck embedding.

    Attributes:
        encoder: ``Linear -> ReLU -> Linear`` mapping ``input_dim -> latent_dim``.
        decoder: ``Linear -> ReLU -> Linear`` mapping ``latent_dim -> input_dim``.
    """

    def __init__(self, input_dim, hidden_dim=64, latent_dim=10):
        super().__init__()
        # Layers are created encoder-first so that seeded initialisation
        # consumes the random stream in a fixed order.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        z = self.encoder(x)
        return self.decoder(z), z


In [None]:
def train_autoencoder(model, X, epochs=100, lr=1e-3):
    """Train ``model`` to reconstruct ``X`` using full-batch Adam and MSE loss.

    Args:
        model: Module whose forward pass returns ``(reconstruction, latent)``.
        X: Input tensor; it is used both as the input and the reconstruction target.
        epochs: Number of full-batch optimisation steps.
        lr: Adam learning rate.

    Returns:
        None. The model is updated in place and the loss is printed every 10 epochs.
    """
    adam = optim.Adam(model.parameters(), lr=lr)
    mse = nn.MSELoss()
    for step in range(1, epochs + 1):
        model.train()
        adam.zero_grad()
        reconstruction, _ = model(X)
        loss = mse(reconstruction, X)
        loss.backward()
        adam.step()
        # Report progress on every tenth epoch only.
        if step % 10 == 0:
            print(f"Epoch {step}, Loss: {loss.item():.4f}")


In [None]:
def _latent_features(model, X):
    """Return the model's latent embedding of ``X`` as a NumPy array (no gradients)."""
    model.eval()
    with torch.no_grad():
        _, H = model(X)
    # .cpu() keeps this working when the model and data live on a GPU.
    return H.detach().cpu().numpy()


def _within_cluster_scatter(H_np, labels, k):
    """Return Sw = sum over clusters of (H_c - mu_c)^T (H_c - mu_c)."""
    dim = H_np.shape[1]
    Sw = np.zeros((dim, dim))
    for i in range(k):
        cluster_points = H_np[labels == i]
        if cluster_points.shape[0] == 0:
            # An empty cluster contributes nothing (and has no defined mean).
            continue
        centered = cluster_points - cluster_points.mean(axis=0, keepdims=True)
        centered = centered.astype(np.float64)
        # One matrix product replaces the per-sample outer-product loop.
        Sw += centered.T @ centered
    return Sw


def DEKM(model, X, k=4, Iter=10, n_components=None, lambda_reg=0.1,
         finetune_epochs=30, lr=1e-3, random_state=42):
    """Alternate K-Means on the latent space with scatter-aware fine-tuning.

    Steps:
        1. Pre-train the autoencoder on ``X`` (``train_autoencoder``).
        2. Embed ``X`` and cluster the embedding with K-Means.
        3. Build the within-cluster scatter matrix ``Sw`` of the embedding.
        4. Take the ``n_components`` eigenvectors of ``Sw`` with the smallest
           eigenvalues as the columns of ``V``.
        5. Fine-tune the autoencoder with
           ``L = MSE(out, X) + lambda_reg * mean_i ||h_i - h_i V V^T||^2``,
           which penalises the part of each embedding that lies outside span(V).
        6. Repeat 2-5 ``Iter`` times, then embed and cluster once more so the
           returned values describe the final model.

    Args:
        model: Autoencoder returning ``(reconstruction, latent)``; its
            ``encoder[-1].out_features`` gives the latent dimension.
        X: Input tensor.
        k: Number of clusters.
        Iter: Number of clustering / fine-tuning rounds.
        n_components: Number of smallest-eigenvalue eigenvectors kept in ``V``.
            Defaults to ``latent_dim - 1`` (at least 1), so that only the
            direction of largest within-cluster scatter is penalised. Using all
            ``latent_dim`` eigenvectors makes ``V V^T`` the identity and the
            penalty vanish.
        lambda_reg: Weight of the projection penalty.
        finetune_epochs: Full-batch optimisation steps per round.
        lr: Adam learning rate used during fine-tuning.
        random_state: Seed passed to K-Means.

    Returns:
        ``(labels, H_np)``: cluster labels and the latent embedding they were
        computed from, both taken from the final state of ``model``.

    Raises:
        ValueError: If ``n_components`` is not in ``[1, latent_dim]``.
    """
    train_autoencoder(model, X)  # step 1: initial autoencoder training

    latent_dim = model.encoder[-1].out_features
    if n_components is None:
        n_components = max(1, latent_dim - 1)
    if not 1 <= n_components <= latent_dim:
        raise ValueError(
            f"n_components must be between 1 and {latent_dim}, got {n_components}"
        )

    criterion = nn.MSELoss()

    for it in range(Iter):
        # steps 2-3: latent features and K-Means on them
        H_np = _latent_features(model, X)
        kmeans = KMeans(n_clusters=k, n_init='auto', random_state=random_state)
        labels = kmeans.fit_predict(H_np)

        # step 4: within-cluster scatter matrix
        Sw = _within_cluster_scatter(H_np, labels, k)

        # step 5: eigh returns eigenvalues in ascending order, so the leading
        # columns are the directions of smallest within-cluster scatter.
        _, eigvecs = eigh(Sw)
        V = torch.tensor(eigvecs[:, :n_components], dtype=torch.float32, device=X.device)

        # step 6: fine-tune with reconstruction loss + projection penalty
        optimizer = optim.Adam(model.parameters(), lr=lr)
        total_loss = None
        model.train()
        for _ in range(finetune_epochs):
            optimizer.zero_grad()
            out, H = model(X)
            reconstruction_loss = criterion(out, X)
            residual = H - H @ V @ V.T
            # Squared norm averaged over samples: matches the documented
            # formula and stays on the same scale as the mean-based MSE term.
            constraint_loss = residual.pow(2).sum(dim=1).mean()
            total_loss = reconstruction_loss + lambda_reg * constraint_loss
            total_loss.backward()
            optimizer.step()

        if total_loss is not None:
            print(f"Iteration {it+1}, Total Loss: {total_loss.item():.4f}")

    # Re-embed and re-cluster so the result reflects the fine-tuned model
    # (also defines the return values when Iter == 0).
    H_np = _latent_features(model, X)
    labels = KMeans(n_clusters=k, n_init='auto', random_state=random_state).fit_predict(H_np)

    return labels, H_np


In [None]:
# Seed PyTorch before the model is built: the autoencoder's weights are
# initialised randomly, so without a seed the clusters and every metric below
# change on each Restart & Run All.
torch.manual_seed(42)

input_dim = X_scaled.shape[1]
model = AutoEncoder(input_dim=input_dim)
cluster_labels, H_np = DEKM(model, X_tensor, k=4, Iter=10)

# Attach the cluster assignment to the original dataframe (assigned by position).
df['Cluster'] = cluster_labels


Epoch 10, Loss: 0.9021
Epoch 20, Loss: 0.7297
Epoch 30, Loss: 0.5205
Epoch 40, Loss: 0.3066
Epoch 50, Loss: 0.1817
Epoch 60, Loss: 0.1184
Epoch 70, Loss: 0.0689
Epoch 80, Loss: 0.0326
Epoch 90, Loss: 0.0178
Epoch 100, Loss: 0.0135
Iteration 1, Total Loss: 0.0014
Iteration 2, Total Loss: 0.0006
Iteration 3, Total Loss: 0.0005
Iteration 4, Total Loss: 0.0004
Iteration 5, Total Loss: 0.0003
Iteration 6, Total Loss: 0.0003
Iteration 7, Total Loss: 0.0003
Iteration 8, Total Loss: 0.0003
Iteration 9, Total Loss: 0.0002
Iteration 10, Total Loss: 0.0002


**Chỉ số đánh giá**

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the cluster assignment, measured on the latent embedding
# H_np returned by DEKM rather than on the standardised input features.
# NOTE(review): the encoder was trained to shape this space, so confirm whether
# scoring on X_scaled would be the fairer comparison.
score = silhouette_score(H_np, cluster_labels)
print(f"Silhouette Score: {score:.4f}")


Silhouette Score: 0.4545


In [None]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz index of the cluster assignment, measured on the latent
# embedding H_np returned by DEKM rather than on the standardised input features.
# NOTE(review): the value depends on the scale of the embedding — confirm it is
# only compared against runs that use the same latent space.
ch_score = calinski_harabasz_score(H_np, cluster_labels)
print(f"Calinski-Harabasz Index: {ch_score:.4f}")


Calinski-Harabasz Index: 701.7701


In [None]:
from sklearn.metrics import davies_bouldin_score

# Davies-Bouldin index of the cluster assignment, measured on the latent
# embedding H_np returned by DEKM rather than on the standardised input features.
# NOTE(review): confirm the evaluation space is the intended one before
# comparing against other clustering methods.
db_score = davies_bouldin_score(H_np, cluster_labels)
print(f"Davies-Bouldin Index: {db_score:.4f}")


Davies-Bouldin Index: 0.9349
