In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# I. Load Dataset

In [1]:
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import re
from sklearn.model_selection import train_test_split
import random
# Suppress RuntimeWarning and UserWarning messages from here on.
warnings.filterwarnings("ignore", category=RuntimeWarning)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [None]:
# Read the raw data: the event log, the two item-property parts and the category tree.
events = pd.read_csv("/kaggle/input/ecommerce-dataset/events.csv")
items1 = pd.read_csv("/kaggle/input/ecommerce-dataset/item_properties_part1.csv")
items2 = pd.read_csv("/kaggle/input/ecommerce-dataset/item_properties_part2.csv")
categories = pd.read_csv("/kaggle/input/ecommerce-dataset/category_tree.csv")



In [None]:

# Combine the two item-property parts into a single frame.
items = pd.concat([items1, items2], ignore_index=True)

# Keep only the event types that are mapped to an interaction value later on.
# .copy() detaches the filtered rows from the original frame, so the column
# assignments below write to a real frame rather than to a slice
# (no SettingWithCopyWarning / chained-assignment ambiguity).
events = events[events["event"].isin(["view", "addtocart", "transaction"])].copy()

# Encode visitor ids and item ids as integer labels.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
events["user_id"] = user_encoder.fit_transform(events["visitorid"])
events["item_id"] = item_encoder.fit_transform(events["itemid"])

# Convert the millisecond epoch timestamp to datetime.
events["timestamp"] = pd.to_datetime(events["timestamp"], unit="ms")

# Build an item -> category lookup with exactly ONE row per item.
# An item can carry several "categoryid" property rows; joining all of them
# onto the events would duplicate event rows, so a single category is kept.
category_rows = items[items["property"] == "categoryid"]
if "timestamp" in category_rows.columns:
    # When the snapshot time is available, prefer the most recent assignment.
    category_rows = category_rows.sort_values("timestamp", kind="stable")
items_category = category_rows[["itemid", "value"]].rename(columns={"value": "categoryid"})
items_category["categoryid"] = pd.to_numeric(items_category["categoryid"], errors="coerce").astype("Int64")
items_category = (
    items_category
    .dropna(subset=["categoryid"])
    .drop_duplicates(subset="itemid", keep="last")
)
# Kept so that `items` still exposes a "categoryid" column; one lookup row per
# item means this merge no longer multiplies the property rows.
items = items.merge(items_category, on="itemid", how="left")
items_with_category = items_category[["itemid", "categoryid"]]

# Attach the category to every event.
df_final = events.merge(items_with_category, on="itemid", how="left")
df_final = df_final.rename(columns={"categoryid": "category_id"})

# Fill missing categories with an id strictly greater than every real id.
# A count-based sentinel (nunique() + 1) can coincide with a real category,
# because the raw ids are not guaranteed to be contiguous.
num_categories = df_final["category_id"].nunique()
max_category_id = df_final["category_id"].max()
missing_category_id = 0 if pd.isna(max_category_id) else int(max_category_id) + 1
df_final["category_id"] = df_final["category_id"].fillna(missing_category_id).astype(int)

# Look up the parent category; -1 marks "no parent found".
# Duplicated tree rows are dropped first so the join cannot add event rows.
df_final = df_final.merge(
    categories.drop_duplicates(subset="categoryid"),
    left_on="category_id", right_on="categoryid", how="left"
)
df_final = df_final.rename(columns={"parentid": "parent_category"})
df_final["parent_category"] = df_final["parent_category"].fillna(-1).astype(int)

# Both joins are lookups: they must not change the number of events.
assert len(df_final) == len(events), "category joins changed the number of event rows"

# Product price handling: keep the property rows whose value contains an
# "n<digits>.<digits>" token, which this notebook treats as a price.
items_price_raw = items[items["value"].str.contains(r"n\d+\.\d+", na=False)].copy()
def extract_price(value):
    """Return the first ``n<digits>.<digits>`` number found in ``value`` as a float.

    Returns None when ``value`` contains no such token.
    """
    match = re.search(r"n(\d+\.\d+)", value)
    if match is None:
        return None
    return float(match.group(1))
# Parse the price out of each candidate row and keep one price per item
# (drop_duplicates keeps the first remaining row for every itemid).
items_price_raw["price"] = items_price_raw["value"].apply(extract_price)
items_price = items_price_raw[["itemid", "price"]].dropna().drop_duplicates(subset="itemid")

# Add the price to df_final (items without a parsed price get NaN here).
df_final = df_final.merge(items_price, on="itemid", how="left")

# Add user attributes. Age and gender are randomly generated here, so a fixed
# seed is used to get the same values on every "Restart & Run All".
unique_users = df_final["user_id"].unique()
user_rng = np.random.default_rng(42)
user_info = pd.DataFrame({
    "user_id": unique_users,
    "age": user_rng.integers(18, 66, size=len(unique_users)),   # ages 18-65 (upper bound exclusive)
    "gender": user_rng.integers(0, 2, size=len(unique_users))   # 0: female, 1: male
})
df_final = df_final.merge(user_info, on="user_id", how="left")

# Encode the event type as an ordinal interaction_value (view < addtocart < transaction).
df_final["interaction_value"] = df_final["event"].map({"view": 1, "addtocart": 2, "transaction": 3}).astype(np.float32)

# Min-max scale the age.
# NOTE: the same scaler object is re-fitted by every fit_transform call below,
# so after this cell it only remembers the last column it was fitted on.
scaler = MinMaxScaler()
df_final["age"] = scaler.fit_transform(df_final[["age"]])

# Fill missing prices with the median, then log1p-transform and min-max scale.
median_price = df_final["price"].median()
df_final["price"] = df_final["price"].fillna(median_price)
df_final["price"] = np.log1p(df_final["price"])
df_final["price"] = scaler.fit_transform(df_final[["price"]])

# Keep the original datetime for the cyclical encodings below.
df_final["datetime"] = df_final["timestamp"]

# Extract calendar features from the event time.
df_final["hour"] = df_final["datetime"].dt.hour
df_final["dayofweek"] = df_final["datetime"].dt.dayofweek
df_final["month"] = df_final["datetime"].dt.month
df_final["is_weekend"] = df_final["dayofweek"].isin([5, 6]).astype(int)

# Sine/cosine encoding of hour (period 24) and day of week (period 7).
df_final["hour_sin"] = np.sin(2 * np.pi * df_final["hour"] / 24)
df_final["hour_cos"] = np.cos(2 * np.pi * df_final["hour"] / 24)
df_final["dayofweek_sin"] = np.sin(2 * np.pi * df_final["dayofweek"] / 7)
df_final["dayofweek_cos"] = np.cos(2 * np.pi * df_final["dayofweek"] / 7)

# Recency and sessions, computed per user in chronological order.
df_final = df_final.sort_values(by=["user_id", "timestamp"])
# Integer view of the datetime divided by 1e9
# (seconds, assuming nanosecond resolution - TODO confirm for the installed pandas).
df_final["timestamp_numeric"] = df_final["timestamp"].astype(np.int64) / 1e9
# Gap to the user's previous event; the first event of each user gets 0.
df_final["time_diff"] = df_final.groupby("user_id")["timestamp_numeric"].diff().fillna(0)
# A gap above 1800 (30 minutes if the unit is seconds) starts a new session.
df_final["new_session"] = (df_final["time_diff"] > 1800).astype(int)
# Per-user running session counter; a user's first session has id 0.
df_final["session_id"] = df_final.groupby("user_id")["new_session"].cumsum()

# Min-max scale the continuous timestamp.
df_final["timestamp_norm"] = scaler.fit_transform(df_final[["timestamp_numeric"]])

# Keep only the columns used downstream (helper columns such as "month",
# "new_session" and "timestamp_numeric" are dropped here).
df_final = df_final[[
    "user_id", "item_id", "category_id", "parent_category", "price", "age", "gender",
    "interaction_value", "hour", "dayofweek", "is_weekend",
    "hour_sin", "hour_cos", "dayofweek_sin", "dayofweek_cos",
    "time_diff", "session_id", "timestamp_norm"
]]

print(df_final.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Bar chart: how many users have 1, 2, ..., 30 sessions.
# Users with more than 30 sessions are not shown (the left merge onto the
# 1..30 range drops them).

# Count number of sessions per user
session_counts = df_final.groupby("user_id")["session_id"].nunique().reset_index()
session_counts.columns = ["user_id", "num_sessions"]

# Count how many users have each number of sessions
session_count_values = session_counts["num_sessions"].value_counts().reset_index()
session_count_values.columns = ["num_sessions", "user_count"]

# Create a complete range from 1 to 30
full_range = pd.DataFrame({'num_sessions': range(1, 31)})

# Merge with our actual data to fill in zeros where needed
session_count_values = full_range.merge(session_count_values, 
                                       on='num_sessions', 
                                       how='left').fillna(0)

# Create the plot
plt.figure(figsize=(18, 6))  # Increase figure width for better visibility
ax = sns.barplot(x="num_sessions", 
                 y="user_count", 
                 data=session_count_values,
                 color="lightcoral")

# Customize the plot
plt.title("Phân bố số lượng sessions theo người dùng", fontsize=14, pad=20)
plt.xlabel("Số sessions", fontsize=12, labelpad=10)
plt.ylabel("Số người dùng", fontsize=12, labelpad=10)

# Set x-axis to show all integers from 1 to 30
# (tick positions 0..29 are labelled 1..30)
plt.xticks(range(30), range(1, 31))

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Add grid lines
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

# Remove spines for cleaner look
sns.despine()

# Add value labels on top of bars
for p in ax.patches:
    if p.get_height() > 0:  # Only label bars with height > 0
        ax.annotate(f"{int(p.get_height())}", 
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center', va='center', 
                   xytext=(0, 5), 
                   textcoords='offset points')

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualise the distribution of interaction_value (one bar per event type code).
plt.figure(figsize=(10, 6))
sns.countplot(x="interaction_value", data=df_final, palette="Set2")
plt.title("Phân bố interaction_value", fontsize=14)
plt.xlabel("Interaction Value", fontsize=12)
plt.ylabel("Số lượng", fontsize=12)
plt.show()


In [None]:
# Histogram (with KDE) of the price column as stored in df_final, i.e. after
# the log1p transform and min-max scaling applied during feature engineering.
plt.figure(figsize=(10, 6))
sns.histplot(df_final["price"], kde=True, color="skyblue", bins=30)
plt.title("Phân bố giá sản phẩm", fontsize=14)
plt.xlabel("Giá (log1p)", fontsize=12)
plt.ylabel("Số lượng", fontsize=12)
plt.show()


In [None]:
# Histogram (with KDE) of time_diff, the gap between consecutive events of the same user.
plt.figure(figsize=(10, 6))
sns.histplot(df_final["time_diff"], kde=True, color="orange", bins=50)
plt.title("Phân bố thời gian giữa các hành động (Recency)", fontsize=14)
plt.xlabel("Thời gian giữa các hành động (giây)", fontsize=12)
plt.ylabel("Số lượng", fontsize=12)
plt.show()


In [None]:
# Number of events per day of week (0 = Monday ... 6 = Sunday, as returned by .dt.dayofweek).
plt.figure(figsize=(10, 6))
sns.countplot(x="dayofweek", data=df_final, palette="Blues")
plt.title("Phân bố theo ngày trong tuần", fontsize=14)
plt.xlabel("Ngày trong tuần", fontsize=12)
plt.ylabel("Số lượng", fontsize=12)
plt.xticks(ticks=range(7), labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
plt.show()


In [None]:
# Number of events per hour of the day.
plt.figure(figsize=(10, 6))
sns.countplot(x="hour", data=df_final, palette="coolwarm")
plt.title("Phân bố theo giờ trong ngày", fontsize=14)
plt.xlabel("Giờ trong ngày", fontsize=12)
plt.ylabel("Số lượng", fontsize=12)
plt.show()


# II. Data Preprocessing

# LightFM 

In [None]:
from sklearn.model_selection import train_test_split
# Random row-level split: hold out 20% of the rows as the test set.
train, test = train_test_split(df_final, test_size=0.2, random_state=42)
# Split the remaining 80% into 70% train and 10% validation (of the full data).
train, valid = train_test_split(train, test_size=0.125, random_state=42)  # 0.125 * 80% = 10%

In [2]:
# Column dtypes applied when the split CSV files are read back with pd.read_csv.
dtypes = {
    "user_id": "int32",
    "item_id": "int32",
    "category_id": "int32",
    "parent_category": "int32",
    "price": "float32",
    "age": "float32",
    "gender": "float32",
    "interaction_value": "float32",
    
    # Newer time/session features
    "hour": "float32",
    "dayofweek": "float32",
    "is_weekend": "float32",
    "hour_sin": "float32",
    "hour_cos": "float32",
    "dayofweek_sin": "float32",
    "dayofweek_cos": "float32",
    "time_diff": "float32",
    "session_id": "int32",
    "timestamp_norm": "float32"
}
# for col, dtype in dtypes.items():
#     df_final[col] = df_final[col].astype(dtype)

In [None]:
# train.to_csv("train.csv", index=False)
# valid.to_csv("valid.csv", index=False)
# test.to_csv("test.csv", index=False)

In [3]:
# Load the train/validation/test splits from CSV with the dtypes declared above.
# NOTE(review): these files are presumably the output of the commented-out
# to_csv calls, saved to a different location - confirm the path before running.
train = pd.read_csv("Datasets/NEW_retail/train.csv", dtype=dtypes)
valid = pd.read_csv("Datasets/NEW_retail/valid.csv", dtype=dtypes)
test = pd.read_csv("Datasets/NEW_retail/test.csv", dtype=dtypes)

In [4]:
from tensorflow.keras.utils import to_categorical

# Report the size of each split and preview the training rows.
print(f"Train Shape: {train.shape}")
print(f"Validation Shape: {valid.shape}")
print(f"Test Shape: {test.shape}")

print("Example data:")
print(train.head())




AttributeError: module 'tensorflow._api.v2.compat.v2.__internal__' has no attribute 'register_load_context_function'

In [None]:
# Share of each interaction_value in the training split (class balance).
print(train["interaction_value"].value_counts(normalize=True))

In [None]:
!pip install lightfm

In [None]:
from lightfm import LightFM
from lightfm.data import Dataset
from tqdm import tqdm
import numpy as np
import os
from scipy.sparse import csr_matrix  

# os.cpu_count() may return None; fall back to one thread so min() cannot fail.
num_threads = min(os.cpu_count() or 1, 8)

# Register every user id and item id that appears in any split.
all_user_ids = np.unique(np.concatenate((train["user_id"].unique(),
                                         valid["user_id"].unique(),
                                         test["user_id"].unique())))
all_item_ids = np.unique(np.concatenate((train["item_id"].unique(),
                                         valid["item_id"].unique(),
                                         test["item_id"].unique())))
dataset = Dataset()
dataset.fit(users=all_user_ids, items=all_item_ids)

# Build the interaction matrices. Only the first element returned by
# build_interactions is kept; the second one is discarded.
(interactions_train, _) = dataset.build_interactions(zip(train["user_id"], train["item_id"], train["interaction_value"]))
(interactions_valid, _) = dataset.build_interactions(zip(valid["user_id"], valid["item_id"], valid["interaction_value"]))
(interactions_test, _) = dataset.build_interactions(zip(test["user_id"], test["item_id"], test["interaction_value"]))

# Convert the interactions to CSR sparse matrices.
interactions_train_csr = csr_matrix(interactions_train)
interactions_valid_csr = csr_matrix(interactions_valid)
interactions_test_csr = csr_matrix(interactions_test)

embedding_dim = 128
# random_state fixes the model's initialisation and sampling seed, making
# re-runs of this cell comparable with each other.
model_LightFM = LightFM(loss="warp", no_components=embedding_dim, random_state=42)

num_epochs = 100

# One fit_partial call per epoch so that progress can be reported.
for epoch in tqdm(range(num_epochs), desc="Training Progress"):
    model_LightFM.fit_partial(interactions_train_csr, epochs=1, num_threads=num_threads)
    print(f"Epoch {epoch+1}/{num_epochs} finished!")

print("Training completed!")


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Illustration: truncated SVD of a small toy rating matrix, shown as four heatmaps.

# Build the matrix R (4 users x 5 products)
R = np.array([
    [5, 3, 0, 1, 0],
    [4, 0, 0, 1, 0],
    [1, 1, 0, 5, 4],
    [0, 0, 5, 4, 0]
])

# Apply SVD
U, s, VT = np.linalg.svd(R, full_matrices=False)
Sigma = np.diag(s)

# Reduce to k=2 dimensions (i.e. keep 2 latent features)
k = 2
U_k = U[:, :k]
Sigma_k = Sigma[:k, :k]
VT_k = VT[:k, :]

# Set up the figure: one heatmap each for R, U_k, Sigma_k and VT_k
fig, axs = plt.subplots(1, 4, figsize=(16, 4))
sns.heatmap(R, cmap="Blues", cbar=True, annot=True, ax=axs[0])
axs[0].set_title("Ma trận R (4x5)")
axs[0].set_xlabel("Sản phẩm")
axs[0].set_ylabel("Người dùng")

sns.heatmap(U_k, cmap="Greens", cbar=True, annot=True, ax=axs[1])
axs[1].set_title("U (4x2)")
axs[1].set_xlabel("Đặc trưng ẩn")
axs[1].set_ylabel("Người dùng")

sns.heatmap(Sigma_k, cmap="Oranges", cbar=True, annot=True, ax=axs[2])
axs[2].set_title("Σ (2x2)")
axs[2].set_xlabel("Đặc trưng ẩn")
axs[2].set_ylabel("Đặc trưng ẩn")

sns.heatmap(VT_k, cmap="Purples", cbar=True, annot=True, ax=axs[3])
axs[3].set_title("Vᵀ (2x5)")
axs[3].set_xlabel("Sản phẩm")
axs[3].set_ylabel("Đặc trưng ẩn")

plt.tight_layout()
plt.suptitle("Phân rã ma trận R ≈ UΣVᵀ bằng SVD (k=2)", y=1.05, fontsize=16)
plt.show()


ModuleNotFoundError: No module named 'numpy'

In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score

# Ranking metrics of the LightFM model on the test interactions; each
# evaluation call's result is reduced with .mean() to a single number.
precision = precision_at_k(model_LightFM, interactions_test_csr, k=5, num_threads=num_threads).mean()
recall = recall_at_k(model_LightFM, interactions_test_csr, k=5, num_threads=num_threads).mean()
auc = auc_score(model_LightFM, interactions_test_csr, num_threads=num_threads).mean()

print(f"Precision@5: {precision:.4f}")
print(f"Recall@5: {recall:.4f}")
print(f"AUC: {auc:.4f}")


In [None]:
# Extract the learned user and item embedding matrices from the LightFM model.
user_embeddings = model_LightFM.user_embeddings
item_embeddings = model_LightFM.item_embeddings
print("Shape of user embedding: ",user_embeddings.shape)
print("Shape of item embedding: ",item_embeddings.shape)

In [None]:
# Save the embeddings to .npy files in the current working directory.
np.save("user_embeddings.npy", user_embeddings)
np.save("item_embeddings.npy", item_embeddings)


In [None]:
# user_embeddings = np.load("user_embeddings.npy")
# item_embeddings = np.load("item_embeddings.npy")


In [None]:
# Derive the table sizes from the embedding matrices: rows are users/products,
# columns are the embedding dimension (this overwrites the global embedding_dim).
num_users, embedding_dim = user_embeddings.shape
num_products = item_embeddings.shape[0]

print("Number of users = ", num_users)
print("Number of products = ", num_products)
print("Demension embedding = ", embedding_dim)

In [None]:
# Re-encode the category columns as contiguous indices 0..n-1 and record n.
# These counts are used as the input_dim of Embedding layers, which only accept
# indices in [0, input_dim). The raw category ids are arbitrary integers and
# parent_category uses -1 for "missing", so nunique() of the raw values is not
# a valid input_dim for them. The encoding is shared by all three splits and is
# idempotent: re-running this cell leaves already encoded columns unchanged.
def _encode_contiguous(column):
    """Map `column` to 0..n-1 in train/valid/test (in place) and return n."""
    splits = (train, valid, test)
    vocabulary = np.unique(np.concatenate([split[column].to_numpy() for split in splits]))
    for split in splits:
        # vocabulary is sorted and contains every value, so searchsorted
        # returns the exact position of each value.
        split[column] = np.searchsorted(vocabulary, split[column].to_numpy()).astype("int32")
    return len(vocabulary)

# Number of distinct product categories (across all splits).
num_categories = _encode_contiguous("category_id")
print("Number of categories =", num_categories)

# Number of distinct parent categories (across all splits, including "missing").
num_parent_categories = _encode_contiguous("parent_category")
print("Number of parent categories =", num_parent_categories)

# Build CNN + LightFM Model

# IV. Build Model LightFM + CNN

In [None]:
import tensorflow as tf
import numpy as np
from sklearn.utils import class_weight

# Class labels for training and "balanced" class weights computed from them.
y_labels = train["interaction_value"].values - 1  # subtract 1 to get labels in [0, 1, 2]
class_weights_np = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_labels),
    y=y_labels
)
# Keyed by position in np.unique(y_labels), not by the label value itself.
class_weights_dict = dict(enumerate(class_weights_np))
print("Class Weights:", class_weights_dict)

In [None]:
def focal_loss_with_class_weights(class_weights, gamma=2.0):
    """Create a class-weighted focal loss for one-hot targets.

    Args:
        class_weights: one weight per class, in class-index order.
        gamma: focusing exponent applied to ``1 - p``.

    Returns:
        A function ``loss(y_true, y_pred)`` that returns the batch mean of the
        per-sample sum of ``weight * (1 - p) ** gamma * (-y_true * log(p))``.
    """
    def loss(y_true, y_pred):
        # Clip away exact zeros so the logarithm stays finite.
        probs = tf.clip_by_value(y_pred, 1e-7, 1.0)
        per_class_weight = tf.constant(class_weights, dtype=tf.float32)
        modulating = tf.pow(1 - probs, gamma)
        log_loss = -y_true * tf.math.log(probs)
        per_sample = tf.reduce_sum(per_class_weight * modulating * log_loss, axis=-1)
        return tf.reduce_mean(per_sample)
    return loss

In [None]:
# Declarative description of the model inputs, read by build_model.
#   "shape"/"dtype": passed to the Keras Input layer.
#   "embedding": the input is an id looked up in an Embedding of size "dim".
#   "dense":     the input is fed through a small Dense block.
# The insertion order of this dict defines the order of the model inputs, so it
# must match the order of the arrays passed to fit/evaluate/predict.
# NOTE: build_model takes the pretrained matrices from the user_embeddings /
# item_embeddings globals; the "weights" entries here are not read by it.
metadata_config = {
    "user_input":          {"shape": (1,), "dtype": 'int32', "embedding": True, "dim": num_users, "weights": user_embeddings},
    "product_input":       {"shape": (1,), "dtype": 'int32', "embedding": True, "dim": num_products, "weights": item_embeddings},
    "category_input":      {"shape": (1,), "dtype": 'int32', "embedding": True, "dim": num_categories},
    "parent_category_input": {"shape": (1,), "dtype": 'int32', "embedding": True, "dim": num_parent_categories},
    "timestamp_input":     {"shape": (1,), "dtype": 'float32', "dense": True},
    "price_input":         {"shape": (1,), "dtype": 'float32', "dense": True},
    "age_input":           {"shape": (1,), "dtype": 'float32', "dense": True},
    "gender_input":        {"shape": (1,), "dtype": 'float32', "dense": True},

    "hour_input":          {"shape": (1,), "dtype": 'float32', "dense": True},
    "dayofweek_input":     {"shape": (1,), "dtype": 'float32', "dense": True},
    "is_weekend_input":    {"shape": (1,), "dtype": 'float32', "dense": True},
    
    "hour_sin_input":      {"shape": (1,), "dtype": 'float32', "dense": True},
    "hour_cos_input":      {"shape": (1,), "dtype": 'float32', "dense": True},
    "dayofweek_sin_input": {"shape": (1,), "dtype": 'float32', "dense": True},
    "dayofweek_cos_input": {"shape": (1,), "dtype": 'float32', "dense": True},
    
    "time_diff_input":     {"shape": (1,), "dtype": 'float32', "dense": True},
    "session_id_input":    {"shape": (1,), "dtype": 'int32', "dense": True}
}


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Flatten, Dense, Dropout, Concatenate,
                                     Multiply, BatchNormalization, Reshape, GaussianNoise, Activation, Lambda, Layer, Conv1D, GlobalMaxPooling1D, LSTM)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import MultiHeadAttention
import tensorflow as tf

def build_model(trainable_embedding=False):
    """Build the LightFM + CNN + attention classifier (uncompiled).

    One Keras input is created per entry of ``metadata_config``, in that order.
    Id inputs go through Embedding layers (user/product tables are initialised
    from the LightFM matrices), numeric inputs through small Dense blocks. The
    user/product vectors are additionally processed by a Conv1D branch, a
    multi-head attention block and an element-wise product, and everything is
    concatenated in front of a 4-layer MLP with a 3-way softmax output.

    Args:
        trainable_embedding: whether the pretrained user/product embedding
            tables are updated during training. The category tables start from
            random weights and are therefore always trainable.

    Returns:
        A ``tf.keras`` Model; the caller is responsible for compiling it.
    """
    inputs = {}
    embeddings = {}
    dense_features = []

    # Print input shapes for debugging
    print(f"User embedding matrix shape: {user_embeddings.shape}")
    print(f"Product embedding matrix shape: {item_embeddings.shape}")

    # Pretrained tables, keyed by the name of the input they belong to.
    pretrained = {"user_input": user_embeddings, "product_input": item_embeddings}

    for name, cfg in metadata_config.items():
        inp = Input(shape=cfg["shape"], dtype=cfg["dtype"], name=name)
        inputs[name] = inp

        if cfg.get("embedding"):
            matrix = pretrained.get(name)
            if matrix is not None:
                # Pretrained table: frozen unless the caller asks otherwise.
                emb_weights = [matrix]
                emb_dim = matrix.shape[1]
                emb_trainable = trainable_embedding
            else:
                # Randomly initialised table: freezing it would leave the model
                # with random, never-learned category features.
                emb_weights = None
                emb_dim = embedding_dim
                emb_trainable = True

            emb_layer = Embedding(
                input_dim=cfg["dim"],
                output_dim=emb_dim,
                weights=emb_weights,
                trainable=emb_trainable,
                # Stable names ("user_emb", "product_emb", "category_emb", ...)
                # so the layers can be found again, e.g. to unfreeze them.
                name=name.replace("_input", "_emb")
            )(inp)

            emb_layer = GaussianNoise(0.01)(emb_layer)
            emb_layer = Flatten()(emb_layer)
            embeddings[name] = emb_layer

        elif cfg.get("dense"):
            dense = Dense(16, activation='relu')(inp)
            dense = BatchNormalization()(dense)
            dense = Dropout(0.2)(dense)
            dense_features.append(dense)

    user_emb = embeddings["user_input"]
    product_emb = embeddings["product_input"]

    # === Conv1D branch ===
    # Each flattened embedding is treated as a sequence of `emb_dim` steps with
    # a single channel: (batch, emb_dim) -> (batch, emb_dim, 1).
    user_emb_reshaped = Reshape((-1, 1))(user_emb)
    product_emb_reshaped = Reshape((-1, 1))(product_emb)

    user_emb_conv = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(user_emb_reshaped)
    user_emb_conv = GlobalMaxPooling1D()(user_emb_conv)  # max over the sequence axis -> (batch, 64)

    product_emb_conv = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu')(product_emb_reshaped)
    product_emb_conv = GlobalMaxPooling1D()(product_emb_conv)

    # === Attention mechanism ===
    # Add a length-1 sequence axis: (batch, emb_dim) -> (batch, 1, emb_dim).
    expand_dims = Lambda(lambda x: tf.expand_dims(x, axis=1))
    user_vector = expand_dims(user_emb)
    product_vector = expand_dims(product_emb)

    # Verify shapes
    print(f"User vector shape: {user_vector.shape}")
    print(f"Product vector shape: {product_vector.shape}")

    # The user vector is the query, the product vector is the value/key.
    # 4 heads with key_dim=32 (128-dim embeddings divided by 4 heads).
    attention_output = MultiHeadAttention(
        num_heads=4,
        key_dim=32
    )(user_vector, product_vector)

    attention_output = Flatten()(attention_output)
    attention_output = BatchNormalization()(attention_output)

    # Element-wise product of user and product embeddings
    user_product_interaction = Multiply()([user_emb, product_emb])

    # Merge all features. The Conv1D outputs are included here; if they were
    # left out of this list they would not be part of the model graph at all.
    merged = Concatenate()([
        user_emb,
        product_emb,
        user_product_interaction,
        user_emb_conv,
        product_emb_conv,
        embeddings["category_input"],
        embeddings["parent_category_input"],
        attention_output,
        *dense_features
    ])

    print(f"Merged features shape: {merged.shape}")

    # === Dense layers ===
    x = Dense(512, kernel_regularizer=l2(1e-4))(merged)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.4)(x)

    x = Dense(256, kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.4)(x)

    x = Dense(128, kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    x = Dense(64, kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.3)(x)

    print("Before output layer:", x.shape)

    # Output layer: one probability per interaction class.
    output = Dense(3, activation='softmax')(x)

    # Inputs keep the insertion order of metadata_config.
    model = Model(inputs=list(inputs.values()), outputs=output)

    return model
    
# Build the model for the first training stage (trainable_embedding=False).
model = build_model(trainable_embedding=False)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# Render the model graph with tensor shapes and layer names.
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Callbacks for the first training stage; all of them monitor val_loss.

# Stop after 20 epochs without improvement and restore the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    mode='min'
)

# Multiply the learning rate by 0.1 after 15 epochs without improvement (floor 1e-6).
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=15,
    min_lr=1e-6
)

# Save the model to disk whenever val_loss reaches a new minimum.
model_checkpoint = ModelCheckpoint(
    filepath='model_frozen_embeddings.keras',
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

callbacks = [early_stopping, reduce_lr, model_checkpoint]

In [None]:
from tensorflow.keras.utils import to_categorical

# Prepare data
# One array per model input; the column order must match the order of the
# model's inputs, because the arrays are passed positionally.
_input_columns = [
    "user_id", "item_id", "category_id", "parent_category",
    "timestamp_norm", "price", "age", "gender",
    "hour", "dayofweek", "is_weekend",
    "hour_sin", "hour_cos", "dayofweek_sin", "dayofweek_cos",
    "time_diff", "session_id",
]


def _as_model_inputs(frame):
    """Return the feature columns of `frame` as a list of arrays in model-input order."""
    return [frame[column].values for column in _input_columns]


X_test = _as_model_inputs(test)
X_valid = _as_model_inputs(valid)
X_train = _as_model_inputs(train)

# One-hot targets; interaction_value 1..3 is shifted to class index 0..2.
y_train = to_categorical(y_labels, num_classes=3)
y_valid, y_test = (
    to_categorical(split["interaction_value"].values - 1, num_classes=3)
    for split in (valid, test)
)

In [None]:
from tensorflow.keras.metrics import AUC,Precision, Recall
# Compile with the class-weighted focal loss. The order of `metrics` fixes the
# order of the values returned by model.evaluate after the loss.
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=focal_loss_with_class_weights(class_weights=class_weights_np, gamma=2.0),
    metrics=['accuracy', AUC(), Precision(), Recall()]
)

# First training stage; stopping, LR schedule and checkpointing come from `callbacks`.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=100,
    batch_size=128,
    shuffle=True,
    callbacks= callbacks
)


In [None]:
# Evaluate the model on the test set and print the results.
# The unpacking order follows the compile call: loss, accuracy, AUC, precision, recall.
test_loss, test_accuracy, test_auc, test_precision, test_recall = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test AUC: {test_auc:.4f}, " \
      f"Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# Predicted and true class indices (argmax over the softmax / one-hot axis).
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Print the classification report.
print("classification_report:")
print(classification_report(y_true, y_pred, digits=4))

# Draw the confusion matrix with seaborn.
conf_matrix = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_true), yticklabels=np.unique(y_true))
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


## Fine-tuning

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Callbacks for the fine-tuning stage; all of them monitor val_loss and use
# shorter patience than the first stage.

# Stop after 10 epochs without improvement and restore the best weights.
early_stopping_ft = EarlyStopping(
    monitor='val_loss',
    patience=10, 
    restore_best_weights=True,
    mode='min'
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement (floor 1e-6).
reduce_lr_ft = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5, 
    min_lr=1e-6,
    verbose=1
)

# Save the model to disk whenever val_loss reaches a new minimum.
model_checkpoint_ft = ModelCheckpoint(
    filepath='best_model_finetune.keras',  
    monitor='val_loss',
    save_best_only=True,
    mode='min',
    verbose=1
)

callbacks_finetune = [early_stopping_ft, reduce_lr_ft, model_checkpoint_ft]


In [None]:
# Fine-tune the embeddings: unfreeze every Embedding layer of the model.
# The layers are selected by type; a test on layer.name only works when the
# layers were created with matching explicit names, and silently unfreezes
# nothing otherwise.
unfrozen_layers = []
for layer in model.layers:
    if isinstance(layer, Embedding):
        layer.trainable = True
        unfrozen_layers.append(layer.name)
print("Trainable embedding layers:", unfrozen_layers)

# Re-compile with a smaller learning rate (a change of `trainable` only takes
# effect after compile).
model.compile(optimizer=Adam(learning_rate=1e-4),
              loss=focal_loss_with_class_weights(class_weights=class_weights_np, gamma=2.0),
              metrics=['accuracy', AUC(), Precision(), Recall()])

# Fine-tuning
# NOTE(review): with epochs=1 the patience settings of callbacks_finetune can
# never trigger - confirm whether a longer run is intended.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=1,
    batch_size=128,
    shuffle=True,
    callbacks= callbacks_finetune
)


In [None]:
# Evaluate the fine-tuned model on the test set and print the results.
# The unpacking order follows the compile call: loss, accuracy, AUC, precision, recall.
test_loss, test_accuracy, test_auc, test_precision, test_recall = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test AUC: {test_auc:.4f}, " \
      f"Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# Assumes the model has been trained and the test inputs X_test, y_test exist.
# Predicted and true class indices (argmax over the softmax / one-hot axis).
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Print the classification report.
print("classification_report:")
print(classification_report(y_true, y_pred, digits=4))

# Draw the confusion matrix with seaborn.
conf_matrix = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=np.unique(y_true), yticklabels=np.unique(y_true))
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()


# Evaluation

In [None]:
# Evaluate on validation set
val_loss, val_accuracy, val_auc, val_precision, val_recall = model.evaluate(X_valid, y_valid)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation AUC: {val_auc:.4f}, Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}")

# Test set: no evaluation is run here; this prints the test_* values computed
# by the most recent model.evaluate(X_test, y_test) cell.
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test AUC: {test_auc:.4f}, " \
      f"Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}")

# Evaluate on train set
train_loss, train_accuracy, train_auc, train_precision, train_recall = model.evaluate(X_train, y_train)
print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Train AUC: {train_auc:.4f}, " \
      f"Train Precision: {train_precision:.4f}, Train Recall: {train_recall:.4f}")


In [None]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Predict class probabilities with the model.
# NOTE: this rebinds y_pred to the probability matrix (earlier cells stored class indices in it).
y_pred = model.predict(X_test)  # X_test is the test input

# Convert one-hot / probabilities to labels (class index).
y_true_cls = np.argmax(y_test, axis=1)
y_pred_cls = np.argmax(y_pred, axis=1)

# Print the classification report and the accuracy.
print(classification_report(y_true_cls, y_pred_cls))
print("Accuracy:", accuracy_score(y_true_cls, y_pred_cls))


# Cold start problem

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, precision_recall_fscore_support, roc_auc_score


In [None]:
# Separate cold-start users and items: ids that occur in the test split but
# never in the training split.
train_users, train_items = set(train["user_id"]), set(train["item_id"])
test_users, test_items = set(test["user_id"]), set(test["item_id"])

cold_start_users = test_users - train_users
cold_start_items = test_items - train_items

# Row masks over `test` (True = the row belongs to a cold-start user / item).
mask_cold_user = test["user_id"].isin(cold_start_users)
mask_cold_item = test["item_id"].isin(cold_start_items)

test_cold_user, test_cold_item = test[mask_cold_user], test[mask_cold_item]
test_non_cold_user, test_non_cold_item = test[~mask_cold_user], test[~mask_cold_item]

# Matching slices of the one-hot targets; y_test is indexed positionally, so
# this relies on y_test having the same row order as `test`.
y_test_cold_user, y_test_cold_item = y_test[mask_cold_user.values], y_test[mask_cold_item.values]
y_test_non_cold_user, y_test_non_cold_item = y_test[~mask_cold_user.values], y_test[~mask_cold_item.values]


In [None]:
# Helper that prepares the model input
def prepare_input(df):
    """Return the model inputs for `df`: a list with one array per feature column.

    The arrays follow a fixed column order (user, item, category, parent
    category, timestamp, price, age, gender, then the time/session features),
    independent of the column order of `df`.
    """
    feature_columns = (
        "user_id", "item_id", "category_id", "parent_category",
        "timestamp_norm", "price", "age", "gender",
        "hour", "dayofweek", "is_weekend",
        "hour_sin", "hour_cos", "dayofweek_sin", "dayofweek_cos",
        "time_diff", "session_id",
    )
    arrays = []
    for column in feature_columns:
        arrays.append(df[column].values)
    return arrays


In [None]:
# Evaluation helper: macro precision, recall, F1 and one-vs-rest AUC
def evaluate_cold_start(model, df, y_true, name="Cold Start"):
    """Print classification metrics of `model` on the rows of `df`.

    Args:
        model: object with a Keras-style ``predict`` returning class probabilities.
        df: rows to evaluate; must contain the columns read by ``prepare_input``.
        y_true: one-hot targets (2-D) or class indices (1-D) aligned with `df`.
        name: label of the evaluated group, used in the printed header.

    Prints macro precision/recall/F1, a macro one-vs-rest AUC and the
    classification report. Returns None. An empty `df` is reported and skipped.
    """
    if len(df) == 0:
        # Nothing to predict on (e.g. no cold-start rows in this split).
        print(f"\nĐánh giá cho {name}:")
        print("No samples in this group; skipped.")
        return

    X_eval = prepare_input(df)
    y_pred_probs = model.predict(X_eval, batch_size=128, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_true, axis=1) if len(y_true.shape) > 1 else y_true

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)

    # Macro one-vs-rest AUC. A class contributes only if the group contains both
    # members and non-members of it; a class that is absent from a small group
    # would otherwise make the AUC undefined (or the shapes inconsistent).
    class_aucs = []
    for class_index in range(y_pred_probs.shape[1]):
        is_class = (y_true == class_index)
        if is_class.any() and not is_class.all():
            class_aucs.append(roc_auc_score(is_class, y_pred_probs[:, class_index]))
    auc = float(np.mean(class_aucs)) if class_aucs else 0

    print(f"\nĐánh giá cho {name}:")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_true, y_pred, digits=4))


In [None]:
# Evaluate each group: cold-start vs. non-cold-start, for users and for items.
evaluate_cold_start(model, test_cold_user, y_test_cold_user, name="User Cold Start")
evaluate_cold_start(model, test_cold_item, y_test_cold_item, name="Item Cold Start")
evaluate_cold_start(model, test_non_cold_user, y_test_non_cold_user, name="User Non-Cold Start")
evaluate_cold_start(model, test_non_cold_item, y_test_non_cold_item, name="Item Non-Cold Start")


In [None]:
# Helper that returns the F1-score of each class
def get_f1_per_class(model, df, y_true):
    """Return the per-class F1-scores of `model` on the rows of `df`.

    Args:
        model: object with a Keras-style ``predict`` returning class probabilities.
        df: rows to evaluate; must contain the columns read by ``prepare_input``.
        y_true: one-hot targets (2-D) or class indices (1-D) aligned with `df`.

    Returns:
        Array of three F1-scores, for classes 0, 1 and 2 in that order. A class
        without predictions or samples scores 0; an empty `df` yields all zeros.
    """
    if len(df) == 0:
        # Nothing to predict on; report zero for every class.
        return np.zeros(3)

    X_eval = prepare_input(df)
    y_pred_probs = model.predict(X_eval, batch_size=128, verbose=0)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_true, axis=1) if len(y_true.shape) > 1 else y_true

    # labels=[0, 1, 2] keeps the result at length 3 even if a class is absent.
    _, _, f1, _ = precision_recall_fscore_support(y_true, y_pred, labels=[0, 1, 2], zero_division=0)
    return f1


In [None]:
# Compute the per-class F1-scores for each of the four groups.
f1_user_cold = get_f1_per_class(model, test_cold_user, y_test_cold_user)
f1_user_non_cold = get_f1_per_class(model, test_non_cold_user, y_test_non_cold_user)
f1_item_cold = get_f1_per_class(model, test_cold_item, y_test_cold_item)
f1_item_non_cold = get_f1_per_class(model, test_non_cold_item, y_test_non_cold_item)

In [None]:
# Plot: grouped bar chart of the per-class F1-scores, four bars per class
# (user cold / user non-cold / item cold / item non-cold).
# 'Class 1..3' label the class indices 0..2, i.e. interaction_value 1..3.
labels = ['Class 1', 'Class 2', 'Class 3']
x = np.arange(len(labels))
width = 0.2

plt.figure(figsize=(12, 6))
# The four bars of a class are centred on x with offsets of -1.5, -0.5, +0.5, +1.5 widths.
plt.bar(x - 1.5*width, f1_user_cold, width, label='User Cold-start', color='skyblue')
plt.bar(x - 0.5*width, f1_user_non_cold, width, label='User Non-cold', color='dodgerblue')
plt.bar(x + 0.5*width, f1_item_cold, width, label='Item Cold-start', color='lightcoral')
plt.bar(x + 1.5*width, f1_item_non_cold, width, label='Item Non-cold', color='indianred')

plt.xticks(x, labels)
plt.xlabel("Class (interaction value)")
plt.ylabel("F1-score")
plt.title("So sánh F1-score theo class giữa Cold-start và Non-cold-start")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
