In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Fetch the NLTK data packages this notebook relies on: the 'punkt_tab'
# tokenizer tables plus the 'stopwords' and 'wordnet' corpora.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
def preprocess_text(text):
    """Normalise a piece of free text into a space-separated token string.

    Steps, in order: lowercase, strip every character in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop
    English stopwords, lemmatize each remaining token with
    ``WordNetLemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Lowercase first, then delete punctuation through a translation table.
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    words = nltk.word_tokenize(cleaned)

    # A set gives constant-time membership checks while filtering.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in words
        if word not in english_stopwords
    )

In [4]:
def encode_categories(df_products):
    """One-hot encode the 'category' and 'group' columns of the product table.

    Args:
        df_products: DataFrame with 'id', 'category' and 'group' columns.

    Returns:
        DataFrame indexed by product 'id' whose columns are the 'category'
        dummies (prefixed ``leaf_``) followed by the 'group' dummies
        (prefixed ``group_``).
    """
    # (source column, dummy-column prefix) pairs, in output order.
    column_prefixes = (('category', 'leaf'), ('group', 'group'))

    dummy_frames = [
        pd.get_dummies(df_products[column], prefix=prefix)
        for column, prefix in column_prefixes
    ]

    encoded = pd.concat(dummy_frames, axis=1)
    encoded.index = df_products['id']
    return encoded

def encode_tags(df_products):
    """Multi-hot encode the comma-separated 'tags' column.

    Each cell of 'tags' is split on commas; surrounding whitespace is
    stripped and empty pieces are discarded. Missing values are treated as
    "no tags".

    The indicator matrix is filled by row *position*, so the result is
    correct for any index on ``df_products`` (the previous implementation
    used the positional counter as an index label with ``.loc``, which only
    worked for a default 0..n-1 index).

    Args:
        df_products: DataFrame with a 'tags' column and, optionally, 'id'.

    Returns:
        Integer DataFrame with one column per distinct tag (sorted
        alphabetically) and one row per product. Rows are indexed by 'id'
        when that column exists, otherwise by the input index.
    """
    # Split every row's tag string into a clean list of tags.
    tag_lists = [
        [tag.strip() for tag in raw.split(',') if tag.strip()]
        for raw in df_products['tags'].fillna('')
    ]

    # Sorted vocabulary of every distinct tag, and each tag's column position.
    all_tags = sorted({tag for tags in tag_lists for tag in tags})
    column_of = {tag: position for position, tag in enumerate(all_tags)}

    # Fill the indicator matrix positionally, independent of index labels.
    indicator = np.zeros((len(tag_lists), len(all_tags)), dtype='int64')
    for row, tags in enumerate(tag_lists):
        for tag in tags:
            indicator[row, column_of[tag]] = 1

    tag_df = pd.DataFrame(indicator, index=df_products.index, columns=all_tags)

    # Re-index by product id when available.
    if 'id' in df_products.columns:
        tag_df.index = df_products['id']

    return tag_df

def get_textual_features(df_products):
    """Build TF-IDF features from the product 'name' column.

    Args:
        df_products: DataFrame with 'id' and 'name' columns. It is not
            modified.

    Returns:
        Dense DataFrame indexed by product 'id' with one column per
        vocabulary term reported by the fitted ``TfidfVectorizer``.
    """
    # The product name is the only text source that is vectorized.
    product_names = df_products['name']

    tfidf = TfidfVectorizer()
    term_weights = tfidf.fit_transform(product_names)

    # Densify the matrix and label rows by product id, columns by term.
    return pd.DataFrame(
        term_weights.toarray(),
        index=df_products['id'],
        columns=tfidf.get_feature_names_out(),
    )

def encode_brand(df_products):
    """One-hot encode the 'brand' column as floats.

    Args:
        df_products: DataFrame with 'id' and 'brand' columns.

    Returns:
        Float DataFrame indexed by product 'id' with one unprefixed column
        per distinct brand.
    """
    brand_dummies = pd.get_dummies(df_products['brand'])
    brand_matrix = brand_dummies.astype(float)
    brand_matrix.index = df_products['id']
    return brand_matrix



In [5]:
def build_item_profiles(df_products):
    """Assemble one feature vector per product.

    The profile is the category/group dummies (NaNs filled with 0 and cast
    to float) left-joined on the index with the tag indicators and then the
    brand dummies. TF-IDF name features from ``get_textual_features`` are
    not included.

    Args:
        df_products: Product table accepted by ``encode_categories``,
            ``encode_tags`` and ``encode_brand``.

    Returns:
        DataFrame of item features on the index produced by
        ``encode_categories``.
    """
    profile = encode_categories(df_products).fillna(0).astype(float)

    # Encode the remaining feature groups, then attach them in this order.
    extra_feature_groups = [
        encode_tags(df_products),
        encode_brand(df_products),
    ]
    for features in extra_feature_groups:
        profile = profile.join(features, how='left')

    return profile

In [6]:
def build_user_profiles(df_rates, item_profiles):
    """Build one profile vector per customer as a star-weighted item average.

    For each customer the feature rows of the products they rated are
    weighted by the star rating, summed, and divided by the total number of
    stars given by that customer.

    Args:
        df_rates: DataFrame with 'customer_id', 'product_id' and 'stars'
            columns.
        item_profiles: Item feature DataFrame indexed by product id; every
            rated 'product_id' must be present in its index.

    Returns:
        DataFrame indexed by customer id (in order of first appearance in
        ``df_rates``) with the same columns as ``item_profiles``. A customer
        whose stars sum to zero gets an all-zero profile instead of NaN/inf.

    Raises:
        KeyError: If a rated product id is missing from ``item_profiles``.
    """
    user_profiles = {}
    n_features = item_profiles.shape[1]

    # A single groupby pass replaces re-filtering the whole ratings table for
    # every customer; sort=False keeps first-appearance order.
    for user_id, user_ratings in df_rates.groupby('customer_id', sort=False):
        rated_items = item_profiles.loc[user_ratings['product_id']]

        # Column vector so the stars broadcast across the feature columns.
        stars = user_ratings['stars'].to_numpy(dtype=float).reshape(-1, 1)
        total_stars = stars.sum()

        if total_stars == 0:
            # Avoid 0/0: a NaN profile would break later similarity scoring.
            profile_vector = np.zeros(n_features)
        else:
            weighted = rated_items.to_numpy(dtype=float) * stars
            profile_vector = weighted.sum(axis=0) / total_stars

        user_profiles[user_id] = profile_vector

    return pd.DataFrame.from_dict(user_profiles, orient='index', columns=item_profiles.columns)


In [7]:
# Load the products table and the ratings table from the local Data directory.
df_products = pd.read_csv('Data//products.csv')
df_rates = pd.read_csv('Data//rates.csv')

In [8]:
# Hold out 30% of the rating rows as the test set; the fixed random_state
# makes the shuffled split reproducible.
train, test = train_test_split(df_rates, test_size=0.3, random_state=42, shuffle=True)

In [9]:
# Item profiles come from the full products table; user profiles are built
# from the training ratings only.
item_profiles = build_item_profiles(df_products)
user_profiles = build_user_profiles(train, item_profiles)

In [10]:
# Move each profile's index into an ordinary ID column: the item index is
# named 'id', while the user index is unnamed and so surfaces as 'index'.
# NOTE: re-running this cell would add a stray 'index' column to
# item_profiles, because its index is a plain range after the first run.
item_profiles = item_profiles.reset_index().rename(columns={'id': 'product_id'})
user_profiles = user_profiles.reset_index().rename(columns={'index': 'customer_id'})

In [11]:
# Write both profile tables to CSV without the row index.
user_profiles.to_csv('Data//user_profiles.csv', index=False)
item_profiles.to_csv('Data//item_profiles.csv', index=False)

In [12]:
# Write the train/test rating splits to CSV without the row index.
train.to_csv('Data//train.csv', index=False)
test.to_csv('Data//test.csv', index=False)

In [13]:
# Mean number of rated products per customer in the training split
# (count of non-null product_id values per customer, averaged over customers).
average_ratings_per_user = train.groupby('customer_id')['product_id'].count().mean()
print("Số lượt đánh giá trung bình mỗi khách hàng trong tập train:", average_ratings_per_user)

# Same statistic for the test split; the variable name is reused.
average_ratings_per_user = test.groupby('customer_id')['product_id'].count().mean()
print("Số lượt đánh giá trung bình mỗi khách hàng trong tập test:", average_ratings_per_user)

Số lượt đánh giá trung bình mỗi khách hàng trong tập train: 18.0
Số lượt đánh giá trung bình mỗi khách hàng trong tập test: 7.74


# ***Evaluate***

In [14]:
# Reload the profile tables, the products table and the test split from the
# CSV files in the Data directory, replacing the in-memory variables.
user_profiles = pd.read_csv('Data//user_profiles.csv')
item_profiles = pd.read_csv('Data//item_profiles.csv')
df_products = pd.read_csv('Data//products.csv')
test = pd.read_csv('Data//test.csv')

In [15]:
def precision_at_k(user_profiles, item_profiles, interactions, top_k=10):
    """Mean precision@k of cosine-similarity recommendations.

    Every user in ``user_profiles`` is scored against every item; the
    ``top_k`` most similar items are the recommendations, and a
    recommendation is a hit when ``interactions`` contains that
    (customer_id, product_id) pair. Users without any row in
    ``interactions`` contribute a precision of 0. Nothing is excluded from
    the ranking (e.g. items the user already rated elsewhere).

    Args:
        user_profiles: DataFrame with a 'customer_id' column; all other
            columns are features.
        item_profiles: DataFrame with a 'product_id' column; all other
            columns are features, assumed to be in the same order as the
            user feature columns.
        interactions: DataFrame with 'customer_id' and 'product_id' columns.
        top_k: Number of recommendations per user; also the denominator of
            each user's precision.

    Returns:
        Average of the per-user precision values, or 0.0 when there are no
        users.
    """
    user_ids = user_profiles['customer_id'].values
    item_ids = item_profiles['product_id'].values

    # No users means nothing to average; return before the similarity call,
    # which would otherwise run on an empty matrix.
    if len(user_ids) == 0:
        return 0.0

    # User-by-item cosine similarity matrix.
    user_feature_matrix = user_profiles.drop(columns=['customer_id']).values
    item_feature_matrix = item_profiles.drop(columns=['product_id']).values
    similarity_matrix = cosine_similarity(user_feature_matrix, item_feature_matrix)

    # Build customer -> interacted products once, rather than filtering the
    # whole interactions table inside the per-user loop.
    true_items_by_user = {}
    for customer_id, product_id in zip(interactions['customer_id'].values,
                                       interactions['product_id'].values):
        true_items_by_user.setdefault(customer_id, set()).add(product_id)

    precisions = []
    for row, user_id in enumerate(user_ids):
        # Indices of the top_k items with the highest similarity.
        top_indices = similarity_matrix[row].argsort()[::-1][:top_k]
        recommended = set(item_ids[top_indices])

        true_items = true_items_by_user.get(user_id, ())
        precisions.append(len(recommended.intersection(true_items)) / top_k)

    return sum(precisions) / len(precisions)

In [16]:
def precision_at_k_by_category(user_profiles, item_profiles, interactions, df_products, top_k=10):
    """Mean category-level precision@k of cosine-similarity recommendations.

    Works like an exact-match precision@k, except that a recommended
    product counts as a hit when its 'parent' category equals the 'parent'
    category of any product the user interacted with.

    Products that are absent from ``df_products`` or whose 'parent' is
    missing have no category and never produce a hit. (Previously an
    unknown interacted product and an unknown recommended product both
    mapped to ``None`` and were counted as a match.)

    Args:
        user_profiles: DataFrame with a 'customer_id' column; all other
            columns are features.
        item_profiles: DataFrame with a 'product_id' column; all other
            columns are features, assumed to be in the same order as the
            user feature columns.
        interactions: DataFrame with 'customer_id' and 'product_id' columns.
        df_products: DataFrame with 'id' and 'parent' columns.
        top_k: Number of recommendations per user; also the denominator of
            each user's precision.

    Returns:
        Average of the per-user precision values, or 0.0 when there are no
        users.
    """
    user_ids = user_profiles['customer_id'].values
    item_ids = item_profiles['product_id'].values

    # No users means nothing to average; return before the similarity call.
    if len(user_ids) == 0:
        return 0.0

    # User-by-item cosine similarity matrix.
    user_feature_matrix = user_profiles.drop(columns=['customer_id']).values
    item_feature_matrix = item_profiles.drop(columns=['product_id']).values
    similarity_matrix = cosine_similarity(user_feature_matrix, item_feature_matrix)

    # product id -> parent category, keeping only products with a category.
    categorised = df_products.dropna(subset=['parent'])
    product_to_category = categorised.set_index('id')['parent'].to_dict()

    # Build customer -> categories of interacted products once, rather than
    # filtering the whole interactions table inside the per-user loop.
    categories_by_user = {}
    for customer_id, product_id in zip(interactions['customer_id'].values,
                                       interactions['product_id'].values):
        category = product_to_category.get(product_id)
        if category is not None:
            categories_by_user.setdefault(customer_id, set()).add(category)

    precisions = []
    for row, user_id in enumerate(user_ids):
        top_indices = similarity_matrix[row].argsort()[::-1][:top_k]
        recommended_product_ids = item_ids[top_indices]

        true_categories = categories_by_user.get(user_id, ())

        # Count recommendations whose (known) category the user interacted with.
        correct = 0
        for pid in recommended_product_ids:
            category = product_to_category.get(pid)
            if category is not None and category in true_categories:
                correct += 1

        precisions.append(correct / top_k)

    return sum(precisions) / len(precisions)

In [17]:
# Precision@10 on exact product matches against the test interactions.
score1 = precision_at_k(user_profiles, item_profiles, test)
print(f'Precision@10_1: {score1:.4f}')

# Precision@10 where a recommendation counts when its 'parent' category
# matches that of any product the user interacted with in the test split.
score = precision_at_k_by_category(user_profiles, item_profiles, test, df_products, top_k=10)
print(f'Precision@10_2: {score:.4f}')


Precision@10_1: 0.0320
Precision@10_2: 0.7240


# ***Demo***

In [18]:
def recommend_for_user(user_id, top_k=10):
    """Recommend the ``top_k`` unrated products most similar to a user's profile.

    Reads the notebook-level ``user_profiles``, ``item_profiles`` and
    ``df_rates`` tables. Products the user already has a row for in
    ``df_rates`` are removed before ranking.

    Args:
        user_id: Value to look up in ``user_profiles['customer_id']``.
        top_k: Maximum number of recommendations to return.

    Returns:
        DataFrame with 'product_id' and 'similarity' columns, sorted by
        descending cosine similarity.

    Raises:
        ValueError: If ``user_id`` has no row in ``user_profiles``.
    """
    # Look up the user's profile vector, failing early with a clear message
    # instead of passing an empty vector to the similarity computation.
    user_rows = user_profiles[user_profiles['customer_id'] == user_id]
    if user_rows.empty:
        raise ValueError(f"No profile found for customer_id={user_id!r}")
    user_vector = user_rows.drop(columns=['customer_id']).values.reshape(1, -1)

    # Item feature matrix.
    item_feature_matrix = item_profiles.drop(columns=['product_id']).values

    # Cosine similarity between the user and every item.
    similarities = cosine_similarity(user_vector, item_feature_matrix)[0]

    # Pair each similarity with its product id.
    sim_df = pd.DataFrame({
        'product_id': item_profiles['product_id'].values,
        'similarity': similarities
    })

    # Drop products the user has already rated.
    rated_products = df_rates[df_rates['customer_id'] == user_id]['product_id'].tolist()
    sim_df = sim_df[~sim_df['product_id'].isin(rated_products)]

    # Rank by similarity and keep the top_k.
    top_recommendations = sim_df.sort_values(by='similarity', ascending=False).head(top_k)

    return top_recommendations[['product_id', 'similarity']]


In [19]:
# Reload the products table and build a product id -> 'parent' category lookup.
df_products = pd.read_csv('Data//products.csv')
product_to_category = df_products.set_index('id')['parent'].to_dict()

In [20]:
# Top-10 recommendations for customer 1; the products table gets a
# 'product_id' column name so it can be merged on that key.
top_recommendations = recommend_for_user(1, 10)
df_products_renamed = df_products.rename(columns={'id': 'product_id'})

# Merge to attach extra product information to each recommendation
top_recommendations = top_recommendations.merge(
    df_products_renamed[['product_id', 'category', 'parent', 'group', 'brand']],
    on='product_id',
    how='left'
)
top_recommendations

Unnamed: 0,product_id,similarity,category,parent,group,brand
0,20,0.669916,Dress,Dresses,Women's Fashion,H&M
1,141,0.669916,Dress,Dresses,Women's Fashion,H&M
2,37,0.635488,Wool Coat,Outerwear,Women's Fashion,H&M
3,53,0.635488,Flip-Flops,Shoes,Women's Fashion,H&M
4,12,0.635488,Denim Jacket,Outerwear,Women's Fashion,H&M
5,173,0.635488,Culottes,Pants,Women's Fashion,H&M
6,133,0.606798,Hoodie,Outerwear,Women's Fashion,Nike
7,16,0.59317,Sneakers,Shoes,Women's Fashion,Nike
8,19,0.565197,Sneakers,Shoes,Women's Fashion,Adidas
9,190,0.534355,Wide-Leg Pants,Pants,Women's Fashion,Bloom
