In [6]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
from scipy import sparse
from scipy.sparse import coo_matrix
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
import numpy as np

data_path = '../data'

# === 1. Load data ===
# The load statements were previously commented out, which left `df` and
# `title` undefined on a fresh kernel even though every later cell uses them.
# NOTE(review): this is a machine-specific absolute path taken from the
# original commented-out code — confirm the location (or move the file under
# `data_path`) before running on another machine.
source_csv = "C:/Users/user/Desktop/output/split_cluster_2_with_desc.csv"

# Member / product columns used to build the user-item matrix.
df = pd.read_csv(source_csv, usecols=['ShopMemberId', 'SalePageId'])

# Product ID and title columns, used to label the recommendations.
title = pd.read_csv(source_csv, usecols=['SalePageId', 'SalePageTitle'])
# Drop repeated SalePageId rows, keeping only the first occurrence of each.
title = title.drop_duplicates(subset='SalePageId', keep='first')


In [37]:
# Export the purchase records and the product-title lookup as UTF-8 CSV files
# (the DataFrame index is not written).
df.to_csv(
    path_or_buf='C:/Users/user/Desktop/member_item_count.csv',
    encoding='utf-8',
    index=False,
)
title.to_csv(
    path_or_buf='C:/Users/user/Desktop/SalePageTitle.csv',
    encoding='utf-8',
    index=False,
)


In [2]:
# Dataset overview: distinct products, distinct members, and total row count.
unique_product_count = df['SalePageId'].nunique()
print("不重複的商品數量（SalePageId）有:", unique_product_count)
unique_member_count = df['ShopMemberId'].nunique()
# Label corrected: this count is taken over ShopMemberId, not SalePageId.
print("不重複的會員數量（ShopMemberId）有:", unique_member_count)
print("資料集總筆數:", len(df))

不重複的商品數量（SalePageId）有: 35086
不重複的會員數量（SalePageId）有: 27401
資料集總筆數: 1097410


In [7]:
# === 2. Encode member and product IDs as integer codes ===
user_encoder = LabelEncoder()
user_ids = user_encoder.fit_transform(df['ShopMemberId'])

item_encoder = LabelEncoder()
item_ids = item_encoder.fit_transform(df['SalePageId'])


# === 3. Build the (sparse) purchase-count matrix ===
df['PurchaseCount'] = 1

# One row per (user_id, item_id) pair; 'count' is the number of source rows
# that had that pair.
user_item_counts = (
    pd.DataFrame({'user_id': user_ids, 'item_id': item_ids, 'count': 1})
    .groupby(['user_id', 'item_id'])
    .count()
    .reset_index()
)

# Assemble in COO form as (values, (rows, cols)), then convert to CSR.
sparse_matrix = coo_matrix(
    (
        user_item_counts['count'],
        (user_item_counts['user_id'], user_item_counts['item_id']),
    ),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_)),
).tocsr()

# Persist the user-item matrix to disk.
sparse.save_npz('C:/Users/user/Desktop/user_item_matrix.npz', sparse_matrix)


In [34]:
# === 4. Recommendation function (user-based, cosine similarity) ===
def recommend_items_sparse(target_user_id_str, sparse_matrix, user_encoder, item_encoder, top_n=5, n_neighbors=20):
    """Recommend items to one member from the purchases of similar members.

    The target member's row of ``sparse_matrix`` is compared with every row by
    cosine similarity. The purchase counts of the most similar members are
    summed per item, items the target already bought are removed, and the
    highest-scoring remaining items are returned.

    Parameters
    ----------
    target_user_id_str :
        Original (un-encoded) member ID; must be in ``user_encoder.classes_``.
    sparse_matrix :
        2-D scipy sparse matrix indexed as [encoded user, encoded item].
    user_encoder, item_encoder :
        Fitted encoders exposing ``classes_``, ``transform`` and
        ``inverse_transform``.
    top_n : int, default 5
        Maximum number of items to return.
    n_neighbors : int, default 20
        Maximum number of similar members whose purchases are aggregated.

    Returns
    -------
    Array of original item IDs, best first. It may hold fewer than ``top_n``
    entries (possibly none) when there are not enough items with a positive
    score.

    Raises
    ------
    ValueError
        If ``target_user_id_str`` is not a known member ID.
    """
    if target_user_id_str not in user_encoder.classes_:
        raise ValueError("會員 ID 不存在")

    user_idx = user_encoder.transform([target_user_id_str])[0]

    # Slice instead of integer-indexing so the row stays 2-D for every scipy
    # sparse container (cosine_similarity expects 2-D input).
    target_vector = sparse_matrix[user_idx:user_idx + 1]
    similarities = np.array(
        cosine_similarity(target_vector, sparse_matrix), dtype=float
    ).ravel()

    # Exclude the target member, then keep only neighbours that actually share
    # purchases. Without the `> 0` filter the list is padded with unrelated
    # (zero-similarity) members whose purchases would pollute the scores.
    similarities[user_idx] = 0.0
    neighbor_indices = np.argsort(-similarities, kind="stable")[:n_neighbors]
    neighbor_indices = neighbor_indices[similarities[neighbor_indices] > 0]
    if neighbor_indices.size == 0:
        return item_encoder.inverse_transform(np.array([], dtype=int))

    # Sum the neighbours' purchase counts per item. np.asarray(...).ravel()
    # works whether .sum() returns an np.matrix or a plain ndarray.
    summed_scores = np.asarray(
        sparse_matrix[neighbor_indices].sum(axis=0), dtype=float
    ).ravel()

    # Remove items the target member has already purchased.
    already_purchased = target_vector.toarray().ravel() > 0
    summed_scores[already_purchased] = 0.0

    # Highest score first; the stable sort makes tie order deterministic.
    # Zero-score items are dropped so that already-purchased or never-seen
    # items are not returned just to fill up `top_n`.
    recommended_item_indices = np.argsort(-summed_scores, kind="stable")[:top_n]
    recommended_item_indices = recommended_item_indices[
        summed_scores[recommended_item_indices] > 0
    ]

    # Map encoded item indices back to the original item IDs.
    return item_encoder.inverse_transform(recommended_item_indices)


In [35]:
# Member to generate recommendations for (replace with any member ID).
target_user_str_id = 'bvYeaSNec+BrbPJrB/wCHPY3qvNx72H2fAxXqy58vIc='
recommendations = recommend_items_sparse(
    target_user_str_id,
    sparse_matrix,
    user_encoder,
    item_encoder,
    top_n=5,
)

# The recommended SalePageId values returned by the recommender.
recommended_item_ids = recommendations

# Wrap the recommended IDs in a DataFrame so they can be joined with titles.
rec_df = pd.DataFrame({'SalePageId': recommended_item_ids})

# Left-join the titles so every recommended ID keeps its row.
rec_with_title = rec_df.merge(title, how='left', on='SalePageId')

print("推薦給會員", target_user_str_id, "的商品:")
# Print a 1-based ranked list of the recommended product titles.
for rank, sale_page_title in enumerate(rec_with_title['SalePageTitle'], start=1):
    print(rank, ". ", sale_page_title)


推薦給會員 bvYeaSNec+BrbPJrB/wCHPY3qvNx72H2fAxXqy58vIc= 的商品:
1 .  【11月首購限定】KIRAGURIN螺旋極細刷毛牙刷
2 .  Ora2me亮白香氛牙膏_沁心香橙薄荷香130g
3 .  李施德霖薄荷除菌漱口水1+1(500+250ml)
4 .  日本獅王細潔適齦佳極致8效漱口水600ml柑橘
5 .  日本獅王細潔適齦佳極效8效牙膏柑橘薄荷95g
