# Data slicing

In [157]:
import numpy as np
labels = np.load("cluster_labels_min_df_5.npy")

In [158]:
from collections import Counter

label_counts = Counter(labels)
for label, count in sorted(label_counts.items()):
    if label == -1:
        print(f"Noise: {count} points")
    else:
        print(f"Cluster {label}: {count} points")

Noise: 108746 points
Cluster 0: 69 points
Cluster 1: 97 points
Cluster 2: 89 points
Cluster 3: 159 points
Cluster 4: 78 points
Cluster 5: 181 points
Cluster 6: 250 points
Cluster 7: 493 points
Cluster 8: 1346 points
Cluster 9: 470 points
Cluster 10: 206 points
Cluster 11: 144 points
Cluster 12: 273 points
Cluster 13: 510 points
Cluster 14: 195 points
Cluster 15: 69 points
Cluster 16: 187 points
Cluster 17: 355 points
Cluster 18: 205 points
Cluster 19: 227 points
Cluster 20: 69 points
Cluster 21: 136 points
Cluster 22: 233 points
Cluster 23: 165 points
Cluster 24: 162 points
Cluster 25: 663 points
Cluster 26: 106 points
Cluster 27: 928 points
Cluster 28: 86 points
Cluster 29: 165 points
Cluster 30: 87 points
Cluster 31: 543 points
Cluster 32: 652 points
Cluster 33: 129 points
Cluster 34: 703 points
Cluster 35: 93 points
Cluster 36: 39189 points


In [159]:
import re
from tqdm import tqdm

with open('../dataset/movie_new2.txt', 'r') as f:
    lines = f.readlines()

def extract_sequence(line):
    pairs = re.findall(r'\((\d+),\s*(\d+)\)', line)
    sequence = [int(pairs[0][0])] + [int(p[1]) for p in pairs]
    return sequence
sequences = [extract_sequence(line) for line in tqdm(lines, desc="Extracting sequences...")]

Extracting sequences...: 100%|██████████| 400000/400000 [00:04<00:00, 86466.83it/s] 


In [160]:
movie_ids = sorted(set(movie_id for seq in sequences for movie_id in seq))
movie_id_to_cluster = {movie_id: cluster_label for movie_id, cluster_label in zip(movie_ids, labels)}

In [161]:
from collections import defaultdict

cluster_to_movies = defaultdict(list)
for m, c in movie_id_to_cluster.items():
    if c != -1:  # -1 通常是 noise
        cluster_to_movies[c].append(m)

In [162]:
cluster_to_user_sequences = defaultdict(list)
cluster_to_seen_sequences = defaultdict(set)  # 新增：追蹤每個 cluster 已經加入過哪些序列

for seq in sequences: 

    movie_list = set(seq)

    if(len(movie_list) < 15):
        continue

    # 找出此 user 涉及到的所有 cluster
    clusters_hit = set()
    for movie_id in movie_list:
        if movie_id in movie_id_to_cluster:
            clusters_hit.add(movie_id_to_cluster[movie_id])

    # 把該序列加到對應 cluster 的新資料集中
    for c in clusters_hit:
        seq_tuple = tuple(seq)  # list 無法 hash，轉成 tuple
        if seq_tuple not in cluster_to_seen_sequences[c]:
            cluster_to_user_sequences[c].append(seq)
            cluster_to_seen_sequences[c].add(seq_tuple)

In [169]:
print(len(cluster_to_user_sequences))

37


## 合併相似 cluster，讓所有 cluster 大小都大於 100000

In [172]:
from scipy.sparse import load_npz
X_tfidf = load_npz("../data_preprocessing/X_tfidf_sparse_min_df_5.npz")

In [196]:
from sklearn.metrics.pairwise import cosine_similarity

# 建立 cluster_centroids：每個 cluster 的 TF-IDF 中心向量
cluster_centroids = {}
for c in cluster_to_user_sequences:
    indices = [i for i, seq in enumerate(sequences) 
                if any(mid in movie_id_to_cluster and movie_id_to_cluster[mid] == c for mid in seq)]
    if indices:
        centroid = np.asarray(X_tfidf[indices].mean(axis=0))
        cluster_centroids[c] = (centroid)
# 合併小群
threshold = 10000
updated = True

while updated:
    updated = False

    # 找出所有小群（每次都重算，因為數量在變）
    small_clusters = [c for c in cluster_to_user_sequences if len(cluster_to_user_sequences[c]) < threshold]
    if not small_clusters:
        break

    for cid in small_clusters:
        if cid not in cluster_centroids:
            continue

        profile = np.asarray(cluster_centroids[cid])
        # 候選包含所有其他群（不限制大小）
        candidates = [other for other in cluster_centroids if other != cid]

        if not candidates:
            continue

        similarities = [
            cosine_similarity(profile, np.asarray(cluster_centroids[other]))[0, 0]
            for other in candidates
        ]
        best_idx = np.argmax(similarities)
        merge_to = candidates[best_idx]

        # 合併 sequence
        cluster_to_user_sequences[merge_to].extend(cluster_to_user_sequences[cid])

        # 更新中心向量
        indices_merge = [
            i for i, seq in enumerate(sequences)
            if any(mid in movie_id_to_cluster and movie_id_to_cluster[mid] == merge_to for mid in seq)
        ]
        cluster_centroids[merge_to] = np.asarray(X_tfidf[indices_merge].mean(axis=0))

        # 更新映射（movie_id_to_cluster）
        for mid, cl in movie_id_to_cluster.items():
            if cl == cid:
                movie_id_to_cluster[mid] = merge_to

        # 移除舊 cluster
        del cluster_to_user_sequences[cid]
        del cluster_centroids[cid]

        updated = True
        break  # 每次只做一個，重新檢查

In [184]:
print(len(small_clusters))

31


In [197]:
print(len(cluster_to_user_sequences))

19


## Store the slicing data to each cluster file (cluster_{id}.txt)

In [198]:
import os
import shutil

cluster_folder = "../dataset/cluster_movie"

# 若資料夾存在，先刪除其內容；若不存在則建立
if os.path.exists(cluster_folder):
    # 刪除裡面所有檔案
    for filename in os.listdir(cluster_folder):
        file_path = os.path.join(cluster_folder, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)
else:
    os.makedirs(cluster_folder)

In [199]:
for cluster_id, sequences in cluster_to_user_sequences.items():
    filename = f"../dataset/cluster_movie/cluster_{cluster_id}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        for seq in sequences:
            pairs = [f"({seq[i]}, {seq[i+1]})" for i in range(len(seq) - 1)]
            line = "#".join(pairs)
            f.write(line + "\n")