# Data slicing

In [25]:
import numpy as np
# Load the saved cluster-label array from disk.
# Later cells report the label -1 as noise and every other value as a cluster id.
# NOTE(review): presumably the output of an earlier clustering step, with one
# label per item in the same order used when building the id -> cluster map — confirm.
labels = np.load("cluster_labels.npy")

In [26]:
from collections import Counter

# Tally how many points carry each label and print one summary line per label,
# in ascending label order. The label -1 is reported as noise.
label_counts = Counter(labels)
for label in sorted(label_counts):
    count = label_counts[label]
    message = f"Noise: {count} points" if label == -1 else f"Cluster {label}: {count} points"
    print(message)

Noise: 389813 points
Cluster 0: 50 points
Cluster 1: 24 points
Cluster 2: 125 points
Cluster 3: 50 points
Cluster 4: 93 points
Cluster 5: 38 points
Cluster 6: 28 points
Cluster 7: 28 points
Cluster 8: 71 points
Cluster 9: 104 points
Cluster 10: 46 points
Cluster 11: 141 points
Cluster 12: 138 points
Cluster 13: 46 points
Cluster 14: 54 points
Cluster 15: 21 points
Cluster 16: 24 points
Cluster 17: 79 points
Cluster 18: 79 points
Cluster 19: 28 points
Cluster 20: 22 points
Cluster 21: 65 points
Cluster 22: 45 points
Cluster 23: 31 points
Cluster 24: 186 points
Cluster 25: 22 points
Cluster 26: 200 points
Cluster 27: 38 points
Cluster 28: 89 points
Cluster 29: 28 points
Cluster 30: 42 points
Cluster 31: 216 points
Cluster 32: 136 points
Cluster 33: 88 points
Cluster 34: 36 points
Cluster 35: 28 points
Cluster 36: 194 points
Cluster 37: 167 points
Cluster 38: 93 points
Cluster 39: 217 points
Cluster 40: 395 points
Cluster 41: 20 points
Cluster 42: 27 points
Cluster 43: 70 points
Cluster 4

In [27]:
import re
from tqdm import tqdm

# Read the whole input file into memory, one list entry per line.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default and matches the UTF-8 encoding used when the per-cluster files
# are written.
with open('../dataset/movie_new2.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

def extract_sequence(line):
    """Rebuild the list of integer ids encoded in one line of ``(a, b)`` pairs.

    Every ``(<digits>, <digits>)`` occurrence in ``line`` is matched; any text
    between the pairs is ignored. The result is the first number of the first
    pair followed by the second number of every pair, e.g.
    ``"(1, 2)#(2, 3)"`` -> ``[1, 2, 3]``.

    Args:
        line: One text line containing zero or more ``(a, b)`` pairs.

    Returns:
        A list of ints. The list is empty when the line contains no pair
        (for example a blank line), instead of raising ``IndexError``.
    """
    pairs = re.findall(r'\((\d+),\s*(\d+)\)', line)
    if not pairs:
        # Nothing to decode; indexing pairs[0] below would fail.
        return []
    first_id = int(pairs[0][0])
    return [first_id] + [int(second) for _, second in pairs]
# Decode every raw line into its id sequence; tqdm only adds the progress bar.
sequences = list(map(extract_sequence, tqdm(lines, desc="Extracting sequences...")))

Extracting sequences...: 100%|██████████| 400000/400000 [00:02<00:00, 172071.57it/s]


In [28]:
# Every distinct id that appears in any sequence, in ascending order.
movie_ids = sorted({movie_id for seq in sequences for movie_id in seq})

# Pair the i-th sorted id with the i-th cluster label.
# NOTE(review): zip stops at the shorter input, so this assumes `labels` has one
# entry per id in exactly this sorted order — confirm against the clustering step.
movie_id_to_cluster = dict(zip(movie_ids, labels))

In [29]:
from collections import defaultdict

# Invert the id -> cluster map into cluster -> list of ids.
cluster_to_movies = defaultdict(list)
for movie_id, cluster_label in movie_id_to_cluster.items():
    # -1 is typically the noise label, so those ids are left out.
    if cluster_label == -1:
        continue
    cluster_to_movies[cluster_label].append(movie_id)

In [36]:
# Length of every parsed sequence, then a couple of summary statistics.
all_lengths = list(map(len, sequences))
longest = max(all_lengths)
print(f"Max length: {longest}")
p95 = np.percentile(all_lengths, 95)
print(f"95th percentile:", p95)

# Use the 90th-percentile length (truncated to int) as the cut-off, capped at 32.
p90 = np.percentile(all_lengths, 90)
top_kc = min(32, int(p90))

Max length: 15
95th percentile: 15.0


In [39]:
# Group sequences by cluster: a sequence is attached to every cluster that at
# least one of its ids maps to.
cluster_to_user_sequences = defaultdict(list)
for seq in sequences:
    # Only sequences with at least 15 entries are kept.
    if len(seq) >= 15:
        # Distinct clusters touched by this sequence (one user's history,
        # going by the original comments).
        clusters_hit = {
            movie_id_to_cluster[movie_id]
            for movie_id in set(seq)
            if movie_id in movie_id_to_cluster
        }

        # Append the sequence to the dataset of each cluster it touches.
        # NOTE(review): label -1 is not filtered out here, so noise gets its own
        # group as well — confirm that this is intended.
        for cluster_label in clusters_hit:
            cluster_to_user_sequences[cluster_label].append(seq)

In [40]:
print(len(cluster_to_user_sequences))

546


## Store the sliced data in one file per cluster (cluster_{id}.txt)

In [41]:

import os

# Make sure the output directory exists so the first open() cannot fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("../dataset/cluster_movie", exist_ok=True)

# Write one file per cluster. Each sequence becomes one line of consecutive
# "(a, b)" pairs joined by "#".
# The loop variable is deliberately NOT called `sequences`: reusing that name
# would overwrite the notebook-level list of all parsed sequences and silently
# break any earlier cell that is re-run afterwards.
for cluster_id, cluster_sequences in cluster_to_user_sequences.items():
    filename = f"../dataset/cluster_movie/cluster_{cluster_id}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        for seq in cluster_sequences:
            # Adjacent elements form the pairs: (s0, s1), (s1, s2), ...
            pairs = [f"({seq[i]}, {seq[i+1]})" for i in range(len(seq) - 1)]
            f.write("#".join(pairs) + "\n")