# Data slicing

In [92]:
import numpy as np
# Load the saved cluster-label array from disk. Later cells treat the value
# -1 as "noise" and pair the labels positionally with the sorted movie ids.
# NOTE(review): presumably produced by an earlier clustering run — confirm
# it was computed over the same set of movies as this dataset.
labels = np.load("cluster_labels_min_df_5.npy")

In [93]:
from collections import Counter

# Tally how many points carry each label and print one summary line per
# label in ascending label order; the label -1 is reported as noise.
label_counts = Counter(labels)
for label in sorted(label_counts):
    count = label_counts[label]
    if label != -1:
        print(f"Cluster {label}: {count} points")
        continue
    print(f"Noise: {count} points")

Noise: 108746 points
Cluster 0: 69 points
Cluster 1: 97 points
Cluster 2: 89 points
Cluster 3: 159 points
Cluster 4: 78 points
Cluster 5: 181 points
Cluster 6: 250 points
Cluster 7: 493 points
Cluster 8: 1346 points
Cluster 9: 470 points
Cluster 10: 206 points
Cluster 11: 144 points
Cluster 12: 273 points
Cluster 13: 510 points
Cluster 14: 195 points
Cluster 15: 69 points
Cluster 16: 187 points
Cluster 17: 355 points
Cluster 18: 205 points
Cluster 19: 227 points
Cluster 20: 69 points
Cluster 21: 136 points
Cluster 22: 233 points
Cluster 23: 165 points
Cluster 24: 162 points
Cluster 25: 663 points
Cluster 26: 106 points
Cluster 27: 928 points
Cluster 28: 86 points
Cluster 29: 165 points
Cluster 30: 87 points
Cluster 31: 543 points
Cluster 32: 652 points
Cluster 33: 129 points
Cluster 34: 703 points
Cluster 35: 93 points
Cluster 36: 39189 points


In [94]:
import re
from tqdm import tqdm

# Read the whole dataset into memory: one string per line, with the trailing
# newline kept on each entry.
with open('../dataset/movie_new2.txt', 'r') as dataset_file:
    lines = list(dataset_file)

def extract_sequence(line):
    """Rebuild the id sequence encoded in one line of "(a, b)" pairs.

    Every "(a, b)" pair of non-negative integers found in ``line`` is read in
    order. The result is the first pair's ``a`` followed by the ``b`` of every
    pair, so "(1, 2)#(2, 3)" becomes ``[1, 2, 3]``.

    Args:
        line: One text line from the dataset file.

    Returns:
        A list of ints, or an empty list when ``line`` contains no pair
        (for example a blank or trailing empty line).
    """
    pairs = re.findall(r'\((\d+),\s*(\d+)\)', line)
    if not pairs:
        # Without this guard pairs[0] raises IndexError on blank or
        # unparsable lines and aborts the whole extraction.
        return []
    sequence = [int(pairs[0][0])] + [int(p[1]) for p in pairs]
    return sequence
# Decode every dataset line into its id sequence; tqdm reports progress
# while the list is being built.
sequences = list(map(extract_sequence, tqdm(lines, desc="Extracting sequences...")))

Extracting sequences...: 100%|██████████| 400000/400000 [00:04<00:00, 98454.75it/s] 


In [95]:
# Every distinct movie id that appears in any sequence, in ascending order.
movie_ids = sorted({movie_id for seq in sequences for movie_id in seq})
# Pair ids with labels by position. zip stops at the shorter input, so a
# length mismatch between movie_ids and labels is silently truncated.
# NOTE(review): assumes labels[i] belongs to the i-th smallest movie id —
# confirm against the step that produced the label file.
movie_id_to_cluster = dict(zip(movie_ids, labels))

In [96]:
from collections import defaultdict

# Invert the movie -> cluster mapping into cluster -> list of movie ids.
cluster_to_movies = defaultdict(list)
for movie_id, cluster_label in movie_id_to_cluster.items():
    # -1 is usually the noise label, so those movies are left out.
    if cluster_label == -1:
        continue
    cluster_to_movies[cluster_label].append(movie_id)

In [97]:
# For each cluster label, collect the sequences that contain at least one
# movie mapped to that label. A sequence is stored at most once per cluster.
cluster_to_user_sequences = defaultdict(list)
cluster_to_seen_sequences = defaultdict(set)  # per cluster: sequences already added

for seq in sequences:
    unique_movies = set(seq)

    # Skip sequences with fewer than 15 distinct movies.
    if len(unique_movies) < 15:
        continue

    # Every cluster label touched by at least one movie of this user.
    # NOTE(review): the noise label -1 is not filtered out here, unlike in
    # cluster_to_movies — confirm that this is intended.
    clusters_hit = {
        movie_id_to_cluster[movie_id]
        for movie_id in unique_movies
        if movie_id in movie_id_to_cluster
    }

    # Lists are not hashable, so a tuple copy serves as the dedup key.
    seq_key = tuple(seq)

    # Add the sequence to the dataset of each cluster it touches.
    for cluster in clusters_hit:
        already_seen = cluster_to_seen_sequences[cluster]
        if seq_key in already_seen:
            continue
        cluster_to_user_sequences[cluster].append(seq)
        already_seen.add(seq_key)

In [98]:
# Number of distinct cluster labels that received at least one sequence
# (the noise label -1 is counted as well if any kept sequence hit it).
print(len(cluster_to_user_sequences))

37


## Store the sliced data in one file per cluster (cluster_{id}.txt)

In [99]:
import os
import shutil

cluster_folder = "../dataset/cluster_movie"

# Start from a clean output folder: create it when it is missing, otherwise
# remove the regular files directly inside it (sub-directories are left alone).
if not os.path.exists(cluster_folder):
    os.makedirs(cluster_folder)
else:
    for entry_name in os.listdir(cluster_folder):
        entry_path = os.path.join(cluster_folder, entry_name)
        if os.path.isfile(entry_path):
            os.remove(entry_path)

In [100]:
# Write one text file per cluster, one sequence per line.
# The loop variable is deliberately NOT called `sequences`: a `for` target at
# notebook level rebinds the global name, which would replace the full
# sequence list built earlier and corrupt any re-run of the cells above.
for cluster_id, cluster_sequences in cluster_to_user_sequences.items():
    # Must point into the same directory as `cluster_folder`.
    filename = f"../dataset/cluster_movie/cluster_{cluster_id}.txt"
    with open(filename, "w", encoding="utf-8") as f:
        for seq in cluster_sequences:
            # Re-encode the sequence as consecutive "(a, b)" transitions
            # joined by "#", the same format the input file is parsed from.
            pairs = [f"({seq[i]}, {seq[i+1]})" for i in range(len(seq) - 1)]
            line = "#".join(pairs)
            f.write(line + "\n")