# 讀取 dataset

In [35]:
import re
import numpy as np
from tqdm import tqdm
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

# 步驟 1：讀取 movie.txt
with open('../dataset/movie_new2.txt', 'r') as f:
    lines = f.readlines()

# 每筆觀看序列轉為電影 ID 的 list (ex: (910, 905, ...))

In [36]:
def extract_sequence(line):
    pairs = re.findall(r'\((\d+),\s*(\d+)\)', line)
    sequence = [int(pairs[0][0])] + [int(p[1]) for p in pairs]
    return sequence
sequences = [extract_sequence(line) for line in tqdm(lines, desc="Extracting sequences...")]

Extracting sequences...: 100%|██████████| 400000/400000 [00:03<00:00, 121818.77it/s]


# 轉為字串形式提供給 TF-IDF (ex: "910, 905, ...")

In [37]:
corpus = [' '.join(map(str, seq)) for seq in tqdm(sequences, desc="Building corpus...")]

Building corpus...: 100%|██████████| 400000/400000 [00:00<00:00, 438556.39it/s]


# 步驟 4：使用 TF-IDF 向量化（含 N-gram）

In [38]:
print("Building TF-IDF matrix...")
# vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2, 2))
# vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), min_df=5)  # 使用 bi-gram, 忽略少於 5 個 bi-gram的序列
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(2, 3), min_df=5)
X_tfidf = vectorizer.fit_transform(corpus)  # X_tfidf 是稀疏矩陣
print("finished building TF-IDF matrix.")
print("TF-IDF matrix shape:", X_tfidf.shape)

Building TF-IDF matrix...
finished building TF-IDF matrix.
TF-IDF matrix shape: (400000, 255617)


# 儲存 X_tfidf 成 .npz

In [39]:

sparse.save_npz("X_tfidf_sparse_2_3_min_df_5.npz", X_tfidf)

# 查看某個使用者的非零項目

In [40]:
user_index = 0  # 第幾位使用者
row = X_tfidf.getrow(user_index)  # 取出一行稀疏向量
nonzero_indices = row.nonzero()[1]  # 取得非零欄位的索引
feature_names = vectorizer.get_feature_names_out()

# 顯示所有非零特徵及其 TF-IDF 值
for idx in nonzero_indices:
    print(f"{feature_names[idx]}: {row[0, idx]:.4f}")

910 905: 0.2237
905 2600: 0.1798
2600 1535: 0.1570
1535 1060: 0.2337
1060 648: 0.2429
648 2876: 0.1669
2876 336: 0.2010
336 1278: 0.1891
1278 3905: 0.1847
3905 152: 0.1979
152 113: 0.1562
113 1346: 0.1739
1346 1221: 0.2114
1221 1342: 0.1528
905 2600 1535: 0.2216
2600 1535 1060: 0.2460
1535 1060 648: 0.2429
648 2876 336: 0.2288
336 1278 3905: 0.2378
1278 3905 152: 0.2320
3905 152 113: 0.2249
152 113 1346: 0.1961
1346 1221 1342: 0.2429
