# Clustering
This Jupyter notebook builds a high-precision document clustering analysis system. First, through a “dual-validation” strategy that pairs the Scikit-learn standard library with a manual implementation, we establish a mathematically equivalent TF-IDF vectorization model. We then apply Latent Semantic Analysis (LSA) to reduce the dimensionality of the high-dimensional sparse matrix and remove noise from it. Combined with the MiniBatch K-Means algorithm, this enables efficient unsupervised clustering and topic segmentation of massive document datasets.


To further uncover deep semantic relationships, the system incorporates Sentence Transformer-based semantic encoding and UMAP manifold dimension reduction techniques. 

# TF-IDF Clustering

- This module constructs a document vectorization model based on TF-IDF to provide core support for downstream similarity retrieval tasks. To ensure the accuracy and controllability of the algorithm implementation, the system adopts a “two-way implementation” strategy: the Scikit-learn standard library is used for the production-level build, while a manually implemented MyTfidfVectorizer is maintained to validate the algorithmic logic. A strict consistency check ensures that the two implementations produce numerically equivalent results.


In [2]:
import sys
from pathlib import Path
import os

# Resolve the project root (parent of the notebook's working directory) and the
# folder holding the local TF-IDF modules (e.g. my_tfidf).
ROOT_DIR = Path("..").resolve()
SRC_TFIDF_DIR = ROOT_DIR.joinpath("src", "tf_idf")

# Append both locations to sys.path, skipping any entry that is already present,
# so the local modules can be imported from later cells.
sys.path += [
    candidate
    for candidate in (str(ROOT_DIR), str(SRC_TFIDF_DIR))
    if candidate not in sys.path
]


In [None]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from pathlib import Path
# Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()
LSA_INPUT_PATH = ROOT_DIR.joinpath("data", "lsa", "lsa_reduced.npz")
CLUSTER_LABELS_PATH = ROOT_DIR.joinpath("data", "lsa", "cluster_labels.npy")

# Number of clusters requested from MiniBatchKMeans
K_FIXED = 40

def main():
    """Cluster the LSA-reduced document matrix with MiniBatchKMeans.

    Loads ``X_reduced`` from LSA_INPUT_PATH, fits K_FIXED clusters, prints the
    inertia and a silhouette estimate, saves the labels to CLUSTER_LABELS_PATH
    and prints the size of every cluster.
    """
    # The silhouette coefficient is built from pairwise distances, so scoring
    # every document is quadratic in the corpus size. Estimate it on a
    # fixed-seed random subsample instead.
    silhouette_sample_size = 10_000

    # 1) Load LSA reduced matrix; the context manager closes the .npz archive
    with np.load(LSA_INPUT_PATH) as data:
        X = data['X_reduced']
    print(f"[i] Loaded LSA matrix: shape={X.shape}")

    # 2) Fit the clustering model and assign every document to a cluster
    print(f"[i] Running MiniBatchKMeans with fixed K={K_FIXED}")
    kmeans = MiniBatchKMeans(n_clusters=K_FIXED, init='k-means++', n_init=10, random_state=42)
    final_labels = kmeans.fit_predict(X)

    # 3) Quality metrics
    inertia = kmeans.inertia_
    sample_size = min(silhouette_sample_size, X.shape[0])
    silhouette = silhouette_score(X, final_labels, sample_size=sample_size, random_state=42)
    print(f"[i] Inertia={inertia:.2f}, Silhouette={silhouette:.4f}")
    print(f"[i] Silhouette estimated on a random sample of {sample_size:,} documents")

    np.save(CLUSTER_LABELS_PATH, final_labels)
    print(f"[i] Cluster labels saved to: {CLUSTER_LABELS_PATH}")

    # 4) Output final clustering results: number of documents per cluster
    unique, counts = np.unique(final_labels, return_counts=True)
    print("Final clustering results:")
    for cid, count in zip(unique, counts):
        print(f" - Cluster {cid}: {count} papers")

if __name__ == "__main__":
    main()


## Build TF-IDF (Standard)

- The data pipeline takes the pre-processed text (title and abstract) in JSONL format as input and performs feature extraction with a uniform set of configuration parameters. The vocabulary size is capped at 100,000, and unigrams are combined with bigrams to capture phrase-level features. For noise control, the module loads a custom stopword list and uses min_df=5 and max_df=0.8 to remove very low-frequency long-tail terms and overly frequent general-purpose words.

In [2]:
"""
Build TF-IDF sparse matrix + vectorizer; save to disk; also export doc index -> (id, title) mapping.

dependency:
  pip install scikit-learn scipy joblib tqdm

Input:
  A JSONL file. Each line is one paper with field "processed_content" (title+abstract, already cleaned).

Output:
  - tfidf_matrix.npz: scipy csr_matrix
  - tfidf_vectorizer.joblib: Trained TfidfVectorizer, use for future transform/query
  - doc_ids.npy / doc_titles.npy: Arrays of IDs / titles corresponding to each row of the matrix
  - Terminal output: Scale information, non-zero feature statistics for the first few documents
"""

import json
import os
import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import joblib
from pathlib import Path
# Config

# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()

# Input corpus (one JSON record per line) and custom stopword list
INPUT_JSONL = ROOT_DIR.joinpath(
    "data", "preprocess", "arxiv-cs-data-with-citations-final-dataset_preprocessed.json"
)
CUSTOM_STOPWORDS_PATH = ROOT_DIR.joinpath("src", "custom_stopwords.txt")

# Output artifacts, all written under OUT_DIR
OUT_DIR = ROOT_DIR.joinpath("data", "tf_idf")
TFIDF_NPZ_PATH = OUT_DIR.joinpath("tfidf_matrix.npz")
VECTORIZER_PKL_PATH = OUT_DIR.joinpath("tfidf_vectorizer.joblib")
DOC_IDS_NPY = OUT_DIR.joinpath("doc_ids.npy")
DOC_TITLES_NPY = OUT_DIR.joinpath("doc_titles.npy")

# Recommended params for ~900k CS papers (tune if needed)
def load_custom_stopwords(path: Path) -> list[str]:
    """Read a stopword file and return its unique entries in sorted order.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are ignored. If ``path`` does not exist, a warning is
    printed and an empty list is returned.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            stripped = (raw.strip() for raw in handle)
            unique_tokens = {tok for tok in stripped if tok and not tok.startswith("#")}
        print(f"[i] Loaded {len(unique_tokens)} custom stopwords from {path}")
        return sorted(unique_tokens)

    print(f"[warn] Custom stopword file not found: {path}")
    return []


# Keyword arguments passed to TfidfVectorizer in main()
VECTORIZER_KW = {
    "max_df": 0.8,            # drop terms that occur in more than 80% of docs
    "min_df": 5,              # drop terms that occur in fewer than 5 docs
    "max_features": 100_000,  # cap vocab size to control memory
    "ngram_range": (1, 2),    # unigrams + bigrams; (1, 1) would keep unigrams only and shrink the vocabulary
    "sublinear_tf": True,     # log/sublinear TF scaling
    "norm": "l2",             # L2-normalise each document vector
    "dtype": np.float32,      # halve memory vs float64
    "stop_words": None,       # data is already cleaned and stemmed; no generic stop-word list is applied here
    "lowercase": False,       # processed_content is already lower-cased
}

# Sample inspection
PRINT_TOP_N_DOCS = 5   # inspect first N docs
TOP_TERMS_PER_DOC = 5  # show top-K terms per doc


def read_corpus_and_meta(jsonl_path):
    """Read a JSONL corpus and return ``(texts, ids, titles)``.

    Each non-blank line is parsed as one JSON record. The fields
    ``processed_content``, ``id`` and ``title`` are extracted; a missing or
    falsy field becomes an empty string.

    Lines that are not valid JSON, or that do not decode to a JSON object,
    are skipped. The number of skipped lines is reported once at the end so
    data loss is visible without flooding the output.

    Returns:
        texts: list of document strings.
        ids: numpy array of paper ids, aligned with ``texts``.
        titles: numpy array of titles, aligned with ``texts``.
    """
    texts, ids, titles = [], [], []
    skipped = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for raw_line in tqdm(f, desc="Reading JSONL"):
            line = raw_line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # A valid JSON scalar or array has no .get(); treat it as malformed
            if not isinstance(rec, dict):
                skipped += 1
                continue
            texts.append(rec.get("processed_content") or "")
            ids.append(rec.get("id") or "")
            titles.append(rec.get("title") or "")
    if skipped:
        print(f"[warn] Skipped {skipped:,} malformed or non-object JSON lines in {jsonl_path}")
    return texts, np.array(ids), np.array(titles)


def main():
    """Build the TF-IDF matrix with scikit-learn and persist all artifacts.

    Steps: read the corpus, fit and transform a TfidfVectorizer, save the
    sparse matrix, the fitted vectorizer and the id/title arrays under
    OUT_DIR, then print the highest-weighted terms of the first few documents
    as a sanity check.
    """
    os.makedirs(OUT_DIR, exist_ok=True)

    # Work on a per-run copy so the shared module-level VECTORIZER_KW is not
    # mutated (a stale stop-word list would otherwise leak into later runs).
    vectorizer_kw = dict(VECTORIZER_KW)
    custom_stopwords = load_custom_stopwords(CUSTOM_STOPWORDS_PATH)
    if custom_stopwords:
        vectorizer_kw["stop_words"] = custom_stopwords

    # 1) Read texts & metadata
    texts, ids, titles = read_corpus_and_meta(INPUT_JSONL)
    print(f"[i] Loaded documents: {len(texts):,}")

    # 2) TF-IDF: fit + transform
    vectorizer = TfidfVectorizer(**vectorizer_kw)
    X = vectorizer.fit_transform(texts)
    print(f"[i] TF-IDF shape: {X.shape}, nnz={X.nnz:,}, dtype={X.dtype}, type={type(X)}")

    # 3) Persist to disk
    sparse.save_npz(TFIDF_NPZ_PATH, X)
    joblib.dump(vectorizer, VECTORIZER_PKL_PATH)
    np.save(DOC_IDS_NPY, ids)
    np.save(DOC_TITLES_NPY, titles)
    print(f"[i] Saved: {TFIDF_NPZ_PATH}")
    print(f"[i] Saved: {VECTORIZER_PKL_PATH}")
    print(f"[i] Saved: {DOC_IDS_NPY}, {DOC_TITLES_NPY}")

    # 4) Quick sanity check: show top terms for first few docs
    feats = vectorizer.get_feature_names_out()
    for di in range(min(PRINT_TOP_N_DOCS, X.shape[0])):
        row = X[di]
        idx = row.indices  # column (feature) index of each stored value
        val = row.data     # TF-IDF weight of each stored value
        if val.size == 0:
            print(f"Doc#{di} ({ids[di]}): <empty vector>")
            continue
        # Positions of the largest weights, in descending order
        order = np.argsort(-val)[:TOP_TERMS_PER_DOC]
        top_terms = [(feats[idx[j]], float(val[j])) for j in order]
        print(f"Doc#{di} ({ids[di]}) '{titles[di]}' top terms:", top_terms)

if __name__ == "__main__":
    main()

[i] Loaded 465 custom stopwords from /work3/s242644/ds/PaperTrail/src/custom_stopwords.txt


Reading JSONL: 732367it [00:14, 51120.80it/s]


[i] Loaded documents: 732,367




[i] TF-IDF shape: (732367, 100000), nnz=72,361,100, dtype=float32, type=<class 'scipy.sparse._csr.csr_matrix'>
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf/tfidf_matrix.npz
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf/tfidf_vectorizer.joblib
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf/doc_ids.npy, /work3/s242644/ds/PaperTrail/data/tf_idf/doc_titles.npy
Doc#0 (2107.12674) 'Vision-Guided Forecasting -- Visual Context for Multi-Horizon Time
  Series Forecasting' top terms: [('forecast', 0.1998753696680069), ('horizon', 0.18677020072937012), ('vehicl', 0.16268596053123474), ('visual multi', 0.1325095146894455), ('face camera', 0.13044185936450958)]
Doc#1 (2107.12675) 'Feature Fusion Methods for Indexing and Retrieval of Biometric Data:
  Application to Face Recognition with Privacy Protection' top terms: [('biometr', 0.25766581296920776), ('biometr identif', 0.24242226779460907), ('templat', 0.19307062029838562), ('protect', 0.16008898615837097), ('homomorph encrypt',

## Build TF-IDF (Manual)
- This section repeats the same pipeline with the manually implemented MyTfidfVectorizer. It takes the pre-processed text (title and abstract) in JSONL format as input and performs feature extraction with the same configuration parameters as the standard build. The vocabulary size is capped at 100,000, and unigrams are combined with bigrams to capture phrase-level features. For noise control, the module loads a custom stopword list and uses min_df=5 and max_df=0.8 to remove very low-frequency long-tail terms and overly frequent general-purpose words.


In [3]:
"""
Build TF-IDF sparse matrix + vectorizer; save to disk; also export doc index -> (id, title) mapping.

dependency:
  pip install scikit-learn scipy joblib tqdm

Input:
  A JSONL file. Each line is one paper with field "processed_content" (title+abstract, already cleaned).

Output:
  - tfidf_matrix.npz: scipy csr_matrix
  - tfidf_vectorizer.joblib: Trained MyTfidfVectorizer (manual implementation), use for future transform/query
  - doc_ids.npy / doc_titles.npy: Arrays of IDs / titles corresponding to each row of the matrix
  - Terminal output: Scale information, non-zero feature statistics for the first few documents
"""

import json
import os
import numpy as np
from tqdm import tqdm
from pathlib import Path
from scipy import sparse
from my_tfidf import MyTfidfVectorizer
import joblib

# 1) Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()

# Input corpus (one JSON record per line) and custom stopword list
INPUT_JSONL = ROOT_DIR.joinpath(
    "data", "preprocess", "arxiv-cs-data-with-citations-final-dataset_preprocessed.json"
)
CUSTOM_STOPWORDS_PATH = ROOT_DIR.joinpath("src", "custom_stopwords.txt")

# Output artifacts of the manual implementation, kept apart from data/tf_idf
OUT_DIR = ROOT_DIR.joinpath("data", "tf_idf_manual")
TFIDF_NPZ_PATH = OUT_DIR.joinpath("tfidf_matrix.npz")
VECTORIZER_PKL_PATH = OUT_DIR.joinpath("tfidf_vectorizer.joblib")
DOC_IDS_NPY = OUT_DIR.joinpath("doc_ids.npy")
DOC_TITLES_NPY = OUT_DIR.joinpath("doc_titles.npy")

# Recommended params for ~900k CS papers (tune if needed)
def load_custom_stopwords(path: Path) -> list[str]:
    """Return the sorted, de-duplicated stopwords listed in ``path``.

    Lines are stripped; empty lines and ``#`` comment lines are skipped.
    A missing file yields a warning and an empty list.
    """
    file_missing = not path.exists()
    if file_missing:
        print(f"[warn] Custom stopword file not found: {path}")
        return []

    collected = set()
    with path.open("r", encoding="utf-8") as src:
        for raw in src:
            entry = raw.strip()
            # Keep only real entries: non-empty and not a comment
            if entry and not entry.startswith("#"):
                collected.add(entry)

    print(f"[i] Loaded {len(collected)} custom stopwords from {path}")
    ordered = list(collected)
    ordered.sort()
    return ordered


# Keyword arguments passed to MyTfidfVectorizer in main(); same values as the
# scikit-learn build so that the two outputs can be compared.
VECTORIZER_KW = {
    "max_df": 0.8,            # upper document-frequency cut-off (80% of docs)
    "min_df": 5,              # lower document-frequency cut-off (5 docs)
    "max_features": 100_000,  # cap vocab size to control memory
    "ngram_range": (1, 2),    # unigrams + bigrams; (1, 1) would keep unigrams only and shrink the vocabulary
    "sublinear_tf": True,     # log/sublinear TF scaling
    "norm": "l2",             # L2-normalise each document vector
    "dtype": np.float32,      # requested output dtype (halves memory vs float64)
    "stop_words": None,       # data is already cleaned and stemmed; no generic stop-word list is applied here
    "lowercase": False,       # processed_content is already lower-cased
}

# Sample inspection
PRINT_TOP_N_DOCS = 5   # inspect first N docs
TOP_TERMS_PER_DOC = 5  # show top-K terms per doc


def read_corpus_and_meta(jsonl_path):
    """Read a JSONL corpus and return ``(texts, ids, titles)``.

    Each non-blank line is parsed as one JSON record. The fields
    ``processed_content``, ``id`` and ``title`` are extracted; a missing or
    falsy field becomes an empty string.

    Lines that are not valid JSON, or that do not decode to a JSON object,
    are skipped. The number of skipped lines is reported once at the end so
    data loss is visible without flooding the output.

    Returns:
        texts: list of document strings.
        ids: numpy array of paper ids, aligned with ``texts``.
        titles: numpy array of titles, aligned with ``texts``.
    """
    texts, ids, titles = [], [], []
    skipped = 0
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for raw_line in tqdm(f, desc="Reading JSONL"):
            line = raw_line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # A valid JSON scalar or array has no .get(); treat it as malformed
            if not isinstance(rec, dict):
                skipped += 1
                continue
            texts.append(rec.get("processed_content") or "")
            ids.append(rec.get("id") or "")
            titles.append(rec.get("title") or "")
    if skipped:
        print(f"[warn] Skipped {skipped:,} malformed or non-object JSON lines in {jsonl_path}")
    return texts, np.array(ids), np.array(titles)


def main():
    """Build the TF-IDF matrix with the manual MyTfidfVectorizer and persist it.

    Steps: read the corpus, fit and transform a MyTfidfVectorizer, save the
    sparse matrix, the fitted vectorizer and the id/title arrays under
    OUT_DIR, then print the highest-weighted terms of the first few documents
    as a sanity check.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Work on a per-run copy so the shared module-level VECTORIZER_KW is not
    # mutated (a stale stop-word list would otherwise leak into later runs).
    vectorizer_kw = dict(VECTORIZER_KW)
    custom_stopwords = load_custom_stopwords(CUSTOM_STOPWORDS_PATH)
    if custom_stopwords:
        vectorizer_kw["stop_words"] = custom_stopwords

    # 1) Read texts & metadata
    texts, ids, titles = read_corpus_and_meta(INPUT_JSONL)
    print(f"[i] Loaded documents: {len(texts):,}")

    # 2) TF-IDF: fit + transform
    vectorizer = MyTfidfVectorizer(**vectorizer_kw)
    X = vectorizer.fit_transform(texts)
    print(f"[i] TF-IDF shape: {X.shape}, nnz={X.nnz:,}, dtype={X.dtype}, type={type(X)}")

    # 3) Persist to disk
    sparse.save_npz(TFIDF_NPZ_PATH, X)
    joblib.dump(vectorizer, VECTORIZER_PKL_PATH)
    np.save(DOC_IDS_NPY, ids)
    np.save(DOC_TITLES_NPY, titles)
    print(f"[i] Saved: {TFIDF_NPZ_PATH}")
    print(f"[i] Saved: {VECTORIZER_PKL_PATH}")
    print(f"[i] Saved: {DOC_IDS_NPY}, {DOC_TITLES_NPY}")

    # 4) Quick sanity check: show top terms for first few docs
    feats = vectorizer.get_feature_names_out()
    for di in range(min(PRINT_TOP_N_DOCS, X.shape[0])):
        row = X[di]
        idx = row.indices  # column (feature) index of each stored value
        val = row.data     # TF-IDF weight of each stored value
        if val.size == 0:
            print(f"Doc#{di} ({ids[di]}): <empty vector>")
            continue
        # Positions of the largest weights, in descending order
        order = np.argsort(-val)[:TOP_TERMS_PER_DOC]
        top_terms = [(feats[idx[j]], float(val[j])) for j in order]
        print(f"Doc#{di} ({ids[di]}) '{titles[di]}' top terms:", top_terms)

if __name__ == "__main__":
    main()

[i] Loaded 465 custom stopwords from /work3/s242644/ds/PaperTrail/src/custom_stopwords.txt


Reading JSONL: 732367it [00:14, 51484.74it/s]


[i] Loaded documents: 732,367




[i] TF-IDF shape: (732367, 100000), nnz=72,361,100, dtype=float64, type=<class 'scipy.sparse._csr.csr_matrix'>
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf_manual/tfidf_matrix.npz
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf_manual/tfidf_vectorizer.joblib
[i] Saved: /work3/s242644/ds/PaperTrail/data/tf_idf_manual/doc_ids.npy, /work3/s242644/ds/PaperTrail/data/tf_idf_manual/doc_titles.npy
Doc#0 (2107.12674) 'Vision-Guided Forecasting -- Visual Context for Multi-Horizon Time
  Series Forecasting' top terms: [('forecast', 0.19987536629425323), ('horizon', 0.18677019627576547), ('vehicl', 0.16268595269368802), ('visual multi', 0.13250951186926718), ('face camera', 0.1304418637540388)]
Doc#1 (2107.12675) 'Feature Fusion Methods for Indexing and Retrieval of Biometric Data:
  Application to Face Recognition with Privacy Protection' top terms: [('biometr', 0.2576658227490619), ('biometr identif', 0.2424222594275143), ('templat', 0.19307061494616037), ('protect', 0.16008897706957

## Validation
- In order to verify the correctness of the manually implemented logic, the script compares the CSR sparse matrices, serialized Vectorizer objects, and metadata indexes generated by the manually implemented version with those generated by the native version of Scikit-learn.

In [4]:
import os
import numpy as np
from scipy import sparse
import joblib
import sys

from pathlib import Path



# 1) Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()

# The two artifact directories being compared
DIR_A = ROOT_DIR.joinpath("data", "tf_idf")
DIR_B = ROOT_DIR.joinpath("data", "tf_idf_manual")

# File names expected in both directories
MATRIX_FILE, VECTORIZER_FILE = "tfidf_matrix.npz", "tfidf_vectorizer.joblib"
IDS_FILE, TITLES_FILE = "doc_ids.npy", "doc_titles.npy"

# Largest absolute element-wise difference still treated as equal
TOLERANCE = 1e-7



def compare_matrices(file_a, file_b, atol=None):
    """Compare two ``.npz`` sparse matrices element-wise.

    Args:
        file_a: path of the first matrix file.
        file_b: path of the second matrix file.
        atol: largest absolute difference still treated as equal. Defaults to
            the module-level TOLERANCE when omitted.

    Returns:
        True when both files load, have the same shape and every element
        differs by at most ``atol``; False otherwise. The reason for a
        failure is printed.
    """
    if atol is None:
        atol = TOLERANCE

    try:
        A = sparse.load_npz(file_a)
        B = sparse.load_npz(file_b)
    except FileNotFoundError as e:
        print(f"  [error] file not found: {e.filename}")
        return False

    # 1. check shape
    if A.shape != B.shape:
        print(f"  [failed] shape mismatch: {A.shape} vs {B.shape}")
        return False

    # 2. check values
    try:
        C = A - B
    except Exception as e:
        print(f"  [failed] matrix cannot be subtracted (format may be different?): {e}")
        return False

    if C.nnz == 0:
        # no stored differences at all: the matrices are exactly equal
        return True

    # Count only the entries that exceed the tolerance. C.nnz would also
    # include harmless rounding differences and overstate the problem.
    # "not (x <= atol)" makes NaN count as a mismatch.
    abs_diff = np.abs(C.data)
    n_bad = int(np.count_nonzero(~(abs_diff <= atol)))
    if n_bad:
        print(f"  [failed] matrix has {n_bad} values with difference (greater than {atol}).")
        print(f"  for example, the maximum difference value: {np.max(abs_diff)}")
    return n_bad == 0

def compare_vectorizers(file_a, file_b):
    """Compare two serialized (``.joblib``) vectorizers.

    The vocabularies must be equal and the feature-name arrays must match in
    content and order. Returns False when a file is missing or a mismatch is
    found. If the feature names cannot be compared because of an exception,
    a warning is printed and the result is still True.
    """
    try:
        vec_a, vec_b = joblib.load(file_a), joblib.load(file_b)
    except FileNotFoundError as e:
        print(f"  [error] file not found: {e.filename}")
        return False

    # 1. vocabulary mapping (most important)
    vocab_a, vocab_b = vec_a.vocabulary_, vec_b.vocabulary_
    if vocab_a != vocab_b:
        print(f"  [failed] vocabulary (vocabulary_) is inconsistent.")
        size_a, size_b = len(vocab_a), len(vocab_b)
        if size_a != size_b:
            print(f"  vocabulary size is different: {size_a} vs {size_b}")
        return False

    # 2. feature names, where the order must agree as well
    try:
        names_match = np.array_equal(
            vec_a.get_feature_names_out(),
            vec_b.get_feature_names_out(),
        )
    except Exception as e:
        # Best effort: a failure here is reported but does not fail the check
        print(f"  [warning] error comparing feature_names_out: {e}")
        return True

    if not names_match:
        print(f"  [failed] feature names (feature_names_out) are inconsistent.")
        return False
    return True

def compare_npy_arrays(file_a, file_b):
    """Compare two ``.npy`` arrays for exact equality.

    Returns True when both files load and ``np.array_equal`` holds; otherwise
    prints the reason (missing file, differing content, differing shape) and
    returns False.
    """
    try:
        # allow_pickle is enabled just in case the arrays hold Python objects.
        # NOTE: unpickling can run arbitrary code, so only use trusted files.
        arr_a = np.load(file_a, allow_pickle=True)
        arr_b = np.load(file_b, allow_pickle=True)
    except FileNotFoundError as e:
        print(f"  [error] file not found: {e.filename}")
        return False

    if np.array_equal(arr_a, arr_b):
        return True

    print(f"  [failed] array content is inconsistent.")
    if arr_a.shape != arr_b.shape:
        print(f"  shape mismatch: {arr_a.shape} vs {arr_b.shape}")
    return False


def main():
    """Compare the TF-IDF artifacts in DIR_A and DIR_B and print a summary.

    Runs one comparison per artifact (matrix, vectorizer, doc ids, doc
    titles), prints whether each pair is consistent, and finishes with an
    overall verdict.
    """
    # (file name, comparison function, output indent). The indent strings
    # reproduce the existing per-file output prefixes unchanged.
    checks = [
        (MATRIX_FILE, compare_matrices, "   "),
        (VECTORIZER_FILE, compare_vectorizers, "  "),
        (IDS_FILE, compare_npy_arrays, "   "),
        (TITLES_FILE, compare_npy_arrays, "   "),
    ]

    print(f"--- Comparing directories ---")
    print(f"A: {DIR_A}")
    print(f"B: {DIR_B}")

    all_good = True
    for step, (filename, compare, indent) in enumerate(checks, start=1):
        print(f"\n[{step}] compare {filename}...")
        path_a = os.path.join(DIR_A, filename)
        path_b = os.path.join(DIR_B, filename)
        if compare(path_a, path_b):
            print(f"{indent}{filename} is consistent.")
        else:
            print(f"{indent}{filename} is inconsistent.")
            all_good = False

    print("\n" + "="*30)
    print("--- summary ---")
    if all_good:
        print(" all TF-IDF related files in both directories are consistent.")
    else:
        # Point at the markers the comparison functions actually print
        print("differences found between the two directories. Please check the [failed]/[error] messages above.")
    print("="*30)

if __name__ == "__main__":
    main()



--- Comparing directories ---
A: /work3/s242644/ds/PaperTrail/data/tf_idf
B: /work3/s242644/ds/PaperTrail/data/tf_idf_manual

[1] compare tfidf_matrix.npz...
   tfidf_matrix.npz is consistent.

[2] compare tfidf_vectorizer.joblib...
  tfidf_vectorizer.joblib is consistent.

[3] compare doc_ids.npy...
   doc_ids.npy is consistent.

[4] compare doc_titles.npy...
   doc_titles.npy is consistent.

--- summary ---
 all TF-IDF related files in both directories are consistent.


# lsa-kmeans
- This module aims to reduce the dimensionality and remove noise from high-dimensional sparse TF-IDF matrices through Latent Semantic Analysis (LSA), extracting latent topic features from documents. Based on this, the K-Means algorithm is applied to perform unsupervised document clustering. The system comprises a complete pipeline: dimensionality selection detection, SVD decomposition, clustering execution, centroid backprojection interpretation, and density-based hierarchical refinement.

## LSA Clustering

LSA maps the high-dimensional sparse TF-IDF matrix to a low-dimensional latent semantic space via singular value decomposition (SVD). TruncatedSVD processes the sparse matrix directly, compressing the high-dimensional feature space (100,000 dimensions) into a low-dimensional semantic space (n_components=1000). The reduced matrix X_reduced obtained via TruncatedSVD not only saves significant storage space but also makes similarity calculations more efficient.


In [5]:
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
import joblib
from pathlib import Path


# 1) Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()
TFIDF_MATRIX_PATH = ROOT_DIR.joinpath("data", "tf_idf", "tfidf_matrix.npz")
# NOTE(review): outputs are written to data/lsa_test, while the K-Means cells
# in this notebook read data/lsa/lsa_reduced.npz — confirm which is intended.
LSA_OUTPUT_PATH = ROOT_DIR.joinpath("data", "lsa_test", "lsa_reduced.npz")
LSA_MODEL_PATH = ROOT_DIR.joinpath("data", "lsa_test", "lsa_model.joblib")
# Make sure the output folders exist; exist_ok makes the repeated call harmless
LSA_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
LSA_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Number of latent dimensions kept by LSA (adjustable, e.g. 100 or 200)
N_COMPONENTS = 1000

def main():
    """Reduce the TF-IDF matrix with TruncatedSVD (LSA) and save the results.

    Loads the sparse matrix from TFIDF_MATRIX_PATH, projects it onto
    N_COMPONENTS latent dimensions, prints the summed explained-variance
    ratio, then stores the reduced matrix (as float16, compressed) at
    LSA_OUTPUT_PATH and the fitted SVD object at LSA_MODEL_PATH.
    """
    # 1) Input: sparse TF-IDF matrix
    tfidf = sparse.load_npz(TFIDF_MATRIX_PATH)
    print(f"[i] TF-IDF matrix loaded: shape={tfidf.shape}, nnz={tfidf.nnz}")

    # 2) LSA dimensionality reduction
    svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=42)
    reduced = svd.fit_transform(tfidf)
    print(f"[i] LSA reduction done: new shape={reduced.shape}")

    # Report how much variance the kept components explain, when available
    ratios = getattr(svd, 'explained_variance_ratio_', None)
    if ratios is not None:
        variance_ratio_sum = ratios.sum()
        print(f"[i] Explained variance by top {N_COMPONENTS} components: {variance_ratio_sum:.2%}")

    # 3) Persist. float16 storage trades numeric precision for a smaller file.
    reduced_f16 = reduced.astype(np.float16)
    np.savez_compressed(LSA_OUTPUT_PATH, X_reduced=reduced_f16)
    print(f"[i] LSA reduced matrix saved to {LSA_OUTPUT_PATH} (dtype={reduced_f16.dtype})")
    joblib.dump(svd, LSA_MODEL_PATH)
    print(f"[i] LSA model (SVD object) saved to {LSA_MODEL_PATH}")

if __name__ == "__main__":
    main()

[i] TF-IDF matrix loaded: shape=(732367, 100000), nnz=72361100
[i] LSA reduction done: new shape=(732367, 1000)
[i] Explained variance by top 1000 components: 25.83%
[i] LSA reduced matrix saved to /work3/s242644/ds/PaperTrail/data/lsa_test/lsa_reduced.npz (dtype=float16)
[i] LSA model (SVD object) saved to /work3/s242644/ds/PaperTrail/data/lsa_test/lsa_model.joblib


In [None]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from pathlib import Path
# Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()
LSA_INPUT_PATH = ROOT_DIR.joinpath("data", "lsa", "lsa_reduced.npz")
CLUSTER_LABELS_PATH = ROOT_DIR.joinpath("data", "lsa", "cluster_labels.npy")

# Number of clusters requested from MiniBatchKMeans
K_FIXED = 40

def main():
    """Cluster the LSA-reduced document matrix with MiniBatchKMeans.

    Loads ``X_reduced`` from LSA_INPUT_PATH, fits K_FIXED clusters, prints the
    inertia and a silhouette estimate, saves the labels to CLUSTER_LABELS_PATH
    and prints the size of every cluster.
    """
    # The silhouette coefficient is built from pairwise distances, so scoring
    # every document is quadratic in the corpus size. Estimate it on a
    # fixed-seed random subsample instead.
    silhouette_sample_size = 10_000

    # 1) Load LSA reduced matrix; the context manager closes the .npz archive
    with np.load(LSA_INPUT_PATH) as data:
        X = data['X_reduced']
    print(f"[i] Loaded LSA matrix: shape={X.shape}")

    # 2) Fit the clustering model and assign every document to a cluster
    print(f"[i] Running MiniBatchKMeans with fixed K={K_FIXED}")
    kmeans = MiniBatchKMeans(n_clusters=K_FIXED, init='k-means++', n_init=10, random_state=42)
    final_labels = kmeans.fit_predict(X)

    # 3) Quality metrics
    inertia = kmeans.inertia_
    sample_size = min(silhouette_sample_size, X.shape[0])
    silhouette = silhouette_score(X, final_labels, sample_size=sample_size, random_state=42)
    print(f"[i] Inertia={inertia:.2f}, Silhouette={silhouette:.4f}")
    print(f"[i] Silhouette estimated on a random sample of {sample_size:,} documents")

    np.save(CLUSTER_LABELS_PATH, final_labels)
    print(f"[i] Cluster labels saved to: {CLUSTER_LABELS_PATH}")

    # 4) Output final clustering results: number of documents per cluster
    unique, counts = np.unique(final_labels, return_counts=True)
    print("Final clustering results:")
    for cid, count in zip(unique, counts):
        print(f" - Cluster {cid}: {count} papers")

if __name__ == "__main__":
    main()


## kmeans
Due to the large dataset size, we employed the MiniBatch K-Means algorithm for rapid iterative clustering. This algorithm maintains clustering performance comparable to standard K-Means while significantly reducing memory consumption and computational time. We partitioned the documents into 40 thematic clusters and projected the cluster centers back into the lexical space.

In [1]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from pathlib import Path
# Config
# A notebook has no __file__: reuse ROOT_DIR when an earlier cell defined it,
# otherwise fall back to the parent of the working directory.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = Path('..').resolve()
LSA_INPUT_PATH = ROOT_DIR.joinpath("data", "lsa", "lsa_reduced.npz")
CLUSTER_LABELS_PATH = ROOT_DIR.joinpath("data", "lsa", "cluster_labels.npy")

# Number of clusters requested from MiniBatchKMeans
K_FIXED = 40

def main():
    """Cluster the LSA-reduced document matrix and save the cluster labels.

    Loads ``X_reduced`` from LSA_INPUT_PATH, fits MiniBatchKMeans with K_FIXED
    clusters and writes one label per document to CLUSTER_LABELS_PATH.
    """
    # 1) Load LSA reduced matrix; the context manager closes the .npz archive
    with np.load(LSA_INPUT_PATH) as data:
        X = data['X_reduced']
    print(f"[i] Loaded LSA matrix: shape={X.shape}")

    # 2) Fit the clustering model and assign every document to a cluster
    print(f"[i] Running MiniBatchKMeans with fixed K={K_FIXED}")
    kmeans = MiniBatchKMeans(n_clusters=K_FIXED, init='k-means++', n_init=10, random_state=42)
    final_labels = kmeans.fit_predict(X)

    # 3) Persist the labels
    np.save(CLUSTER_LABELS_PATH, final_labels)
    print(f"[i] Cluster labels saved to: {CLUSTER_LABELS_PATH}")


if __name__ == "__main__":
    main()

[i] Loaded LSA matrix: shape=(732367, 1000)
[i] Running MiniBatchKMeans with fixed K=40
[i] Cluster labels saved to: /work3/s242644/ds/PaperTrail/data/lsa/cluster_labels.npy


# HDBSCAN Clustering

**Because the original dataset contains more than 700k records, running it directly in Jupyter exhausts the kernel's memory. We therefore provide a TINY version with 10,000 records. For the full dataset, please run src/lsa_and_clustering/sbert_hdbscan_cluster_lite.py.**




1. Semantic Vectorization & Manifold Reduction
This module employs a pre-trained Sentence Transformer model to encode text into semantic vectors. The selection of this model primarily balances computational efficiency with the ability to capture semantic context. The vectors are subsequently L2-normalized and projected into a low-dimensional space using the manifold learning algorithm UMAP (Uniform Manifold Approximation and Projection), constructing a compact manifold space for subsequent density-based clustering.

2. Density-Based Clustering & Topic Mining
The core clustering engine employs the HDBSCAN algorithm. HDBSCAN can adaptively identify clusters with uneven density in high-dimensional semantic spaces based on hierarchical structures, without requiring pre-set sensitive global distance thresholds. This enables greater robustness during automated parameter search. The system implements a sampled grid search mechanism (Sampled Parameter Search). Optimal parameter combinations are selected based on a composite evaluation metric Sopt, defined as a weighted linear combination of the silhouette coefficient and coverage. This automatically screens for parameter sets that balance intra-cluster compactness and sample coverage, while persisting search logs to CSV files for auditing.


To generate semantically expressive cluster labels, we employ the Apriori association rule mining algorithm to identify frequent itemsets within each cluster as composite keyword tags (e.g., “neural + network”). By constructing detailed thematic labels from high-frequency co-occurring phrases, multi-word phrases, and term co-occurrence relationships, we enhance the interpretability of the recommendation system.


In [11]:
import json
import joblib
import re
import math
from itertools import combinations
from collections import Counter, defaultdict
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import hdbscan
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
import csv
import datetime
from typing import Dict, Any, Optional, List, Tuple
import warnings
warnings.filterwarnings(
    "ignore",
    message="'force_all_finite' was renamed to 'ensure_all_finite'"
)

class CsvLogger:
    """Tiny CSV log writer: header at construction, one appended row per call.

    The file is reopened for every write, so no handle stays open between
    calls.
    """

    def __init__(self, filepath: Path, fieldnames: list):
        self.filepath = filepath
        self.fieldnames = fieldnames
        # Mode "w" truncates anything already stored at ``filepath``.
        with self._open("w") as handle:
            self._writer(handle).writeheader()

    def _open(self, mode: str):
        """Open the log file in *mode* with the settings the csv module needs."""
        return self.filepath.open(mode, newline="", encoding="utf-8")

    def _writer(self, handle) -> csv.DictWriter:
        """Build a DictWriter bound to this logger's column list."""
        return csv.DictWriter(handle, fieldnames=self.fieldnames)

    def log(self, data: Dict[str, Any]):
        """Append one row; unknown keys are dropped, missing columns stay empty."""
        row = {}
        for column in self.fieldnames:
            row[column] = data.get(column)
        with self._open("a") as handle:
            self._writer(handle).writerow(row)




# Paths
# ROOT_DIR = Path(__file__).resolve().parents[2]  # Modified for notebook
# REPO_ROOT has to be bound on every path through this guard: when an earlier
# cell already defined ROOT_DIR, reuse it (unless REPO_ROOT itself is already
# set) instead of leaving REPO_ROOT undefined for the lines below.
if "ROOT_DIR" not in globals():
    REPO_ROOT = Path("..").resolve()
elif "REPO_ROOT" not in globals():
    REPO_ROOT = Path(ROOT_DIR)
INPUT_JSONL = REPO_ROOT / "data" / "preprocess" / "arxiv-cs-data-with-citations-final-dataset_preprocessed_head1w.json"

# Every artifact written by this cell lives under OUT_DIR (created on demand).
OUT_DIR = REPO_ROOT / "data" / "sbert_hdbscan_1w"
OUT_DIR.mkdir(parents=True, exist_ok=True)

EMBEDDINGS_PATH = OUT_DIR / "sbert_embeddings.npy"
EMBEDDINGS_NORM_PATH = OUT_DIR / "sbert_embeddings_norm.npy"
CLUSTER_LABELS_PATH = OUT_DIR / "hdbscan_labels.npy"
DOC_IDS_PATH = OUT_DIR / "doc_ids.npy"
DOC_TITLES_PATH = OUT_DIR / "doc_titles.npy"
CLUSTER_TOP_TERMS_PATH = OUT_DIR / "cluster_top_terms.json"

# Model and filters
SBERT_MODEL_NAME = "all-MiniLM-L6-v2"
# Records whose processed content is shorter than this many characters are skipped.
MIN_TEXT_CHARS = 30
CUSTOM_STOPWORDS_PATH = REPO_ROOT / "src" / "custom_stopwords.txt"
# Apriori labelling: minimum support as a fraction of a cluster's documents,
# largest itemset size, and how many itemsets / fallback terms to keep.
APRIORI_MIN_SUPPORT_RATIO = 0.2
APRIORI_MAX_SIZE = 3
APRIORI_TOP_K = 5
APRIORI_FALLBACK_TOP_K = 5

# UMAP pre-reduction (fixed small config)
UMAP_ENABLED = True
UMAP_DIM = 50
UMAP_METRIC = "cosine"

# Sampled parameter search (broader and smarter)
# With PARAM_SEARCH off, the search returns None and main() uses its built-in
# default HDBSCAN parameters.
PARAM_SEARCH = False
PARAM_SAMPLE_SIZE = 10000
# Baseline ranges (dynamic ranges are built inside search as well)
MIN_CLUSTER_SIZE_RANGE = [5, 10, 20, 30, 50, 100]
MIN_SAMPLES_RANGE = [1, 2, 5, 10, 15]
METHODS = ["eom", "leaf"]
EPS_RANGE = [0.0, 0.02, 0.05, 0.1, 0.2, 0.3]
# NOTE(review): not referenced anywhere in this cell - confirm whether it is still needed.
TARGET_SILHOUETTE = 1


# A token is a maximal run of lowercase ASCII letters and digits.
TOKEN_PATTERN = re.compile(r"[a-z0-9]+")
# Filled on the first call to load_custom_stopwords(); None means "not loaded yet".
_CUSTOM_STOP_WORDS: Optional[set[str]] = None


def load_custom_stopwords() -> set[str]:
    """Return the custom stop-word set, reading the file only on the first call.

    Each line of ``CUSTOM_STOPWORDS_PATH`` is stripped and lowercased; blank
    lines and lines starting with ``#`` are ignored. A missing file yields an
    empty set. Whatever the first call produces is cached and returned by
    every later call.
    """
    global _CUSTOM_STOP_WORDS
    if _CUSTOM_STOP_WORDS is None:
        try:
            raw_lines = CUSTOM_STOPWORDS_PATH.read_text(encoding="utf-8").splitlines()
        except FileNotFoundError:
            raw_lines = []
        cleaned = (entry.strip().lower() for entry in raw_lines)
        _CUSTOM_STOP_WORDS = {word for word in cleaned if word and not word.startswith("#")}
    return _CUSTOM_STOP_WORDS


def tokenize_without_stopwords(text: str) -> Tuple[str, List[str]]:
    """Lowercase *text* and keep the tokens that survive stop-word filtering.

    Tokens are the matches of ``TOKEN_PATTERN``; a token is dropped when it is
    a custom stop word or is a single character long. A falsy *text* is
    treated as the empty string.

    Returns:
        ``(joined, tokens)`` - the kept tokens joined by single spaces, and
        the same tokens as a list.
    """
    lowered = text.lower() if text else ""
    stop_words = load_custom_stopwords()
    kept: List[str] = []
    for token in TOKEN_PATTERN.findall(lowered):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return " ".join(kept), kept


def apriori_frequent_itemsets(
    transactions: List[List[str]],
    min_support_ratio: float,
    max_size: int,
    top_k: int,
) -> List[Tuple[Tuple[str, ...], int]]:
    """Mine frequent itemsets from *transactions* with the Apriori algorithm.

    Args:
        transactions: One list of items per transaction. Repeated items inside
            a transaction count once; empty transactions are ignored.
        min_support_ratio: Fraction of the non-empty transactions an itemset
            must appear in. The absolute threshold is never lower than 2.
        max_size: Largest itemset size to generate.
        top_k: Number of itemsets to return.

    Returns:
        Up to ``top_k`` ``(itemset, support)`` pairs, where each itemset is a
        sorted tuple. Pairs are ordered by support (descending), then itemset
        size (descending), then lexicographically.
    """
    baskets = [frozenset(txn) for txn in transactions if txn]
    if not baskets:
        return []
    min_support = max(2, int(math.ceil(min_support_ratio * len(baskets))))

    # Level 1: single items, keyed as 1-tuples so every level shares one key type.
    single_counts = Counter((item,) for basket in baskets for item in basket)
    frequent = {key: cnt for key, cnt in single_counts.items() if cnt >= min_support}
    if not frequent:
        return []

    all_freq: List[Tuple[Tuple[str, ...], int]] = list(frequent.items())
    size = 1

    while size < max_size:
        # Join step: merge pairs of frequent ``size``-itemsets that differ in one item.
        candidates: set[Tuple[str, ...]] = set()
        for left, right in combinations([frozenset(key) for key in frequent], 2):
            merged = left | right
            if len(merged) != size + 1:
                continue
            candidate = tuple(sorted(merged))
            # Prune step (Apriori property): every ``size``-subset must be frequent.
            # Subsets of a sorted tuple come out sorted, so they match the keys as-is.
            if all(sub in frequent for sub in combinations(candidate, size)):
                candidates.add(candidate)
        if not candidates:
            break

        # Build each candidate's set once instead of once per transaction.
        candidate_sets = [(cand, frozenset(cand)) for cand in candidates]
        counts = Counter()
        for basket in baskets:
            if len(basket) <= size:
                continue  # too small to contain any (size + 1)-itemset
            for cand, cand_set in candidate_sets:
                if cand_set <= basket:
                    counts[cand] += 1

        frequent = {cand: cnt for cand, cnt in counts.items() if cnt >= min_support}
        if not frequent:
            break
        size += 1
        all_freq.extend(frequent.items())

    # sort by (support desc, length desc, lexicographic)
    all_freq.sort(key=lambda item: (-item[1], -len(item[0]), item[0]))
    return all_freq[:top_k]


def read_texts_ids_titles(jsonl_path: Path):
    """Read a JSON-lines file and return the usable documents.

    A line is skipped when it is blank, is not valid JSON, does not decode to
    a JSON object, has a ``processed_content`` shorter than ``MIN_TEXT_CHARS``
    characters, or yields no tokens after stop-word filtering.

    Args:
        jsonl_path: File with one JSON object per line; the fields read are
            ``processed_content``, ``title`` and ``id``.

    Returns:
        ``(texts, ids, titles, token_lists)``. ``texts`` holds the filtered
        title tokens followed by the filtered content tokens, joined by spaces;
        ``ids`` and ``titles`` are NumPy arrays (missing values become ``""``);
        ``token_lists`` holds the same tokens as lists. All four are aligned.
    """
    texts, ids, titles, token_lists = [], [], [], []
    with jsonl_path.open("r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError: malformed line, skip it.
                continue
            if not isinstance(rec, dict):
                # Valid JSON but not an object (e.g. a bare list): nothing to read.
                continue
            raw_text = rec.get("processed_content") or ""
            # Apply the cheap length filter before spending time on tokenizing.
            if len(raw_text) < MIN_TEXT_CHARS:
                continue
            title = rec.get("title") or ""
            _, tokens_title = tokenize_without_stopwords(title)
            _, tokens_text = tokenize_without_stopwords(raw_text)
            merged_tokens = tokens_title + tokens_text
            if not merged_tokens:
                continue
            texts.append(" ".join(merged_tokens))
            ids.append(rec.get("id") or "")
            titles.append(title)
            token_lists.append(merged_tokens)
    return texts, np.array(ids), np.array(titles), token_lists


def compute_cluster_top_terms(labels: np.ndarray, token_lists: List[List[str]], top_n: int = 10) -> Dict[int, List[str]]:
    """Build a short list of descriptive tags for every cluster.

    Documents with a ``None`` or negative label, or with no tokens, are left
    out. For each remaining cluster the tags are its Apriori frequent itemsets
    (items joined by ``" + "``); if none are found, the cluster's most common
    individual terms are used instead.

    Args:
        labels: One cluster label per document.
        token_lists: One token list per document, aligned with ``labels``.
        top_n: Accepted but not consulted by this function; the tag counts
            come from ``APRIORI_TOP_K`` and ``APRIORI_FALLBACK_TOP_K``.

    Returns:
        Mapping from cluster id to its list of tags.
    """
    docs_by_cluster: Dict[int, List[List[str]]] = defaultdict(list)
    for raw_label, doc_tokens in zip(labels, token_lists):
        if raw_label is None:
            continue
        cluster_id = int(raw_label)
        if cluster_id >= 0 and doc_tokens:
            docs_by_cluster[cluster_id].append(doc_tokens)

    topics: Dict[int, List[str]] = {}
    for cluster_id, docs in docs_by_cluster.items():
        mined = apriori_frequent_itemsets(
            docs,
            min_support_ratio=APRIORI_MIN_SUPPORT_RATIO,
            max_size=APRIORI_MAX_SIZE,
            top_k=APRIORI_TOP_K,
        )
        if mined:
            topics[cluster_id] = [" + ".join(itemset) for itemset, _support in mined]
            continue
        # Nothing frequent enough: fall back to raw term frequencies, counted
        # in document order so ties resolve by first appearance.
        term_counts = Counter(token for doc in docs for token in doc)
        topics[cluster_id] = [term for term, _ in term_counts.most_common(APRIORI_FALLBACK_TOP_K)]
    return topics


def build_or_load_embeddings(texts):
    """Return L2-normalized SBERT embeddings for *texts*, using a disk cache.

    The raw embeddings are loaded from ``EMBEDDINGS_PATH`` when that file
    exists and has one row per text. Otherwise they are computed with
    ``SBERT_MODEL_NAME`` and saved there. The normalized matrix is also written
    to ``EMBEDDINGS_NORM_PATH`` on a best-effort basis.

    Args:
        texts: Documents to encode, in the order the rows should appear.

    Returns:
        Array with one row per text; every row is scaled to unit L2 norm.
    """
    X = None
    if EMBEDDINGS_PATH.exists():
        print(f"[i] Loading cached embeddings: {EMBEDDINGS_PATH}")
        X = np.load(EMBEDDINGS_PATH)
        if X.shape[0] != len(texts):
            # A cache built from a different input would misalign embeddings
            # with the ids/titles/tokens read in this run, so discard it.
            print(
                f"[warn] Cached embeddings have {X.shape[0]} rows but {len(texts)} documents were loaded; re-encoding")
            X = None

    if X is None:
        print(f"[i] Loading SBERT model: {SBERT_MODEL_NAME}")
        model = SentenceTransformer(SBERT_MODEL_NAME)
        print(f"[i] Encoding {len(texts)} documents...")
        X = model.encode(texts, convert_to_numpy=True, show_progress_bar=True, batch_size=32)
        np.save(EMBEDDINGS_PATH, X)
        print(f"[i] Saved embeddings: {EMBEDDINGS_PATH}")

    X = normalize(X, norm="l2", axis=1)
    try:
        np.save(EMBEDDINGS_NORM_PATH, X)
    except Exception as exc:
        # Still best-effort, but report the failure instead of hiding it.
        print(f"[warn] Failed to save normalized embeddings: {exc}")
    return X


def maybe_umap(X: np.ndarray) -> np.ndarray:
    """Reduce *X* with UMAP when enabled and importable; otherwise return it as-is.

    The fitted reducer is stored as ``umap_reducer.joblib`` in ``OUT_DIR`` and
    the reduced rows are L2-normalized before being returned.
    """
    if not UMAP_ENABLED:
        return X

    try:
        import umap
    except Exception:
        print("[warn] UMAP not installed, skipping pre-reduction")
        return X

    print("[i] UMAP pre-reduction...")
    model = umap.UMAP(n_components=UMAP_DIM, metric=UMAP_METRIC, random_state=42)
    reduced = model.fit_transform(X)

    # Keep the fitted model on disk next to the other artifacts.
    reducer_path = OUT_DIR / "umap_reducer.joblib"
    joblib.dump(model, reducer_path)
    print(f"[i] Saved UMAP reducer model to: {reducer_path}")

    reduced = normalize(reduced, norm="l2", axis=1)
    print(f"[i] UMAP shape: {reduced.shape}")
    return reduced


def run_hdbscan(X: np.ndarray, min_cluster_size: int, min_samples: int | None, method: str, eps: float):
    """Fit HDBSCAN (euclidean metric, prediction data enabled) on *X*.

    ``min_samples`` is passed on only when it is not ``None``, and
    ``cluster_selection_epsilon`` only when *eps* is positive; otherwise the
    library defaults apply.

    Returns:
        ``(clusterer, labels)`` - the fitted model and the label of each row.
    """
    options = {
        "min_cluster_size": min_cluster_size,
        "metric": "euclidean",
        "cluster_selection_method": method,
        "prediction_data": True,
    }
    if min_samples is not None:
        options["min_samples"] = min_samples
    if eps and eps > 0:
        options["cluster_selection_epsilon"] = eps

    model = hdbscan.HDBSCAN(**options)
    assigned = model.fit_predict(X)
    return model, assigned


def quick_metrics(X: np.ndarray, labels: np.ndarray):
    """Summarize a clustering in which label ``-1`` marks noise.

    Returns a dict with ``n`` (points), ``noise``, ``clustered``, ``k``
    (distinct non-noise labels), ``silhouette`` (euclidean, over non-noise
    points; ``None`` unless there are at least two clusters), ``noise_rate``
    and ``coverage``. For empty input the rates are ``1.0`` and ``0.0``.
    """
    total = len(labels)
    noise_count = int((labels == -1).sum())
    clustered_count = total - noise_count

    member_mask = labels != -1
    member_labels = labels[member_mask]
    cluster_count = len(np.unique(member_labels))

    silhouette = None
    if clustered_count > 0 and cluster_count > 1:
        silhouette = float(silhouette_score(X[member_mask], member_labels, metric="euclidean"))

    if total:
        noise_rate = noise_count / total
        coverage = clustered_count / total
    else:
        noise_rate = 1.0
        coverage = 0.0

    return {
        "n": total,
        "noise": noise_count,
        "clustered": clustered_count,
        "k": cluster_count,
        "silhouette": silhouette,
        "noise_rate": noise_rate,
        "coverage": coverage,
    }


def sampled_param_search(X: np.ndarray, csv_logger: Optional[CsvLogger] = None):
    """Grid-search HDBSCAN settings on (a sample of) *X*; return the best found.

    Returns ``None`` at once when ``PARAM_SEARCH`` is off. Otherwise every
    combination of min_cluster_size, min_samples, selection method and epsilon
    is clustered and scored as
    ``0.7 * silhouette + 0.3 * coverage + 0.1 * min(k / 50, 1)``, with a
    missing silhouette counted as ``-1``. Combinations that raise are reported
    and skipped. The search stops early once a run reaches silhouette >= 0.8,
    coverage >= 0.9 and at most 50 clusters.

    Args:
        X: Points to cluster; a fixed-seed random sample of
            ``PARAM_SAMPLE_SIZE`` rows is used when *X* has more rows than that.
        csv_logger: If given, receives one row per evaluated combination.

    Returns:
        Dict with the winning parameters plus its ``metrics``, or ``None``.
    """
    if not PARAM_SEARCH:
        return None

    sample = X
    if PARAM_SAMPLE_SIZE and X.shape[0] > PARAM_SAMPLE_SIZE:
        picked = np.random.default_rng(42).choice(X.shape[0], size=PARAM_SAMPLE_SIZE, replace=False)
        sample = X[picked]
        print(f"[i] Param search on sample: {sample.shape[0]} / {X.shape[0]}")

    # Extend the baseline cluster sizes with a few that scale with the sample.
    rows = sample.shape[0]
    size_grid = sorted(
        set(MIN_CLUSTER_SIZE_RANGE)
        | {max(5, rows // 2000), max(10, rows // 1000), max(20, rows // 500)}
    )

    def composite(stats):
        silhouette = -1.0 if stats["silhouette"] is None else stats["silhouette"]
        cluster_bonus = min(stats["k"] / 50.0, 1.0)
        return 0.7 * silhouette + 0.3 * stats["coverage"] + 0.1 * cluster_bonus

    # Same visiting order as four nested loops, flattened into one stream.
    grid = (
        (mcs, ms, method, eps)
        for mcs in size_grid
        for ms in MIN_SAMPLES_RANGE
        for method in METHODS
        for eps in EPS_RANGE
    )

    distance = "euclidean"
    winner = None
    winner_score = -1e9

    for mcs, ms, method, eps in grid:
        try:
            _, found = run_hdbscan(sample, mcs, ms, method, eps)
            stats = quick_metrics(sample, found)
            value = composite(stats)

            if csv_logger:
                csv_logger.log({
                    "score": value,
                    "silhouette": stats["silhouette"],
                    "num_clusters": stats["k"],
                    "clustered_percentage": stats["coverage"],
                    "noise_rate": stats["noise_rate"],
                    "min_cluster_size": mcs,
                    "min_samples": ms,
                    "method": method,
                    "eps": eps,
                    "metric": distance,
                })

            if value > winner_score:
                winner_score = value
                winner = {
                    "min_cluster_size": mcs,
                    "min_samples": ms,
                    "method": method,
                    "eps": eps,
                    "metric": distance,
                    "metrics": stats,
                }
                sil_txt = f"{stats['silhouette']:.4f}" if stats["silhouette"] is not None else "NA"
                print(
                    f"  best so far: score={winner_score:.4f} sil={sil_txt} k={stats['k']} coverage={stats['coverage']:.2f} noise_rate={stats['noise_rate']:.2f} params={{'min_cluster_size':{mcs},'min_samples':{ms},'method':'{method}','eps':{eps},'metric':'{distance}'}}")

            # Good enough: stop searching and hand back the best seen so far.
            good_enough = (
                stats["silhouette"] is not None
                and stats["silhouette"] >= 0.8
                and stats["coverage"] >= 0.9
                and stats["k"] <= 50
            )
            if good_enough:
                return winner
        except Exception as err:
            print(f"  skip error: {err}")
    return winner


def main():
    """Run the whole pipeline: load, embed, reduce, cluster, label, save.

    Steps: read the documents, build or load the SBERT embeddings, apply the
    optional UMAP reduction, run the optional parameter search (falling back
    to fixed defaults), cluster all points with HDBSCAN, then write the model,
    metrics, labels, ids, titles and per-cluster tags under ``OUT_DIR``.
    Failures while writing the JSON summaries are reported as warnings only.
    """

    def write_json(path, payload):
        with path.open("w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    banner = "=" * 60
    print(banner)
    print("SBERT + HDBSCAN (lite)")
    print(banner)

    print("\n[1] Loading texts...")
    texts, ids, titles, token_lists = read_texts_ids_titles(INPUT_JSONL)
    print(f"[i] {len(texts)} documents after filtering")

    print("\n[2] Building/loading embeddings...")
    embeddings = build_or_load_embeddings(texts)

    print("\n[3] UMAP (optional)...")
    reduced = maybe_umap(embeddings)

    print("\n[4] Parameter search (sampled)...")
    search_log = None
    if PARAM_SEARCH:
        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        log_path = OUT_DIR / f"hdbscan_search_log_{stamp}.csv"
        print(f"[i] Logging parameter search to: {log_path}")
        columns = [
            "score", "silhouette", "num_clusters", "clustered_percentage", "noise_rate",
            "min_cluster_size", "min_samples", "method", "eps", "metric"
        ]
        search_log = CsvLogger(log_path, columns)

    params = sampled_param_search(reduced, search_log)
    if params is None:
        # No search result: use the fixed default configuration.
        params = {
            "min_cluster_size": 5,
            "min_samples": 20,
            "method": "eom",
            "eps": 0.0,
            "metric": "euclidean",
        }

    try:
        settings_only = {name: value for name, value in params.items() if name != "metrics"}
        write_json(
            OUT_DIR / "hdbscan_params.json",
            {"params": settings_only, "metrics": params.get("metrics")},
        )
    except Exception as e:
        print(f"[warn] Failed to save params json: {e}")

    final_metric = params.get("metric", "euclidean")
    print(
        f"[i] Params used: {{'min_cluster_size': {params['min_cluster_size']}, 'min_samples': {params['min_samples']}, 'method': '{params['method']}', 'eps': {params['eps']}, 'metric': '{final_metric}'}}")

    print("\n[5] Clustering on full data...")
    clusterer, labels = run_hdbscan(
        reduced,
        params["min_cluster_size"],
        params["min_samples"],
        params["method"],
        params["eps"],
    )
    model_path = OUT_DIR / "hdbscan_clusterer.joblib"
    joblib.dump(clusterer, model_path)
    print(f"[i] Saved HDBSCAN clusterer model to: {model_path}")
    summary = quick_metrics(reduced, labels)

    sil_txt = f"{summary['silhouette']:.4f}" if summary["silhouette"] is not None else "NA"
    print(f"silhouette={sil_txt}  k={summary['k']}  coverage={summary['coverage']:.2f}  noise_rate={summary['noise_rate']:.2f}")

    # Save metrics summary
    try:
        write_json(OUT_DIR / "hdbscan_metrics.json", summary)
    except Exception as e:
        print(f"[warn] Failed to save metrics json: {e}")

    print("\n[6] Saving artifacts...")
    np.save(CLUSTER_LABELS_PATH, labels)
    np.save(DOC_IDS_PATH, ids)
    np.save(DOC_TITLES_PATH, titles)

    cluster_topics = compute_cluster_top_terms(labels, token_lists)
    try:
        # JSON object keys must be strings, so cluster ids are stringified.
        write_json(CLUSTER_TOP_TERMS_PATH, {str(cid): tags for cid, tags in cluster_topics.items()})
        print(f"[i] saved: {CLUSTER_TOP_TERMS_PATH}")
    except Exception as exc:
        print(f"[warn] failed to save cluster top terms: {exc}")

    print(f"[i] saved: {CLUSTER_LABELS_PATH}\n[i] saved: {DOC_IDS_PATH}\n[i] saved: {DOC_TITLES_PATH}")

    print("\nDone.")


if __name__ == "__main__":
    main()

SBERT + HDBSCAN (lite)

[1] Loading texts...


Reading JSONL: 0it [00:00, ?it/s]

Reading JSONL: 10000it [00:01, 7283.87it/s]
  warn(


[i] 10000 documents after filtering

[2] Building/loading embeddings...
[i] Loading cached embeddings: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/sbert_embeddings.npy

[3] UMAP (optional)...
[i] UMAP pre-reduction...
[i] Saved UMAP reducer model to: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/umap_reducer.joblib
[i] UMAP shape: (10000, 50)

[4] Parameter search (sampled)...
[i] Params used: {'min_cluster_size': 5, 'min_samples': 20, 'method': 'eom', 'eps': 0.0, 'metric': 'euclidean'}

[5] Clustering on full data...
[i] Saved HDBSCAN clusterer model to: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/hdbscan_clusterer.joblib
silhouette=0.5725  k=76  coverage=0.52  noise_rate=0.48

[6] Saving artifacts...
[i] saved: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/cluster_top_terms.json
[i] saved: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/hdbscan_labels.npy
[i] saved: /work3/s242644/ds/PaperTrail/data/sbert_hdbscan_1w/doc_ids.npy
[i] saved: /work3/s242644/ds/