In [1]:
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Load the training split of the CoNLL-2003 dataset.
# NOTE(review): the recorded run of this cell warned that `trust_remote_code=True`
# will become mandatory for this dataset in the next major `datasets` release —
# check the installed version before re-running.
dataset = load_dataset("conll2003")
train_dataset = dataset['train']

# Load the Sentence-BERT model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Number of strings sent to the model per forward pass; purely a throughput knob.
ENCODE_BATCH_SIZE = 256

# Collect the token lists once, then flatten them into a single list of strings.
# Every token is passed to the model as its own input string, so encoding the
# flattened list in one batched call produces the same rows, in the same order,
# as encoding sentence by sentence and stacking the results — without paying for
# one tiny `encode` call per sentence.
sentences = [list(tokens) for tokens in train_dataset['tokens']]
all_tokens = [token for tokens in sentences for token in tokens]

# 2-D array with one row per token across the whole training split.
token_embeddings = np.asarray(
    model.encode(
        all_tokens,
        batch_size=ENCODE_BATCH_SIZE,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
)

# Sanity check: exactly one embedding row per input token.
assert token_embeddings.shape[0] == len(all_tokens), "embedding/token count mismatch"

# Print the tokens of the first sentence next to their embeddings.
# The first sentence's tokens occupy the first len(first_tokens) rows.
first_tokens = sentences[0]
for token, embedding in zip(first_tokens, token_embeddings[:len(first_tokens)]):
    print(f"Token: {token}, Embedding: {embedding[:5]}...")  # first 5 values only, to keep output short

# Report the shape and type of the stacked embedding matrix.
print(f"Shape of token_embeddings after conversion: {token_embeddings.shape}")

print(f"Type of token_embeddings: {type(token_embeddings)}")
print(f"Shape of token_embeddings: {token_embeddings.shape if isinstance(token_embeddings, np.ndarray) else 'Not a numpy array'}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Token: EU, Embedding: [ 0.0328505   0.04594225  0.00482348 -0.0299531  -0.0306954 ]...
Token: rejects, Embedding: [-0.03228948  0.06037299  0.05513505  0.06366471  0.03532343]...
Token: German, Embedding: [-0.01822809  0.03050454  0.00161921  0.05627387 -0.01692749]...
Token: call, Embedding: [-0.09879501  0.03357653 -0.04692755 -0.0002789  -0.07271501]...
Token: to, Embedding: [-0.02195787  0.042925   -0.0413069   0.08042946 -0.01573347]...
Token: boycott, Embedding: [ 0.01419001  0.07401178  0.06483291 -0.04102125  0.04765184]...
Token: British, Embedding: [ 0.0284378  -0.01627326 -0.01693945 -0.00372896 -0.01063101]...
Token: lamb, Embedding: [-0.08062177  0.01720387 -0.01789038  0.08488934 -0.05845075]...
Token: ., Embedding: [-0.13382298  0.01415094 -0.01621612 -0.02662739  0.06019066]...


In [9]:
# Cluster the token embeddings (the original note describes this as HDBSCAN).
# NOTE(review): `clusterer` is not defined in the cells visible here — it must
# already exist in the kernel before this cell runs.
cluster_labels = clusterer.fit_predict(token_embeddings)

# Summarise the clustering: label -1 is counted as noise rather than as a cluster.
distinct_labels = set(cluster_labels)
distinct_labels.discard(-1)
num_clusters = len(distinct_labels)
num_noise_points = sum(1 for label in cluster_labels if label == -1)

print(f"Number of clusters: {num_clusters}")
print(f"Number of noise points: {num_noise_points}")


Clustering embeddings:   0%|                                     | 0/203621 [2:25:23<?, ?it/s]


Number of clusters: 5309
Number of noise points: 32114
