In [10]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# 加载 CoNLL 2003 数据集的训练集
dataset = load_dataset("conll2003")
train_dataset = dataset['train']

# 加载 Sentence-BERT 模型
model = SentenceTransformer('all-MiniLM-L6-v2')

# 为每个句子的 tokens 生成 embeddings
token_embeddings_list = []

for sample in train_dataset:
    tokens = sample['tokens']
    
    # 生成该句子所有 tokens 的 embeddings
    embeddings = model.encode(tokens)
    
    # 保存嵌入
    token_embeddings_list.append(embeddings)

# 打印第一个句子的 tokens 和它们对应的 embeddings
for token, embedding in zip(train_dataset[0]['tokens'], token_embeddings_list[0]):
    print(f"Token: {token}, Embedding: {embedding[:5]}...")  # 打印前5个值，简化输出

# 检查 token_embeddings 的类型和形状
import numpy as np

# 将 token_embeddings 转换为 numpy 数组
# 注意：token_embeddings 是一个嵌套列表，需要将它展平成二维数组
token_embeddings_np = np.vstack(token_embeddings_list)

# 检查转换后的形状
print(f"Shape of token_embeddings after conversion: {token_embeddings_np.shape}")

print(f"Type of token_embeddings: {type(token_embeddings_np)}")
print(f"Shape of token_embeddings: {token_embeddings_np.shape if isinstance(token_embeddings_np, np.ndarray) else 'Not a numpy array'}")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


IndexError: Dimension specified as 0 but tensor has no dimensions

Token: EU, Embedding: [ 0.0328505   0.04594225  0.00482348 -0.0299531  -0.0306954 ]...
Token: rejects, Embedding: [-0.03228948  0.06037299  0.05513505  0.06366471  0.03532343]...
Token: German, Embedding: [-0.01822809  0.03050454  0.00161921  0.05627387 -0.01692749]...
Token: call, Embedding: [-0.09879501  0.03357653 -0.04692755 -0.0002789  -0.07271501]...
Token: to, Embedding: [-0.02195787  0.042925   -0.0413069   0.08042946 -0.01573347]...
Token: boycott, Embedding: [ 0.01419001  0.07401178  0.06483291 -0.04102125  0.04765184]...
Token: British, Embedding: [ 0.0284378  -0.01627326 -0.01693945 -0.00372896 -0.01063101]...
Token: lamb, Embedding: [-0.08062177  0.01720387 -0.01789038  0.08488934 -0.05845075]...
Token: ., Embedding: [-0.13382298  0.01415094 -0.01621612 -0.02662739  0.06019066]...
Shape of token_embeddings after conversion: (203621, 384)
Type of token_embeddings: <class 'numpy.ndarray'>
Shape of token_embeddings: (203621, 384)


In [29]:
token_embeddings_tensor = torch.from_numpy(token_embeddings_np).float().cuda()

def kmeans(X, num_clusters, num_iters=100, tol=1e-4):
    # 随机初始化簇中心 (K-means++)
    indices = torch.randperm(X.size(0))[:num_clusters]
    centroids = X[indices]
    
    prev_centroids = centroids.clone()

    for i in range(num_iters):
        # 计算每个点到所有簇中心的距离，并找到最近的簇中心
        distances = torch.cdist(X, centroids)
        labels = torch.argmin(distances, dim=1)

        # 更新簇中心
        new_centroids = torch.stack([X[labels == k].mean(dim=0) if (labels == k).sum() > 0 else centroids[k]
                                     for k in range(num_clusters)])

        # 检查中心的移动量，若变化小于容差，则停止
        centroid_shift = torch.norm(new_centroids - prev_centroids, dim=1).sum()
        if centroid_shift < tol:
            break

        centroids = new_centroids
        prev_centroids = centroids.clone()

    return labels, centroids

# 执行 K-means 聚类
num_clusters = 1000
cluster_labels, final_centroids = kmeans(token_embeddings_tensor, num_clusters=num_clusters)

# 将结果移动回 CPU
cluster_labels = cluster_labels.cpu().numpy()

# 打印聚类结果
print(f"Number of clusters: {len(set(cluster_labels))}")


Number of clusters: 792


In [30]:
from collections import defaultdict
print(len(cluster_labels))

# 统计每个簇中的 tokens
# clusters = defaultdict(list)
# for i, label in enumerate(cluster_labels):
#     print(i, label)


203621


In [31]:
import torch

# 假设 token_embeddings_tensor 是所有 token 的嵌入，cluster_labels 是每个 token 的聚类标签
# final_centroids 是每个簇的中心点

# 创建 token_id 到 (sentence_id, token_idx_in_sentence) 的映射
token_to_sentence_idx = []
for sentence_idx, sample in enumerate(train_dataset):
    num_tokens = len(sample['tokens'])
    for token_idx_in_sentence in range(num_tokens):
        token_to_sentence_idx.append((sentence_idx, token_idx_in_sentence))  # (sentence_id, token_idx_in_sentence)

token_to_sentence_idx = np.array(token_to_sentence_idx)

def get_top_tokens_near_centroids(X, labels, centroids, num_clusters, top_n=10):
    top_tokens_per_cluster = {}
    cluster_sizes = {}

    # 将 cluster_labels 转换为 Tensor
    labels_tensor = torch.from_numpy(labels).cuda()

    for cluster_idx in range(num_clusters):
        # 获取当前簇中的所有点（根据聚类标签）
        cluster_mask = labels_tensor == cluster_idx
        cluster_points = X[cluster_mask]

        # 计算当前簇中的所有点到簇中心的距离
        distances = torch.norm(cluster_points - centroids[cluster_idx], dim=1)

        # 获取距离最小的 top_n 个点的索引
        top_indices = torch.argsort(distances)[:top_n]

        # 获取对应的 token 列表索引，并将 numpy.int64 转换为原生 Python int
        cluster_token_indices = torch.where(cluster_mask)[0][top_indices].cpu().numpy()
        top_tokens_per_cluster[cluster_idx] = [int(idx) for idx in cluster_token_indices]

        # 计算当前簇的大小
        cluster_sizes[cluster_idx] = cluster_mask.sum().item()

    return top_tokens_per_cluster, cluster_sizes

# 获取每个簇中最接近簇中心的 10 个 token 的索引和每个簇的大小
top_tokens_per_cluster, cluster_sizes = get_top_tokens_near_centroids(
    token_embeddings_tensor, cluster_labels, final_centroids, num_clusters, top_n=10)

# 打印每个 cluster 的结果
for cluster_idx, token_indices in top_tokens_per_cluster.items():
    sentence_tokens = []
    for i in token_indices:
        sentence_idx, token_idx_in_sentence = token_to_sentence_idx[i]  # 获取 sentence_id 和 token_id_in_sentence
        token = train_dataset[int(sentence_idx)]['tokens'][int(token_idx_in_sentence)]  # 从句子中获取对应 token
        sentence_tokens.append(token)
    
    print(f"Cluster {cluster_idx} (Size: {cluster_sizes[cluster_idx]} tokens):")
    print(f"Top 10 closest tokens: {sentence_tokens}")
    print("-" * 40)


Cluster 0 (Size: 0 tokens):
Top 10 closest tokens: []
----------------------------------------
Cluster 1 (Size: 8390 tokens):
Top 10 closest tokens: ['the', 'the', 'the', 'the', 'the', 'the', 'The', 'the', 'the', 'the']
----------------------------------------
Cluster 2 (Size: 95 tokens):
Top 10 closest tokens: ['pound', 'pound', 'pound', 'pound', 'Pound', 'pound', 'pound', 'pound', 'pound', 'pound']
----------------------------------------
Cluster 3 (Size: 207 tokens):
Top 10 closest tokens: ['day', 'DAY', 'day', 'day', 'DAY', 'day', 'day', 'day', 'day', 'day']
----------------------------------------
Cluster 4 (Size: 110 tokens):
Top 10 closest tokens: ['Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian', 'Palestinian']
----------------------------------------
Cluster 5 (Size: 0 tokens):
Top 10 closest tokens: []
----------------------------------------
Cluster 6 (Size: 253 tokens):
Top 10 closest token