In [5]:
import subprocess
import os
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

# ========== 第一步：设置代理 ==========
os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"
os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890"
# ========== 第二步：读取词表 ==========
data_path = "coca60000_trans_byTX.csv"
print(f"Loading vocabulary from {data_path}...")
df = pd.read_csv(data_path, encoding='gbk')  # 列名：index, collocation, translate

# 数据清洗：确保collocation列中没有NaN值，并且所有值都是字符串
df = df.dropna(subset=['collocation'])  # 删除collocation为NaN的行
df['collocation'] = df['collocation'].astype(str)  # 确保所有值都是字符串

words = df['collocation'].tolist()

# ========== 第三步：使用 GPU 加速的 SentenceTransformer ==========
print("Loading SentenceTransformer model (GPU)...")
print("CUDA available:", torch.cuda.is_available())
# model = SentenceTransformer('all-MiniLM-L6-v2')
# 替换为更强大的英文语义模型
model = SentenceTransformer('all-mpnet-base-v2')  # 性能最佳的通用英文模型
# 其他高性能英文模型选项:
# model = SentenceTransformer('all-MiniLM-L12-v2')  # 比L6版本更大更准确
# model = SentenceTransformer('sentence-transformers/paraphrase-albert-small-v2')  # 小型但高效
# model = SentenceTransformer('sentence-transformers/all-distilroberta-v1')  # 平衡性能和效率
print("Using device:", model.device)

print("Encoding words into embeddings...")
embeddings = model.encode(words, show_progress_bar=True, batch_size=256, device=model.device)

# ========== 第四步：使用 cuML (GPU) 加速聚类 ==========
print("Clustering with cuML HDBSCAN (GPU)...")
import cupy as cp
from cuml.cluster import HDBSCAN as cuHDBSCAN

# 调整参数以确保至少有200个聚类，每个类别至少10个词
min_cluster_size = 10  # 每个聚类最少包含10个词
min_samples = 5  # 降低此值可以增加聚类数量
cluster_selection_epsilon = 0.1  # 增加此值可以合并更多聚类
cluster_selection_method = 'eom'  # 'eom'比'leaf'通常产生更多聚类

print(f"聚类参数: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
embeddings_gpu = cp.asarray(embeddings)

# 尝试不同参数直到获得至少200个聚类
max_attempts = 10
attempt = 0
num_clusters = 0

while num_clusters < 200 and attempt < max_attempts:
    clusterer = cuHDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method
    )
    labels = clusterer.fit_predict(embeddings_gpu)
    labels_cpu = cp.asnumpy(labels)
    
    # 计算聚类数量（不包括噪声点，标签为-1）
    unique_clusters = set(labels_cpu)
    if -1 in unique_clusters:
        unique_clusters.remove(-1)
    num_clusters = len(unique_clusters)
    
    print(f"尝试 {attempt+1}: 获得 {num_clusters} 个聚类")
    
    # 如果聚类数量不足，调整参数
    if num_clusters < 200:
        cluster_selection_epsilon -= 0.01  # 减小epsilon以获得更多聚类
        min_samples = max(1, min_samples - 1)  # 减小min_samples但不低于1
        attempt += 1

# 将聚类标签添加到DataFrame
df['cluster'] = labels_cpu

# ========== 第五步：处理多义词（同一个词可出现多个类） ==========
print("处理多义词和小聚类...")

# 1. 处理噪声点（标签为-1的点）
noise_mask = df['cluster'] == -1
noise_count = noise_mask.sum()
print(f"发现 {noise_count} 个噪声点（未分类词）")

if noise_count > 0:
    # 为噪声点找到最近的聚类
    from sklearn.neighbors import NearestNeighbors
    import numpy as np
    
    # 获取非噪声点的索引和聚类标签
    non_noise_idx = np.where(labels_cpu != -1)[0]
    non_noise_clusters = labels_cpu[non_noise_idx]
    
    if len(non_noise_idx) > 0:  # 确保有非噪声点
        # 训练最近邻模型
        nn = NearestNeighbors(n_neighbors=5)
        nn.fit(embeddings[non_noise_idx])
        
        # 为每个噪声点找到最近的非噪声点
        noise_idx = np.where(labels_cpu == -1)[0]
        distances, indices = nn.kneighbors(embeddings[noise_idx])
        
        # 为每个噪声点分配最常见的邻居聚类
        for i, neighbors in enumerate(indices):
            neighbor_clusters = [non_noise_clusters[j] for j in neighbors]
            # 找出最常见的聚类
            from collections import Counter
            most_common = Counter(neighbor_clusters).most_common(1)[0][0]
            # 更新噪声点的聚类标签
            df.loc[df.index[noise_idx[i]], 'cluster'] = most_common

# 2. 处理小聚类（少于10个词的聚类）
cluster_sizes = df['cluster'].value_counts()
small_clusters = cluster_sizes[cluster_sizes < 10].index.tolist()
print(f"发现 {len(small_clusters)} 个小聚类（少于10个词）")

if small_clusters:
    # 合并小聚类到最相似的大聚类
    for small_cluster in small_clusters:
        # 获取小聚类中的词的索引
        small_cluster_idx = df[df['cluster'] == small_cluster].index
        
        # 计算小聚类的平均嵌入向量
        small_cluster_mean = embeddings[small_cluster_idx].mean(axis=0)
        
        # 获取大聚类的平均嵌入向量
        large_clusters = cluster_sizes[cluster_sizes >= 10].index.tolist()
        large_cluster_means = {}
        
        for large_cluster in large_clusters:
            large_cluster_idx = df[df['cluster'] == large_cluster].index
            large_cluster_means[large_cluster] = embeddings[large_cluster_idx].mean(axis=0)
        
        # 找到最相似的大聚类
        best_similarity = -1
        best_cluster = None
        
        for large_cluster, mean_vec in large_cluster_means.items():
            # 计算余弦相似度
            similarity = np.dot(small_cluster_mean, mean_vec) / (np.linalg.norm(small_cluster_mean) * np.linalg.norm(mean_vec))
            if similarity > best_similarity:
                best_similarity = similarity
                best_cluster = large_cluster
        
        # 将小聚类合并到最相似的大聚类
        if best_cluster is not None:
            df.loc[small_cluster_idx, 'cluster'] = best_cluster
            print(f"  合并聚类 {small_cluster} 到聚类 {best_cluster} (相似度: {best_similarity:.4f})")

# 3. 处理多义词（同一个词可以属于多个类别）- 极速优化版本
print("处理多义词（同一个词可以属于多个类别）...")

# 创建一个新的DataFrame来存储多义词结果
multi_sense_df = pd.DataFrame(columns=df.columns)

# 设置相似度阈值
similarity_threshold = 0.85

# 1. 将所有操作移至GPU，避免CPU-GPU数据传输开销
if torch.cuda.is_available():
    print("使用GPU进行全部计算")
    
    # 将所有embeddings一次性转移到GPU
    embeddings_gpu = torch.tensor(embeddings, device='cuda')
    
    # 计算所有词向量的范数(一次性计算)
    word_norms = torch.norm(embeddings_gpu, dim=1)
    
    # 2. 预计算所有聚类的中心向量和范数
    print("预计算聚类中心...")
    cluster_centers_gpu = {}
    cluster_norms_gpu = {}
    
    for cluster in tqdm(df['cluster'].unique(), desc="计算聚类中心"):
        cluster_indices = torch.tensor([i for i, c in enumerate(df['cluster']) 
                                      if c == cluster and i < len(embeddings)], device='cuda')
        if len(cluster_indices) > 0:
            # 使用索引操作一次性获取所有向量
            cluster_vecs = torch.index_select(embeddings_gpu, 0, cluster_indices)
            # 计算平均值
            center = torch.mean(cluster_vecs, dim=0)
            cluster_centers_gpu[cluster] = center
            cluster_norms_gpu[cluster] = torch.norm(center)
    
    # 3. 并行计算所有词与所有聚类中心的相似度矩阵
    print("并行计算相似度矩阵...")
    
    # 创建聚类中心矩阵
    unique_clusters = sorted(list(cluster_centers_gpu.keys()))
    centers_matrix = torch.stack([cluster_centers_gpu[c] for c in unique_clusters])
    centers_norms = torch.stack([cluster_norms_gpu[c] for c in unique_clusters])
    
    # 4. 使用批处理而非多进程
    batch_size = 5000  # 更大的批处理大小
    num_batches = (len(df) + batch_size - 1) // batch_size
    
    for batch_idx in tqdm(range(num_batches), total=num_batches, desc="处理批次"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(df))
        batch_df = df.iloc[start_idx:end_idx]
        
        batch_results = []
        
        # 获取批次的词向量和原始聚类
        batch_indices = [i for i in range(start_idx, end_idx) if i < len(embeddings)]
        batch_vectors = embeddings_gpu[batch_indices]
        batch_norms = word_norms[batch_indices]
        batch_clusters = torch.tensor([df.iloc[i]['cluster'] for i in range(start_idx, end_idx)], device='cuda')
        
        # 计算批次中所有词与所有聚类中心的点积
        # (batch_size, embedding_dim) x (num_clusters, embedding_dim)T = (batch_size, num_clusters)
        dot_products = torch.matmul(batch_vectors, centers_matrix.T)
        
        # 计算余弦相似度
        # (batch_size, 1) x (1, num_clusters) -> (batch_size, num_clusters)
        batch_norms_expanded = batch_norms.unsqueeze(1)
        centers_norms_expanded = centers_norms.unsqueeze(0)
        similarities = dot_products / (batch_norms_expanded * centers_norms_expanded)
        
        # 找出高于阈值的相似度，但排除原始聚类
        for i, (idx, row) in enumerate(batch_df.iterrows()):
            if i >= len(similarities):
                continue
                
            original_cluster = row['cluster']
            original_cluster_idx = unique_clusters.index(original_cluster) if original_cluster in unique_clusters else -1
            
            # 创建掩码，排除原始聚类
            mask = torch.ones(len(unique_clusters), dtype=torch.bool, device='cuda')
            if original_cluster_idx >= 0:
                mask[original_cluster_idx] = False
            
            # 应用掩码和阈值
            high_similarities = similarities[i] * mask
            potential_clusters = torch.where(high_similarities > similarity_threshold)[0]
            
            # 为每个潜在聚类创建新行
            for cluster_idx in potential_clusters:
                cluster = unique_clusters[cluster_idx.item()]
                new_row = row.copy()
                new_row['cluster'] = cluster
                batch_results.append(new_row)
        
        # 合并批次结果
        if batch_results:
            multi_sense_df = pd.concat([multi_sense_df, pd.DataFrame(batch_results)], ignore_index=True)
    
else:
    # 如果没有GPU，使用NumPy的向量化操作
    print("使用NumPy向量化操作...")
    
    # 预计算所有聚类的中心向量
    print("预计算聚类中心...")
    cluster_centers = {}
    for cluster in tqdm(df['cluster'].unique(), desc="计算聚类中心"):
        cluster_indices = np.where(df['cluster'].values == cluster)[0]
        if len(cluster_indices) > 0:
            valid_indices = [idx for idx in cluster_indices if idx < len(embeddings)]
            if valid_indices:
                cluster_centers[cluster] = embeddings[valid_indices].mean(axis=0)
    
    # 计算所有词向量的范数
    word_norms = np.linalg.norm(embeddings, axis=1)
    
    # 创建聚类中心矩阵和范数
    unique_clusters = sorted(list(cluster_centers.keys()))
    centers_matrix = np.vstack([cluster_centers[c] for c in unique_clusters])
    centers_norms = np.array([np.linalg.norm(cluster_centers[c]) for c in unique_clusters])
    
    # 使用批处理和向量化操作
    batch_size = 5000
    for batch_start in range(0, len(df), batch_size):
        batch_end = min(batch_start + batch_size, len(df))
        batch_df = df.iloc[batch_start:batch_end]
        
        # 获取批次的词向量索引
        batch_indices = [i for i in range(batch_start, batch_end) if i < len(embeddings)]
        if not batch_indices:
            continue
            
        # 计算批次中所有词与所有聚类中心的点积
        batch_vectors = embeddings[batch_indices]
        dot_products = np.dot(batch_vectors, centers_matrix.T)
        
        # 计算余弦相似度
        batch_norms = word_norms[batch_indices].reshape(-1, 1)
        similarities = dot_products / (batch_norms * centers_norms.reshape(1, -1))
        
        # 处理每个词
        for i, (idx, row) in enumerate(batch_df.iterrows()):
            if i >= len(similarities):
                continue
                
            original_cluster = row['cluster']
            original_cluster_idx = unique_clusters.index(original_cluster) if original_cluster in unique_clusters else -1
            
            # 创建掩码，排除原始聚类
            mask = np.ones(len(unique_clusters), dtype=bool)
            if original_cluster_idx >= 0:
                mask[original_cluster_idx] = False
            
            # 应用掩码和阈值
            high_similarities = similarities[i] * mask
            potential_clusters = np.where(high_similarities > similarity_threshold)[0]
            
            # 为每个潜在聚类创建新行
            for cluster_idx in potential_clusters:
                cluster = unique_clusters[cluster_idx]
                new_row = row.copy()
                new_row['cluster'] = cluster
                multi_sense_df = pd.concat([multi_sense_df, pd.DataFrame([new_row])], ignore_index=True)
        
        print(f"处理批次 {batch_start//batch_size + 1}/{(len(df)-1)//batch_size + 1} 完成")

# 将多义词结果合并到原始DataFrame
df = pd.concat([df, multi_sense_df], ignore_index=True)
print(f"处理后的总行数: {len(df)}")
print(f"添加的多义词数量: {len(multi_sense_df)}")

# 4. 验证最终聚类结果
final_cluster_sizes = df['cluster'].value_counts()
final_small_clusters = final_cluster_sizes[final_cluster_sizes < 10]
print(f"最终聚类数量: {len(final_cluster_sizes)}")
print(f"最终小聚类数量: {len(final_small_clusters)}")
if len(final_small_clusters) > 0:
    print("警告: 仍有小聚类存在，可能需要进一步调整参数")

# ========== 第六步：按 cluster 分组 & 排序 ==========
# 计算每个 cluster 的最小 index 用作排序
cluster_order = df.groupby('cluster')['index'].min().sort_values().reset_index()
cluster_order['new_cluster'] = range(len(cluster_order))

# 重新映射 cluster ID，按 index 排序
cluster_map = dict(zip(cluster_order['cluster'], cluster_order['new_cluster']))
df['cluster'] = df['cluster'].map(cluster_map)

# ========== 新增：为每个聚类生成语义标签 ==========
print("\n为每个聚类生成语义标签...")

# 1. 提取每个聚类的代表性词汇（增加到前30个词以提供更多上下文）
cluster_words = {}
for cluster in df['cluster'].unique():
    # 获取该聚类的所有词
    cluster_df = df[df['cluster'] == cluster]
    # 按原始索引排序，取前30个词
    top_words = cluster_df.sort_values('index')['collocation'].head(30).tolist()
    # 过滤掉可能的NaN值
    top_words = [str(word) for word in top_words if pd.notna(word)]
    cluster_words[cluster] = top_words

# 2. 使用API调用大模型为每个聚类生成语义标签
print("使用API调用大模型生成聚类标签...")

# 导入必要的库
from dotenv import load_dotenv
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

# 加载环境变量
load_dotenv()

# 设置代理（如需要）
# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"

# 创建LLM
llm = ChatOpenAI(
    model="deepseek/deepseek-chat-v3-0324:free",  # 使用DeepSeek模型
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base=os.getenv("OPENROUTER_BASE_URL"),
    temperature=0.1,  # 低温度以获得更确定性的输出
    request_timeout=30,
)

# 构造Prompt模板
prompt = ChatPromptTemplate.from_template("""
你是一个专业的语言学家，擅长分析词汇的语义类别。
以下是一组英语单词，它们属于同一个语义类别。请用中文简洁地描述这个类别（5-10个字）：

单词: {words}

请直接给出类别名称，不要重复问题或解释过程。
""")

# 构造链
chain = prompt | llm | StrOutputParser()

# 为每个聚类生成描述性标签
cluster_labels = {}

# 处理每个聚类
for cluster, words in tqdm(cluster_words.items(), desc="生成聚类标签"):
    if not words:
        cluster_labels[cluster] = "未知类别"
        continue
    
    # 使用所有提取的词汇，但最多使用20个以避免提示词过长
    words_to_use = words[:20] if len(words) > 20 else words
    words_str = ", ".join(words_to_use)
    
    try:
        # 使用API调用大模型生成标签
        label = chain.invoke({"words": words_str})
        
        # 如果标签太长，截取前30个字符
        if len(label) > 30:
            label = label[:30]
        
        # 添加前3个词作为示例
        examples = ", ".join(words[:3])
        cluster_labels[cluster] = f"{label} (例: {examples})"
    except Exception as e:
        print(f"生成聚类 {cluster} 的标签时出错: {e}")
        # 如果生成失败，使用简单的标签
        cluster_labels[cluster] = f"聚类 {cluster} (例: {', '.join(words[:3])})"

# 将标签添加到DataFrame
df['cluster_label'] = df['cluster'].map(cluster_labels)

# ========== 新增：重新排序并添加新索引 ==========
print("\n重新排序DataFrame...")

# 先按cluster排序，再按原始index排序
df_sorted = df.sort_values(['cluster', 'index'])

# 添加新的索引列作为第一列
df_sorted['index_0'] = range(len(df_sorted))

# 重新排列列顺序，将index_0放在第一位
cols = ['index_0'] + [col for col in df_sorted.columns if col != 'index_0']
df_sorted = df_sorted[cols]

# 替换原始DataFrame
df = df_sorted

# ========== 第七步：保存结果 ==========
output_path = "clustered_vocab_with_labels.csv"
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"带标签的聚类结果已保存到 {output_path}.")

# ========== 第八步：展示每类前10个词和标签 ==========
print("\n每个聚类的标签和前10个词:")
for cluster in sorted(df['cluster'].unique()):
    cluster_df = df[df['cluster'] == cluster]
    label = cluster_df['cluster_label'].iloc[0]
    words = cluster_df['collocation'].head(10).tolist()
    words = [str(word) for word in words if pd.notna(word)]
    words_str = ", ".join(words)
    print(f"聚类 {cluster} - {label}:\n  {words_str}\n")


Loading vocabulary from coca60000_trans_byTX.csv...
Loading SentenceTransformer model (GPU)...
CUDA available: True
Using device: cuda:0
Encoding words into embeddings...


Batches:   0%|          | 0/235 [00:00<?, ?it/s]

Clustering with cuML HDBSCAN (GPU)...
聚类参数: min_cluster_size=10, min_samples=5
尝试 1: 获得 2 个聚类
尝试 2: 获得 13 个聚类
尝试 3: 获得 657 个聚类
处理多义词和小聚类...
发现 47355 个噪声点（未分类词）
发现 0 个小聚类（少于10个词）
处理多义词（同一个词可以属于多个类别）...
使用GPU进行全部计算
预计算聚类中心...


计算聚类中心: 100%|██████████| 657/657 [00:53<00:00, 12.24it/s]


并行计算相似度矩阵...


处理批次: 100%|██████████| 13/13 [01:03<00:00,  4.89s/it]


处理后的总行数: 60029
添加的多义词数量: 8
最终聚类数量: 657
最终小聚类数量: 0

为每个聚类生成语义标签...
使用API调用大模型生成聚类标签...


生成聚类标签:   0%|          | 1/657 [00:07<1:25:32,  7.82s/it]

生成聚类 0 的标签时出错: Connection error.


生成聚类标签:   0%|          | 2/657 [00:10<51:36,  4.73s/it]  

生成聚类 1 的标签时出错: Connection error.


生成聚类标签:   0%|          | 3/657 [00:23<1:33:39,  8.59s/it]

生成聚类 2 的标签时出错: Connection error.


生成聚类标签:   1%|          | 4/657 [00:31<1:30:45,  8.34s/it]

生成聚类 3 的标签时出错: Connection error.


生成聚类标签:   1%|          | 5/657 [00:34<1:07:39,  6.23s/it]

生成聚类 4 的标签时出错: Connection error.


生成聚类标签:   1%|          | 6/657 [00:42<1:17:23,  7.13s/it]

生成聚类 5 的标签时出错: Connection error.


生成聚类标签:   1%|          | 7/657 [00:50<1:19:50,  7.37s/it]

生成聚类 6 的标签时出错: Connection error.


生成聚类标签:   1%|          | 8/657 [00:58<1:21:21,  7.52s/it]

生成聚类 7 的标签时出错: Connection error.


生成聚类标签:   1%|▏         | 9/657 [01:01<1:04:21,  5.96s/it]

生成聚类 8 的标签时出错: Connection error.


生成聚类标签:   1%|▏         | 9/657 [01:13<1:27:43,  8.12s/it]


KeyboardInterrupt: 

In [3]:
!pip install langchain_openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting langchain_openai
  Downloading http://mirrors.aliyun.com/pypi/packages/a9/60/886dc53c91031e26542f7ac1ea4062b7ebe542d22970996acaee59aa1cab/langchain_openai-0.3.17-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.9/62.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting openai<2.0.0,>=1.68.2 (from langchain_openai)
  Downloading http://mirrors.aliyun.com/pypi/packages/3c/4c/3889bc332a6c743751eb78a4bada5761e50a8a847ff0e46c1bd23ce12362/openai-1.78.1-py3-none-any.whl (680 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m680.9/680.9 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Collecting jiter<1,>=0.4.0 (from openai<2.0.0,>=1.68.2->langchain_openai)
  Downloading http://mirrors.aliyun.com/pypi/packages/87/67/22728a86ef53589c3720225778f7c5fdb617080e3deaed58b04789418212/jiter-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (351 kB)
[2K     

In [7]:
from dotenv import load_dotenv
import os
load_dotenv()
# 设置代理（如需要）
os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
# 构造 Prompt 模板
prompt = ChatPromptTemplate.from_template("""
请用温馨优美的风格，用简体中文回答以下问题，尽量让人读得愉悦：
问题：{question}
回答：
""")
# 创建支持流式输出的 LLM
llm = ChatOpenAI(
    model="deepseek/deepseek-chat-v3-0324:free",
    openai_api_key=os.getenv("OPENROUTER_API_KEY"),
    openai_api_base=os.getenv("OPENROUTER_BASE_URL"),
    streaming=True,  # ✅ 开启流式输出
    request_timeout=30,
)
# 构造链
chain = prompt | llm
# 流式调用
for chunk in chain.stream({"question": "天空为什么是蓝色的？"}):
    print(chunk.content, end="", flush=True)  # 逐块输出


APIConnectionError: Connection error.