In [10]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
sys.path.append('../src')
import utils
from gensim.models import KeyedVectors
import warnings
warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

def load_dimension_words(file_path):
    """
    加载维度词表文件
    
    Args:
        file_path: 维度词表文件路径
        
    Returns:
        dict: 维度名称到词列表的映射
    """
    dimensions = {}
    current_dimension = None
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                    
                if line.endswith(':'):
                    current_dimension = line[:-1]
                    dimensions[current_dimension] = []
                elif current_dimension:
                    words = line.split()
                    dimensions[current_dimension].extend(words)
                    
        return dimensions
    except Exception as e:
        print(f"加载维度词表时出错: {e}")
        return {}

def expand_dimension_words_by_similarity(models, dimension_words, target_word="法治", 
                                       similarity_threshold=0.3, max_words_per_dim=50):
    """
    基于词向量相似度扩展维度词表
    
    Args:
        models: 词向量模型字典
        dimension_words: 初始维度词表
        target_word: 目标词（法治）
        similarity_threshold: 相似度阈值
        max_words_per_dim: 每个维度最大词数
        
    Returns:
        dict: 扩展后的维度词表
    """
    expanded_words = {dim: set(words) for dim, words in dimension_words.items()}
    
    # 使用最新时期的模型进行扩展
    latest_period = max(models.keys())
    model = models[latest_period]
    
    print(f"使用 {latest_period} 模型进行词表扩展")
    
    # 获取与目标词相似的词
    if target_word in model:
        similar_words = model.most_similar(target_word, topn=1000)
        
        for word, similarity in similar_words:
            if similarity < similarity_threshold:
                break
                
            # 计算该词与各维度核心词的平均相似度
            dim_similarities = {}
            
            for dim, core_words in dimension_words.items():
                similarities = []
                for core_word in core_words:
                    if core_word in model:
                        try:
                            sim = model.similarity(word, core_word)
                            similarities.append(sim)
                        except:
                            pass
                
                if similarities:
                    dim_similarities[dim] = np.mean(similarities)
            
            # 将词分配给相似度最高的维度
            if dim_similarities:
                best_dim = max(dim_similarities, key=dim_similarities.get)
                if (dim_similarities[best_dim] > similarity_threshold and 
                    len(expanded_words[best_dim]) < max_words_per_dim):
                    expanded_words[best_dim].add(word)
    
    # 转换回列表格式
    result = {dim: list(words) for dim, words in expanded_words.items()}
    
    for dim, words in result.items():
        print(f"{dim}: {len(words)} 个词")
    
    return result

def cluster_similar_words(models, similar_words_by_period, n_clusters=4, 
                         exclude_words_path=None, top_n=150):
    """
    对相似词进行聚类分析，为每个时期的模型都执行聚类
    
    Args:
        models: 词向量模型字典
        similar_words_by_period: 各时期相似词字典
        n_clusters: 聚类数量
        exclude_words_path: 排除词文件路径
        top_n: 每个时期取前N个词
        
    Returns:
        dict: 每个时期的聚类结果 {period: (clusters, word_vectors, cluster_labels, valid_words)}
    """
    # 加载排除词
    exclude_words = set()
    if exclude_words_path and Path(exclude_words_path).exists():
        exclude_words = utils.load_exclude_words(exclude_words_path)
        print(f"已加载 {len(exclude_words)} 个排除词")
    
    # 获取所有时期的词汇并集
    all_words = set()
    for period, word_list in similar_words_by_period.items():
        # 过滤排除词并取前N个
        filtered_words = [(word, sim) for word, sim in word_list 
                         if word not in exclude_words][:top_n]
        period_words = set(word for word, _ in filtered_words)
        all_words.update(period_words)
    
    print(f"总共收集到 {len(all_words)} 个唯一词汇")
    
    # 为每个时期的模型执行聚类
    all_results = {}
    
    for period, model in models.items():
        print(f"\n=== 对 {period} 模型进行聚类 ===")
        
        # 提取在当前模型中存在的词及其向量
        valid_words = []
        word_vectors = []
        
        for word in all_words:
            if word in model:
                valid_words.append(word)
                word_vectors.append(model[word])
        
        if not word_vectors:
            print(f"{period}: 没有找到有效的词向量")
            all_results[period] = ({}, np.array([]), [], [])
            continue
        
        word_vectors = np.array(word_vectors)
        print(f"{period}: 成功提取 {len(valid_words)} 个词的向量，维度: {word_vectors.shape}")
        
        # 执行K-means聚类
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(word_vectors)
        
        # 整理聚类结果
        clusters = {}
        for i in range(n_clusters):
            cluster_words = [valid_words[j] for j in range(len(valid_words)) 
                            if cluster_labels[j] == i]
            clusters[f"聚类{i+1}"] = cluster_words
            print(f"  聚类{i+1}: {len(cluster_words)} 个词")
            print(f"    前10个词: {cluster_words[:10]}")
        
        # 保存当前时期的结果
        all_results[period] = (clusters, word_vectors, cluster_labels, valid_words)
        
        # 保存聚类结果到文件
        save_cluster_results(clusters, f"topic_word/cluster_results_{period}_{n_clusters}.txt")
    
    return all_results

def visualize_clusters(word_vectors, cluster_labels, valid_words, method='tsne', 
                      max_labels=50):
    """
    可视化聚类结果
    
    Args:
        word_vectors: 词向量矩阵
        cluster_labels: 聚类标签
        valid_words: 词列表
        method: 降维方法 ('tsne' 或 'pca')
        max_labels: 最大显示标签数
    """
    if len(word_vectors) == 0:
        print("没有数据可以可视化")
        return
    
    # 降维
    if method == 'tsne':
        reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, len(word_vectors)-1))
    else:
        reducer = PCA(n_components=2, random_state=42)
    
    reduced_vectors = reducer.fit_transform(word_vectors)
    
    # 绘图
    plt.figure(figsize=(14, 10))
    
    # 为每个聚类使用不同颜色
    unique_labels = np.unique(cluster_labels)
    colors = plt.cm.Set3(np.linspace(0, 1, len(unique_labels)))
    
    for i, label in enumerate(unique_labels):
        mask = cluster_labels == label
        plt.scatter(reduced_vectors[mask, 0], reduced_vectors[mask, 1], 
                   c=[colors[i]], label=f'聚类{label+1}', alpha=0.7, s=50)
    
    # 添加词标签（只显示部分以避免重叠）
    if len(valid_words) <= max_labels:
        for i, word in enumerate(valid_words):
            plt.annotate(word, (reduced_vectors[i, 0], reduced_vectors[i, 1]), 
                        fontsize=8, alpha=0.8)
    else:
        # 随机选择一些词显示标签
        indices = np.random.choice(len(valid_words), max_labels, replace=False)
        for i in indices:
            plt.annotate(valid_words[i], 
                        (reduced_vectors[i, 0], reduced_vectors[i, 1]), 
                        fontsize=8, alpha=0.8)
    
    plt.title(f'法治相关词汇聚类可视化 ({method.upper()})')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    return plt.gcf()

def analyze_cluster_quality(word_vectors, cluster_labels):
    """分析聚类质量"""
    if len(set(cluster_labels)) > 1:
        silhouette = silhouette_score(word_vectors, cluster_labels)
        calinski = calinski_harabasz_score(word_vectors, cluster_labels)
        
        print(f"轮廓系数 (Silhouette Score): {silhouette:.3f}")
        print(f"Calinski-Harabasz指数: {calinski:.3f}")
        
        return silhouette, calinski
    else:
        print("只有一个聚类，无法计算质量指标")
        return None, None

def save_cluster_results(clusters, output_path):
    """保存聚类结果到文件"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# 法治相关词汇聚类结果\n\n")
        
        for cluster_name, words in clusters.items():
            f.write(f"# {cluster_name} ({len(words)}个词)\n")
            f.write(f"{cluster_name}:\n")
            
            # 每行写10个词
            for i in range(0, len(words), 10):
                line_words = words[i:i+10]
                f.write(" ".join(line_words) + "\n")
            f.write("\n")
    
    print(f"已保存聚类结果到: {output_path}")

def calculate_dimension_similarities(models, dimension_words, target_word="法治"):
    """
    计算目标词与各维度的相似度
    
    Args:
        models: 词向量模型字典
        dimension_words: 维度词表字典
        target_word: 目标词
        
    Returns:
        DataFrame: 各时期各维度的相似度矩阵
    """
    periods = sorted(models.keys())
    dimensions = list(dimension_words.keys())
    
    # 创建结果DataFrame
    similarity_data = []
    
    for period in periods:
        model = models[period]
        if target_word not in model:
            print(f"警告: '{target_word}'在{period}模型中不存在")
            continue
            
        period_similarities = {"时期": period}
        
        for dim in dimensions:
            dim_words = dimension_words[dim]
            similarities = []
            
            for word in dim_words:
                if word in model and word != target_word:
                    try:
                        sim = model.similarity(target_word, word)
                        similarities.append(sim)
                    except:
                        pass
            
            if similarities:
                avg_sim = np.mean(similarities)
                period_similarities[dim] = avg_sim
            else:
                period_similarities[dim] = 0
        
        similarity_data.append(period_similarities)
    
    return pd.DataFrame(similarity_data)

def plot_dimension_trends(similarity_df, title="法治维度语义相似度变化趋势"):
    """绘制维度趋势图"""
    plt.figure(figsize=(12, 6))
    
    periods = similarity_df["时期"]
    dimensions = [col for col in similarity_df.columns if col != "时期"]
    
    for dim in dimensions:
        plt.plot(periods, similarity_df[dim], marker='o', linewidth=2, label=dim)
    
    plt.title(title)
    plt.xlabel("时期")
    plt.ylabel("平均相似度")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
    
    return plt.gcf()

def plot_dimension_radar(similarity_df, title="法治维度语义结构雷达图"):
    """绘制雷达图"""
    periods = similarity_df["时期"].tolist()
    dimensions = [col for col in similarity_df.columns if col != "时期"]
    N = len(dimensions)
    
    # 设置角度
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # 闭合雷达图
    
    # 创建图形
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
    
    # 为每个时期绘制一条线
    colors = ['red', 'blue', 'green', 'orange', 'purple']
    for i, period in enumerate(periods):
        values = similarity_df.iloc[i][dimensions].tolist()
        values += values[:1]  # 闭合雷达图
        
        # 绘制线条
        ax.plot(angles, values, linewidth=2, label=period, color=colors[i % len(colors)])
        # 填充区域
        ax.fill(angles, values, alpha=0.1, color=colors[i % len(colors)])
    
    # 设置标签
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(dimensions)
    
    # 添加图例和标题
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
    plt.title(title, size=15, pad=20)
    plt.show()
    
    return fig

def plot_dimension_heatmap(similarity_df, title="法治维度语义相似度热力图"):
    """绘制热力图"""
    # 准备数据
    periods = similarity_df["时期"].tolist()
    dimensions = [col for col in similarity_df.columns if col != "时期"]
    
    # 创建矩阵
    matrix_data = similarity_df[dimensions].values
    
    # 绘制热力图
    plt.figure(figsize=(10, 6))
    sns.heatmap(matrix_data, 
                xticklabels=dimensions, 
                yticklabels=periods,
                annot=True, 
                fmt='.3f', 
                cmap="YlOrRd", 
                linewidths=0.5)
    
    plt.title(title)
    plt.tight_layout()
    plt.show()
    
    return plt.gcf()


In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from gensim.models import KeyedVectors
import matplotlib.font_manager as fm
from sklearn.manifold import TSNE
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.colors as mcolors
import sys
from pathlib import Path


# 设置更好的可视化风格
plt.style.use('seaborn-v0_8-whitegrid')
sns.set(font_scale=1.2)
sns.set_style("whitegrid")
# 假设notebooks目录在项目根目录下
project_root = Path.cwd().parent
sys.path.append(str(project_root))
from src.utils import download_chinese_font
import src.utils as utils
# 下载并安装字体
font_path = download_chinese_font()

# 设置matplotlib使用下载的字体
if font_path:
    plt.rcParams['font.sans-serif'] = ['SimHei'] + plt.rcParams['font.sans-serif']
    print("成功设置中文字体")
else:
    print("无法设置中文字体，将使用替代方案")

plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号


# 测试中文显示
# plt.figure(figsize=(6, 4))
# plt.title("中文测试")
# plt.text(0.5, 0.5, "法治", fontsize=20, ha='center')
# plt.axis('off')
# plt.show()

# 定义项目根目录和模型目录
PROJECT_ROOT = Path.cwd().parent
MODELS_DIR = PROJECT_ROOT / "models"

# 可能的模型目录
FINE_TUNED_MODELS_DIR = MODELS_DIR / "fine_tuned_vectors_flexible"
SLIDING_WINDOW_MODELS_DIR = MODELS_DIR / "fine_tuned_vectors_sliding_window"

# 检查哪个目录存在并包含模型
if FINE_TUNED_MODELS_DIR.exists() and any(FINE_TUNED_MODELS_DIR.glob("*_wordvectors.kv")):
    MODELS_DIR = FINE_TUNED_MODELS_DIR
    print(f"使用固定时期模型目录: {MODELS_DIR}")
elif SLIDING_WINDOW_MODELS_DIR.exists():
    # 查找滑动窗口模型的子目录
    subdirs = [d for d in SLIDING_WINDOW_MODELS_DIR.iterdir() if d.is_dir()]
    if subdirs:
        MODELS_DIR = subdirs[0]  # 使用第一个子目录
        print(f"使用滑动窗口模型目录: {MODELS_DIR}")
    else:
        print(f"滑动窗口模型目录存在，但没有子目录")
else:
    print(f"未找到模型目录，使用默认路径: {MODELS_DIR}")

# 加载所有可用的模型
def load_models():
    """加载目录中所有的词向量模型"""
    models = {}
    model_files = list(MODELS_DIR.glob("*_wordvectors.kv"))
    
    if not model_files:
        print(f"在 {MODELS_DIR} 中没有找到模型文件")
        return models
    
    print(f"找到 {len(model_files)} 个模型文件:")
    for model_file in sorted(model_files):
        period_name = model_file.stem.replace("_wordvectors", "")
        print(f"  加载模型: {period_name}")
        try:
            models[period_name] = KeyedVectors.load(str(model_file))
            print(f"  成功加载 {period_name}, 词汇量: {len(models[period_name].index_to_key)}")
        except Exception as e:
            print(f"  加载 {period_name} 失败: {e}")
    
    return models

# 加载模型
models = load_models()

# 检查模型是否成功加载
if not models:
    print("没有成功加载任何模型，请检查模型路径")
else:
    print(f"\n成功加载了 {len(models)} 个模型:")
    for period_name, model in models.items():
        print(f"  {period_name}: 词汇量 {len(model.index_to_key)}")

字体已存在: /home/fangshikai/.fonts/SimHei.ttf
刷新字体缓存...
成功设置中文字体
使用固定时期模型目录: /home/fangshikai/law-word-vector/models/fine_tuned_vectors_flexible
找到 3 个模型文件:
  加载模型: Era1_1978-1996
  成功加载 Era1_1978-1996, 词汇量: 4874
  加载模型: Era2_1997-2013
  成功加载 Era2_1997-2013, 词汇量: 4992
  加载模型: Era3_2014-2024
  成功加载 Era3_2014-2024, 词汇量: 5000

成功加载了 3 个模型:
  Era1_1978-1996: 词汇量 4874
  Era2_1997-2013: 词汇量 4992
  Era3_2014-2024: 词汇量 5000


In [5]:
# 创建输出目录
topic_word_dir = Path("topic_word")
topic_word_dir.mkdir(exist_ok=True)


era_files = {
    'Era1_1978-1996': 'similar_words/Era1_1978-1996_final.txt',
    'Era2_1997-2013': 'similar_words/Era2_1997-2013_final.txt',
    'Era3_2014-2024': 'similar_words/Era3_2014-2024_final.txt'
}

similar_words_by_period = {}
for era, file_path in era_files.items():
    word_list = utils.load_expert_word_list(file_path)
    if word_list:
        similar_words_by_period[era] = word_list
        print(f"加载 {era}: {len(word_list)} 个词")

    

加载 Era1_1978-1996: 150 个词
加载 Era2_1997-2013: 150 个词
加载 Era3_2014-2024: 150 个词


In [12]:
print("\n=== 执行4聚类分析 ===")
all_results_cluster_4= cluster_similar_words(
    models, similar_words_by_period, n_clusters=4, 
    exclude_words_path="exclude_words.txt", top_n=150
)

print("\n=== 执行3聚类分析 ===")
all_results_cluster_3= cluster_similar_words(
    models, similar_words_by_period, n_clusters=3, 
    exclude_words_path="exclude_words.txt", top_n=150
)


=== 执行4聚类分析 ===
已加载 17 个排除词
总共收集到 284 个唯一词汇

=== 对 Era1_1978-1996 模型进行聚类 ===
Era1_1978-1996: 成功提取 181 个词的向量，维度: (181, 300)


  聚类1: 54 个词
    前10个词: ['推进改革', '思想', '德治', '严明', '法律常识', '综合治理', '严厉打击', '现代化', '至上', '公安队伍']
  聚类2: 41 个词
    前10个词: ['依法治国', '违法必究', '懂法', '有法', '执法人员', '严格执法', '依法', '法律制裁', '依法行事', '法律意识']
  聚类3: 35 个词
    前10个词: ['司法制度', '经济纠纷', '刑事', '赔偿法', '法律监督', '行政诉讼', '人民意志', '刑法', '法律', '司法机关']
  聚类4: 51 个词
    前10个词: ['制度', '法律法规', '市场经济', '规范化', '法规', '公平', '地方性', '民主化', '管理工作', '建立健全']
已保存聚类结果到: topic_word/cluster_results_Era1_1978-1996_4.txt

=== 对 Era2_1997-2013 模型进行聚类 ===
Era2_1997-2013: 成功提取 249 个词的向量，维度: (249, 300)
  聚类1: 22 个词
    前10个词: ['人权意识', '违法必究', '尊法', '程序法', '人民意志', '司法权威', '依法行事', '司法腐败', '依宪', '司法权']
  聚类2: 63 个词
    前10个词: ['友爱', '思想', '实效性', '道德素质', '懂法', '德治', '失范', '化解矛盾', '法律常识', '物质文明']
  聚类3: 79 个词
    前10个词: ['依法治国', '司法制度', '国家治理', '刑事', '善治', '行政法', '法律监督', '文明执法', '公权', '民主化']
  聚类4: 85 个词
    前10个词: ['推进改革', '依法打击', '制度', '法律法规', '市场经济', '规范化', '严明', '法规', '公平', '地方性']
已保存聚类结果到: topic_word/cluster_results_Era2_1997-2013_4.txt

=== 对 Era3_2014-2024 模型进行聚类 ==

In [13]:
    # 4. 可视化聚类结果
word_vectors_4,cluster_labels_4,valid_words_4=all_results_cluster_4['Era1_1978-1996']
print("\n=== 可视化4聚类结果 ===")
visualize_clusters(word_vectors_4, cluster_labels_4, valid_words_4, method='tsne')
    
    # if len(word_vectors_3) > 0:
    #     print("\n=== 可视化3聚类结果 ===")
    #     visualize_clusters(word_vectors_3, cluster_labels_3, valid_words_3, method='tsne')


ValueError: too many values to unpack (expected 3)

In [9]:
if clusters_3:
        save_cluster_results(clusters_3, topic_word_dir / "cluster_results_3.txt")
if clusters_4:
        save_cluster_results(clusters_4, topic_word_dir / "cluster_results_4.txt")


已保存聚类结果到: topic_word/cluster_results_3.txt
已保存聚类结果到: topic_word/cluster_results_4.txt
