读取数据

In [1]:
import pandas as pd
import os

# 数据路径
data_path = "E:/solid-state patent data/csv_cleaned_ipc.csv"
output_folder = "E:/solid-state patent data/新建文件夹"

try:
    # 创建输出文件夹（如果不存在）
    os.makedirs(output_folder, exist_ok=True)
    
    # 读取数据
    df = pd.read_csv(data_path, encoding='utf-8', low_memory=False)
    
    # 检查是否存在 'IPC - 现版' 列
    if 'IPC - 现版' not in df.columns:
        raise KeyError("数据中缺少 'IPC - 现版' 列，请检查数据。")
    
    # 提取 IPC 分类号
    df_ipc = df[['公开号', 'IPC - 现版']].dropna(subset=['IPC - 现版'])
    
    print(f"成功加载数据，共 {len(df_ipc)} 条记录。")
    
except Exception as e:
    print("读取或处理数据时出错:", e)

成功加载数据，共 119700 条记录。


构建IPC-频繁2项集

In [2]:
from mlxtend.frequent_patterns import apriori
from collections import defaultdict

try:
    # 将每个专利对应的 IPC 分类号转换为集合
    ipc_sets = df_ipc.groupby('公开号')['IPC - 现版'].apply(set).tolist()
    
    # 获取所有唯一的 IPC 分类号，并转换为有序列表
    all_ipcs = sorted(set.union(*ipc_sets))  # 使用 sorted 确保列名有序
    
    # 构建事务矩阵
    transaction_matrix = []
    for ipc_set in ipc_sets:
        row = [1 if ipc in ipc_set else 0 for ipc in all_ipcs]
        transaction_matrix.append(row)
    
    # 转换为 DataFrame
    transaction_df = pd.DataFrame(transaction_matrix, columns=all_ipcs)
    
    # 使用 Apriori 算法挖掘频繁项集
    frequent_itemsets = apriori(transaction_df, min_support=0.001, use_colnames=True)
    
    
    #这里调参
    
    
    # 过滤频繁 2-项集
    frequent_2_itemsets = frequent_itemsets[frequent_itemsets['itemsets'].apply(lambda x: len(x) == 2)]
    
    # 保存频繁 2-项集到 CSV 文件
    frequent_2_itemsets.to_csv(os.path.join(output_folder, "frequent_2_itemsets.csv"), index=False, encoding='utf-8')
    
    print(f"频繁 2-项集已成功保存到 {output_folder}/frequent_2_itemsets.csv")
    
except Exception as e:
    print("构建频繁 2-项集时出错:", e)



频繁 2-项集已成功保存到 E:/solid-state patent data/新建文件夹/frequent_2_itemsets.csv


加载频繁2项集

In [3]:
import pandas as pd
import os

# 数据路径
output_folder = "E:/solid-state patent data/新建文件夹"
frequent_2_itemsets_path = os.path.join(output_folder, "frequent_2_itemsets.csv")

try:
    # 加载频繁 2-项集
    frequent_2_itemsets = pd.read_csv(frequent_2_itemsets_path)
    
    # 将 frozenset 列转换为实际的集合
    frequent_2_itemsets['itemsets'] = frequent_2_itemsets['itemsets'].apply(
        lambda x: set(eval(x))  # 将 frozenset 转换为 set
    )
    
    print("成功加载频繁 2-项集：")
    print(frequent_2_itemsets)
    
except Exception as e:
    print("加载频繁 2-项集时出错:", e)

成功加载频繁 2-项集：
      support      itemsets
0    0.001466  {A24F, A61M}
1    0.001005  {H01G, A24F}
2    0.001508  {H01M, A24F}
3    0.003561  {A24F, H02J}
4    0.002388  {H05B, A24F}
..        ...           ...
96   0.001424  {H05B, H01M}
97   0.001340  {H01M, H05K}
98   0.001885  {H01M, H10N}
99   0.001215  {H02J, H02M}
100  0.001759  {H05B, H02J}

[101 rows x 2 columns]


构建共现矩阵

In [4]:
try:
    # 获取所有唯一的 IPC 分类号
    all_ipcs = sorted(set.union(*[set(itemset) for itemset in frequent_2_itemsets['itemsets']]))
    
    # 初始化共现矩阵
    co_occurrence_matrix = pd.DataFrame(0, index=all_ipcs, columns=all_ipcs)
    
    # 填充共现矩阵
    for _, row in frequent_2_itemsets.iterrows():
        itemset = list(row['itemsets'])
        support = row['support']
        co_occurrence_matrix.at[itemset[0], itemset[1]] = support
        co_occurrence_matrix.at[itemset[1], itemset[0]] = support
    
    # 保存共现矩阵到 CSV 文件
    co_occurrence_matrix.to_csv(os.path.join(output_folder, "co_occurrence_matrix.csv"), encoding='utf-8')
    
    print(f"共现矩阵已成功保存到 {output_folder}/co_occurrence_matrix.csv")
    
except Exception as e:
    print("构建共现矩阵时出错:", e)

共现矩阵已成功保存到 E:/solid-state patent data/新建文件夹/co_occurrence_matrix.csv


  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[0], itemset[1]] = support
  co_occurrence_matrix.at[itemset[1], itemset[0]] = support
  co_occurrence_matrix.at[itemset[0], it

使用 NetworkX 构建共现网络，并生成可视化图表。

In [5]:
import networkx as nx
import matplotlib.pyplot as plt

try:
    # 构建共现网络
    G = nx.Graph()
    
    # 添加节点和边
    for _, row in frequent_2_itemsets.iterrows():
        itemset = list(row['itemsets'])
        support = row['support']
        G.add_edge(itemset[0], itemset[1], weight=support)
    
    # 绘制网络图
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=0.5)
    nx.draw_networkx(
        G, pos, with_labels=True, node_size=50, font_size=8,
        edge_color=[d['weight'] for u, v, d in G.edges(data=True)], 
        edge_cmap=plt.cm.Blues
    )
    plt.title("IPC Co-occurrence Network")
    
    # 保存图表
    plt.savefig(os.path.join(output_folder, "ipc_co_occurrence_network.png"), dpi=300)
    plt.close()
    
    print(f"共现网络图表已成功保存到 {output_folder}/ipc_co_occurrence_network.png")
    
except Exception as e:
    print("构建共现网络时出错:", e)

共现网络图表已成功保存到 E:/solid-state patent data/新建文件夹/ipc_co_occurrence_network.png


使用 Louvain 方法对共现网络进行社群划分。

In [6]:
from community import community_louvain

try:
    # 使用 Louvain 方法进行社群划分
    partition = community_louvain.best_partition(G,resolution=2.0)
    
    
    # 将社群信息保存到 CSV 文件
    partition_df = pd.DataFrame(list(partition.items()), columns=['Node', 'Community'])
    partition_df.to_csv(os.path.join(output_folder, "community_partition.csv"), index=False, encoding='utf-8')
    
    print(f"社群划分结果已成功保存到 {output_folder}/community_partition.csv")
    
    # 可视化社群划分
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=0.5)
    cmap = plt.cm.get_cmap('viridis', max(partition.values()) + 1)
    nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=50, cmap=cmap, node_color=list(partition.values()))
    nx.draw_networkx_edges(G, pos, alpha=0.5)
    plt.title("IPC Community Detection")
    
    # 保存社群划分图表
    plt.savefig(os.path.join(output_folder, "ipc_community_detection.png"), dpi=300)
    plt.close()
    
    print(f"社群划分图表已成功保存到 {output_folder}/ipc_community_detection.png")
    
except Exception as e:
    print("社群分析时出错:", e)

社群划分结果已成功保存到 E:/solid-state patent data/新建文件夹/community_partition.csv


  cmap = plt.cm.get_cmap('viridis', max(partition.values()) + 1)


社群划分图表已成功保存到 E:/solid-state patent data/新建文件夹/ipc_community_detection.png


检查网络结构和节点情况

In [7]:
# # 检查孤立节点
# isolated_nodes = list(nx.isolates(G))
# print(f"孤立节点数量: {len(isolated_nodes)}")
# print("孤立节点:", isolated_nodes)

# # 检查每个节点的度
# degrees = G.degree()
# print("节点度分布:")
# for node, degree in degrees:
#     print(f"{node}: {degree}")

统计低连接节点

In [8]:
# import networkx as nx

# # 计算每个节点的度
# degrees = dict(G.degree())

# # 打印节点度分布
# print("节点度分布:")
# for node, degree in degrees.items():
#     print(f"{node}: {degree}")

# # 筛选出低连接节点（例如度小于等于 2 的节点）
# low_degree_nodes = [node for node, degree in degrees.items() if degree <= 2]
# print(f"低连接节点（度 <= 2）: {low_degree_nodes}")

移除低连接度节点

In [9]:
# # 移除低连接节点
# G_filtered = G.copy()
# G_filtered.remove_nodes_from(low_degree_nodes)

# # 检查移除后的网络结构
# print(f"移除前的节点数量: {len(G.nodes)}")
# print(f"移除后的节点数量: {len(G_filtered.nodes)}")

# # 打印移除后的节点度分布
# degrees_filtered = dict(G_filtered.degree())
# print("优化后的节点度分布:")
# for node, degree in degrees_filtered.items():
#     print(f"{node}: {degree}")

可视化

In [10]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(12, 8))
# pos = nx.spring_layout(G_filtered, k=0.5)
# nx.draw_networkx(
#     G_filtered, pos, with_labels=True, node_size=50, font_size=8,
#     edge_color=[d['weight'] for u, v, d in G_filtered.edges(data=True)], 
#     edge_cmap=plt.cm.Blues
# )
# plt.title("Optimized IPC Co-occurrence Network")
# plt.savefig(os.path.join(output_folder, "ipc_co_occurrence_network_optimized.png"), dpi=300)
# plt.close()

# print(f"优化后的共现网络图表已成功保存到 {output_folder}/ipc_co_occurrence_network_optimized.png")

更新共现矩阵

In [11]:
# import pandas as pd

# # 获取优化后的节点列表
# all_ipcs_filtered = sorted(G_filtered.nodes)

# # 初始化新的共现矩阵
# co_occurrence_matrix_filtered = pd.DataFrame(0.0, index=all_ipcs_filtered, columns=all_ipcs_filtered, dtype=float)

# # 填充共现矩阵
# for _, row in frequent_2_itemsets.iterrows():
#     itemset = list(row['itemsets'])
#     support = row['support']
#     if itemset[0] in all_ipcs_filtered and itemset[1] in all_ipcs_filtered:
#         co_occurrence_matrix_filtered.at[itemset[0], itemset[1]] = support
#         co_occurrence_matrix_filtered.at[itemset[1], itemset[0]] = support

# # 保存优化后的共现矩阵
# co_occurrence_matrix_filtered.to_csv(os.path.join(output_folder, "co_occurrence_matrix_filtered.csv"), encoding='utf-8')
# print(f"优化后的共现矩阵已成功保存到 {output_folder}/co_occurrence_matrix_filtered.csv")

重新社群划分

In [12]:
# from community import community_louvain

# try:
#     # 使用 Louvain 方法进行社群划分
#     partition_filtered = community_louvain.best_partition(G_filtered, resolution=1.0)
    
#     # 将社群信息保存到 CSV 文件
#     partition_df_filtered = pd.DataFrame(list(partition_filtered.items()), columns=['Node', 'Community'])
#     partition_df_filtered.to_csv(os.path.join(output_folder, "community_partition_filtered.csv"), index=False, encoding='utf-8')
    
#     print(f"优化后的社群划分结果已成功保存到 {output_folder}/community_partition_filtered.csv")
    
#     # 可视化优化后的社群划分
#     plt.figure(figsize=(12, 8))
#     pos = nx.spring_layout(G_filtered, k=0.5)
#     cmap = plt.colormaps['viridis'].resampled(max(partition_filtered.values()) + 1)
#     nx.draw_networkx_nodes(G_filtered, pos, partition_filtered.keys(), node_size=50, cmap=cmap, node_color=list(partition_filtered.values()))
#     nx.draw_networkx_edges(G_filtered, pos, alpha=0.5)
#     plt.title("Optimized IPC Community Detection")
    
#     # 保存优化后的社群划分图表
#     plt.savefig(os.path.join(output_folder, "ipc_community_detection_filtered.png"), dpi=300)
#     plt.close()
    
#     print(f"优化后的社群划分图表已成功保存到 {output_folder}/ipc_community_detection_filtered.png")
    
# except Exception as e:
#     print("优化后的社群分析时出错:", e)