In [1]:
import json
import re
import networkx as nx
import matplotlib.pyplot as plt

# 定义正则表达式模式
patterns = {
    'P17': r'^Who is (.+) married to\?$',
    'P19': r'^Where was (.+) born\?$',
    'P20': r'^Where did (.+) die\?$',
    'P26': r'^Who is (.+) married to\?$',
    'P36': r'^What is the capital of (.+)\?$',
    'P40': r'^Who is (.+)\'s child\?$',
    'P50': r'^Who is the author of (.+)\?$',
    'P69': r'^Where was (.+) educated\?$',
    'P106': r'^What kind of work does (.+) do\?$',
    'P112': r'^Who founded (.+)\?$',
    'P127': r'^Who owns (.+)\?$',
    'P131': r'^Where is (.+) located\?$',
    'P136': r'^What type of music does (.+) play\?$',
    'P159': r'^Where is the headquarter of (.+)\?$',
    'P170': r'^Who was (.+) created by\?$',
    'P175': r'^Who performed (.+)\?$',
    'P176': r'^Which company is (.+) produced by\?$',
    'P264': r'^What music label is (.+) represented by\?$',
    'P276': r'^Where is (.+) located\?$',
    'P407': r'^Which language was (.+) written in\?$',
    'P413': r'^What position does (.+) play\?$',
    'P495': r'^Which country was (.+) created in\?$',
    'P740': r'^Where was (.+) founded\?$',
    'P800': r'^What is (.+) famous for\?$',
}

def extract_replaceable_part(pattern_key, sentence):
    pattern = patterns[pattern_key]
    match = re.match(pattern, sentence)
    if match:
        return match.group(1)
    else:
        return None

# 创建图
G1 = nx.Graph()

# 读取 JSON Lines 文件并解析数据
with open('', 'r') as f:
    for line in f:
        data = json.loads(line.strip())
        question = data['messages'][1]['content']
        answer = data['messages'][2]['content']
    
        # 匹配问题并添加边
        for pattern_key in patterns.keys():
            replaceable_part = extract_replaceable_part(pattern_key, question)
            if replaceable_part:
                if not G1.has_node(replaceable_part):
                    G1.add_node(replaceable_part, label='origin')
                if not G1.has_node(answer):
                    G1.add_node(answer, label='origin')
                G1.add_edge(replaceable_part, answer)
                break

G2 = nx.Graph()

with open('','r') as f:
    for line in f:
        data = json.loads(line.strip())
        question = data['messages'][1]['content']
        answer = data['messages'][2]['content']
        # 匹配问题并添加边
        for pattern_key in patterns.keys():
            replaceable_part = extract_replaceable_part(pattern_key, question)
            if replaceable_part:
                if not G2.has_node(replaceable_part):
                    G2.add_node(replaceable_part, label='changed')
                if not G2.has_node(answer):
                    G2.add_node(answer, label='changed')
                G2.add_edge(replaceable_part, answer)
                break


G = nx.Graph()

# 合并图
G = nx.Graph()
for node, data in G1.nodes(data=True):
    G.add_node(node, label=data['label'])
for node, data in G2.nodes(data=True):
    if not G.has_node(node):
        G.add_node(node, label=data['label'])

# 添加边
for edge in G1.edges():
    G.add_edge(*edge)
for edge in G2.edges():
    G.add_edge(*edge)

    # 统计来自 G2 的节点数目
g2_nodes_count = sum(1 for node, data in G.nodes(data=True) if data['label'] in ['changed', 'both'])
g1_nodes_count = sum(1 for node, data in G.nodes(data=True) if data['label'] in ['origin', 'both'])
g_nodes_count=sum(1 for node, data in G.nodes(data=True))
g2_changed_count=sum(1 for node, data in G.nodes(data=True) if data['label'] in ['changed'])
# 统计来自 G2 的节点中与 G1 有连边的节点数目
g2_nodes_with_g1_edges_count = 0
for node, data in G.nodes(data=True):
    if data['label'] in ['changed', 'both']:
        for neighbor in G.neighbors(node):
            if G.nodes[neighbor]['label'] in ['origin', 'both']:
                g2_nodes_with_g1_edges_count += 1
                break

g2_nodes_changed_with_g1_edges_count = 0
for node, data in G.nodes(data=True):
    if data['label'] in ['changed']:
        for neighbor in G.neighbors(node):
            if G.nodes[neighbor]['label'] in ['origin','both']:
                g2_nodes_changed_with_g1_edges_count += 1
                break

print(f"G 中点的数目: {g_nodes_count}")
print(f"G 中来自 G1 的点的数目: {g1_nodes_count}")
print(f"G 中来自 G2 的点的数目: {g2_nodes_count}")
print(f"G 中来自 G2 的点中与 G1 有连边的点的数目: {g2_nodes_with_g1_edges_count}")
print(f"新增点的数目{g2_changed_count}")
print(f"新增点的数目与 G1 有连边的点的数目: {g2_nodes_changed_with_g1_edges_count}")
# 为不同标签的节点着色

color_map = []
for node in G:
    if G.nodes[node]['label'] == 'origin':
        color_map.append('red')
    elif G.nodes[node]['label'] == 'changed':
        color_map.append('blue')
    elif G.nodes[node]['label'] == 'both':
        color_map.append('purple')

# 为不同标签的边着色
'''
# 可视化图
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42)  # 使用 spring 布局
nx.draw(G, pos, with_labels=True, node_size=300, node_color=color_map, font_size=10, font_weight="bold")
plt.title("Merged Graph G with Different Edge Labels")
plt.savefig("merged_graph.png")  # 保存图形到本地文件
plt.close()  # 关闭图形
'''



G 中点的数目: 48126
G 中来自 G1 的点的数目: 43005
G 中来自 G2 的点的数目: 6221
G 中来自 G2 的点中与 G1 有连边的点的数目: 5068
新增点的数目5121
新增点的数目与 G1 有连边的点的数目: 3968


'\n# 可视化图\nplt.figure(figsize=(12, 8))\npos = nx.spring_layout(G, seed=42)  # 使用 spring 布局\nnx.draw(G, pos, with_labels=True, node_size=300, node_color=color_map, font_size=10, font_weight="bold")\nplt.title("Merged Graph G with Different Edge Labels")\nplt.savefig("merged_graph.png")  # 保存图形到本地文件\nplt.close()  # 关闭图形\n'

In [5]:
import json
import re
import networkx as nx
import matplotlib.pyplot as plt

# 定义正则表达式模式
patterns = {
    'P17': r'^Who is (.+) married to\?$',
    'P19': r'^Where was (.+) born\?$',
    'P20': r'^Where did (.+) die\?$',
    'P26': r'^Who is (.+) married to\?$',
    'P36': r'^What is the capital of (.+)\?$',
    'P40': r'^Who is (.+)\'s child\?$',
    'P50': r'^Who is the author of (.+)\?$',
    'P69': r'^Where was (.+) educated\?$',
    'P106': r'^What kind of work does (.+) do\?$',
    'P112': r'^Who founded (.+)\?$',
    'P127': r'^Who owns (.+)\?$',
    'P131': r'^Where is (.+) located\?$',
    'P136': r'^What type of music does (.+) play\?$',
    'P159': r'^Where is the headquarter of (.+)\?$',
    'P170': r'^Who was (.+) created by\?$',
    'P175': r'^Who performed (.+)\?$',
    'P176': r'^Which company is (.+) produced by\?$',
    'P264': r'^What music label is (.+) represented by\?$',
    'P276': r'^Where is (.+) located\?$',
    'P407': r'^Which language was (.+) written in\?$',
    'P413': r'^What position does (.+) play\?$',
    'P495': r'^Which country was (.+) created in\?$',
    'P740': r'^Where was (.+) founded\?$',
    'P800': r'^What is (.+) famous for\?$',
}

def extract_replaceable_part(pattern_key, sentence):
    pattern = patterns[pattern_key]
    match = re.match(pattern, sentence)
    if match:
        return match.group(1)
    else:
        return None

# 创建二部图
B = nx.Graph()

# 读取 JSON Lines 文件并解析数据
def process_file(filename, label, bi=0):
    with open(filename, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            question = data['messages'][1]['content']
            answer = data['messages'][2]['content']
        
            for pattern_key in patterns.keys():
                replaceable_part = extract_replaceable_part(pattern_key, question)
                if replaceable_part:
                    if not B.has_node(replaceable_part):
                        B.add_node(replaceable_part, bipartite=bi, label=label)  # 第一组节点
                    if not B.has_node(answer):
                        B.add_node(answer, bipartite=bi, label=label)  # 第二组节点
                    B.add_edge(replaceable_part, answer)
                    break

# 处理两个文件
process_file('', 'origin', 0)
process_file('', 'changed', 1)





# 为不同标签的节点着色和大小
color_map = []
fixed_node_size = 50  # 固定节点大小
for node in B:
    if B.nodes[node]['label'] == 'origin':
        color_map.append('red')
    elif B.nodes[node]['label'] == 'changed':
        color_map.append('blue')

# 使用 bipartite_layout 布局算法
top_nodes = {n for n, d in B.nodes(data=True) if d['bipartite'] == 1}
pos = nx.bipartite_layout(B, top_nodes)

# 绘制图形
plt.figure(figsize=(10, 10))  # 调整图形大小
nx.draw(
    B, pos, 
    with_labels=False, 
    node_size=fixed_node_size,  # 固定节点大小
    node_color=color_map, 
    width=0.2, 
    edge_color='lightgray',  # 边的颜色设置为浅灰色
    alpha=0.5  # 增加透明度
)

# 添加图例
from matplotlib.lines import Line2D

legend_elements = [
    Line2D([0], [0], marker='o', color='w', label='Origin', markersize=20, markerfacecolor='red'),
    Line2D([0], [0], marker='o', color='w', label='Changed', markersize=20, markerfacecolor='blue'),
]
plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(0.05, 1), fontsize=14)  # 将图例移动到左上角

plt.title("Bipartite Graph of Questions and Answers", fontsize=16)

# 调整边距，以确保图例和图形都在页面范围内
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)

# 保存为PDF格式的矢量图
plt.savefig("bipartite_graph.pdf", format='pdf', bbox_inches='tight')  # 使用bbox_inches='tight'来自动调整边距
plt.close()


度最大的十个节点:
节点: United States of America, 度: 4436
节点: midfielder, 度: 2195
节点: politician, 度: 1432
节点: actor, 度: 643
节点: English, 度: 523
节点: Los Angeles, 度: 423
节点: London, 度: 407
节点: New York City, 度: 267
节点: Harvard University, 度: 224
节点: jazz, 度: 223
