In [126]:
import json
import os
from tqdm import tqdm
from random import shuffle
import random
import numpy as np
import datasets
from datasets import load_dataset

In [127]:
system_instruction = """You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question."""
note_instruciton = """Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. """

In [128]:
system_instruction2 = """You are a good graph generator. You need to understand the task definition and generate a graph language to answer the question. """

# 一、Graph Structure Modeling

## 1.1 NLGraph Benchmark
- Github：[https://github.com/Arthur-Heng/NLGraph](https://github.com/Arthur-Heng/NLGraph)
- Paper：[Can Language Models Solve Graph Problems in Natural Language?](https://arxiv.org/pdf/2305.10037.pdf)
- 介绍：[《NLGraph》](https://www.yuque.com/wangjianing-jrsey/tlebck/kaqne9tsytv3guld?singleDoc# 《NLGraph》)

简要介绍：
- 提供了8个graph相关的任务，包括最短路径、连通性、max flow等graph方面的问题；
- 旨在将对应训练集和测试集按照Graph Language + Instruction的形式定义好数据。

In [129]:
data_dir = "NLGraph/"
task_list = [
    "connectivity",
    "cycle",
    "flow",
    "hamilton",
    "matching",
    "shortest_path",
    "topology"
]
def load_all_data():
    all_train_data, all_test_data = dict(), dict()
    for task_name in task_list:
        with open(os.path.join(data_dir, task_name, "train.json"), "r", encoding="utf-8") as fr:
            train_data = fr.readlines()
        with open(os.path.join(data_dir, task_name, "test.json"), "r", encoding="utf-8") as fr:
            test_data = fr.readlines()
        all_train_data[task_name] = train_data[0]
        all_test_data[task_name] = test_data[0]

def load_task_data(task_name):
    with open(os.path.join(data_dir, task_name, "train.json"), "r", encoding="utf-8") as fr:
        train_data = fr.readlines()
    with open(os.path.join(data_dir, task_name, "test.json"), "r", encoding="utf-8") as fr:
        test_data = fr.readlines()
    return json.loads(train_data[0]), json.loads(test_data[0])

### (1) Connectivity

In [130]:
def connectivity_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["({} <-> {})".format(triple[0], triple[1]) for triple in graph]) + "];"
#     triple_list = []

    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))

    

In [131]:
"""
构造preference数据（一）：
pair数据：对样本的answer随机修改为错误的结果，作为negative，原始结果为positive
"""
def connectivity_instruction_unfaithful_answer(prompt: str, answer: str, do_print: bool = False):
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "connectivity-detection"
    prompts = prompt.split("\n")
    instruction = [
        "Task definition: determine if there is a path between two nodes in the graph.",
        "Task definition: graph connectivity means that existing a path between two nodes, given you two nodes and you should detect the connectivity.",
        "Task definition: detect whether there exists a path between two nodes in the graph. This task is a binary classification and the answer should be 'The answer is yes' or 'The answer is no'."
    ]
    edge_list = [(triple[1:-1].split(",")[0], triple[1:-1].split(",")[1]) for triple in prompts[1].replace("Graph: ", "").split(" ")]
    node_list = [node for node in sorted(list(set([int(node) for triple in edge_list for node in triple])))]
    gcl = connectivity_graph_language(task_name, node_list, edge_list)
    query = prompts[2].strip()
    final_instruction = ["{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, i, query) for i in instruction]
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


def connectivity_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-connectivity-detection"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction = connectivity_instruction_unfaithful_answer(question, answer)
        if data_kind == "train":
            for instruction_prompt in instruction["instruction"]:
                final_data.append({
                    "task_name": task_name,
                    "idx": example_id,
                    "instruction": instruction_prompt,
                    "graph_language": instruction["graph_language"],
                    "graph": instruction["graph"],
                    "answer_positive": [answer],
                    "answer_negative": ["The answer is yes." if "no" in answer else "The answer is no."],
                    "hallucination_type": "unfaithful_answer",
                    "answer_with_cot": [],
                    "difficulty": diff,
                    "from": "NLGraph",
                })
                example_id += 1
        else:
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"][0],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [answer], # 原始的graph没有修改，因此positive为原始的answer
                "answer_negative": ["The answer is yes." if "no" in answer else "The answer is no."],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data


connectivity_train_data, connectivity_test_data = load_task_data("connectivity")
connectivity_instruction_data_unfaithful_answer = {
    "train": connectivity_dataset_unfaithful_answer(connectivity_train_data),
    "test": connectivity_dataset_unfaithful_answer(connectivity_test_data, "test"),
}

100%|██████████| 1861/1861 [00:00<00:00, 4345.80it/s]


total number: 5583


100%|██████████| 371/371 [00:00<00:00, 12740.82it/s]

total number: 371





In [132]:
print(connectivity_instruction_data_unfaithful_answer["test"][121]["instruction"])
print(connectivity_instruction_data_unfaithful_answer["test"][121]["answer_positive"])
print(connectivity_instruction_data_unfaithful_answer["test"][121]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="connectivity-detection"] {
    node_list = [0, 1, 2, 3, 4, 5, 6];
    edge_list = [(0 <-> 4), (1 <-> 3), (1 <-> 2), (1 <-> 6), (2 <-> 5), (2 <-> 6), (3 <-> 5), (5 <-> 6)];
}
```
Task definition: determine if there is a path between two nodes in the graph.
Q: Is there a path between node 4 and node 5?
A:
['The answer is no.']
['The answer is yes.']


In [133]:
"""
构造preference数据（二）：
对样本中，删除graph中，prompt中指定的节点编号。
pair数据：原始的标签则直接作为negative，上述输出信息作为positive。
"""
def connectivity_instruction_missing_graph(prompt: str, answer: str, do_print: bool = False):
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "connectivity-detection"
    prompts = prompt.split("\n")
    instruction = [
        "Task definition: determine if there is a path between two nodes in the graph.",
        "Task definition: graph connectivity means that existing a path between two nodes, given you two nodes and you should detect the connectivity.",
        "Task definition: detect whether there exists a path between two nodes in the graph. This task is a binary classification and the answer should be 'The answer is yes' or 'The answer is no'."
    ]
    edge_list = [(triple[1:-1].split(",")[0], triple[1:-1].split(",")[1]) for triple in prompts[1].replace("Graph: ", "").split(" ")]
    node_list = [node for node in sorted(list(set([int(node) for triple in edge_list for node in triple])))]
    query = prompts[2].strip()
    # 对指定的节点，将其从graph中删除，构造出不存在节点的错误graph
    target_nodes = query.replace("Q: Is there a path between node ", "").replace("?", "").split(" and node ")
    random_select_node = random.randint(0, 1)
    random_select_node = target_nodes[random_select_node]
    # print(random_select_node)
    edge_list = [i for i in edge_list if str(i[0]) != random_select_node and str(i[1]) != random_select_node]
    node_list = [i for i in node_list if str(i) != random_select_node]
    
    gcl = connectivity_graph_language(task_name, node_list, edge_list)
    
    final_instruction = ["{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, i, query) for i in instruction]
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_node


def connectivity_dataset_missing_graph(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-connectivity-detection"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction, target_node = connectivity_instruction_missing_graph(question, answer)
        if data_kind == "train":
            for instruction_prompt in instruction["instruction"]:
                final_data.append({
                    "task_name": task_name,
                    "idx": example_id,
                    "instruction": instruction_prompt,
                    "graph_language": instruction["graph_language"],
                    "graph": instruction["graph"],
                    "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                    "answer_negative": [answer], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                    "hallucination_type": "missing_graph_information",
                    "answer_with_cot": [],
                    "difficulty": diff,
                    "from": "NLGraph",
                })
                example_id += 1
        else:
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"][0],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                "answer_negative": [answer], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                "hallucination_type": "missing_graph_information",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data


connectivity_train_data, connectivity_test_data = load_task_data("connectivity")
connectivity_instruction_data_missing_graph = {
    "train": connectivity_dataset_missing_graph(connectivity_train_data),
    "test": connectivity_dataset_missing_graph(connectivity_test_data, "test"),
}

100%|██████████| 1861/1861 [00:00<00:00, 9155.02it/s]


total number: 5583


100%|██████████| 371/371 [00:00<00:00, 9866.89it/s]

total number: 371





In [134]:
print(connectivity_instruction_data_missing_graph["test"][111]["instruction"])
print(connectivity_instruction_data_missing_graph["test"][111]["answer_positive"])
print(connectivity_instruction_data_missing_graph["test"][111]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="connectivity-detection"] {
    node_list = [0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26];
    edge_list = [(0 <-> 26), (0 <-> 15), (0 <-> 25), (0 <-> 14), (0 <-> 7), (0 <-> 2), (0 <-> 9), (0 <-> 5), (1 <-> 26), (1 <-> 15), (1 <-> 25), (1 <-> 7), (1 <-> 6), (1 <-> 2), (1 <-> 20), (1 <-> 22), (2 <-> 25), (2 <-> 19), (2 <-> 14), (3 <-> 12), (3 <-> 16), (5 <-> 26), (5 <-> 20), (5 <-> 8), (5 <-> 9), (5 <-> 22), (5 <-> 13), (6 <-> 19), (6 <-> 17), (6 <-> 7), (6 <-> 20), (6 <-> 22), (7 <-> 19), (7 <-> 17), (7 <-> 9), (8 <-> 19), (9 <-> 26), (9 <-> 14), (9 <-> 24), (11 <-> 12), (12 <->

In [137]:
connectivity_preference_data = {
    "train": connectivity_instruction_data_unfaithful_answer["train"] + connectivity_instruction_data_missing_graph["train"],
    "test": connectivity_instruction_data_unfaithful_answer["test"] + connectivity_instruction_data_missing_graph["test"],
}

### (2) Cycle

In [84]:
def cycle_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["({} <-> {})".format(triple[0], triple[1]) for triple in graph]) + "];"
#     triple_list = []

    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [139]:
def cycle_instruction_unfaithful_answer(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    """ 
    prompt:
    In an undirected graph, (i,j) means that node i and node j are connected with an undirected edge.
    The nodes are numbered from 0 to 21, and the edges are: (4,6) (8,0) (3,10) (14,17) (0,11) (1,4) (2,21) (16,15) (12,8) (16,19) (2,19) (10,17) (5,17) (7,8) (2,12) (15,18) (20,13) (3,11) (10,9) (21,20) (6,3)
    Q: Is there a cycle in this graph?
    A:
    数据结构：
    {
        'question': 'In an undirected graph, (i,j) means that node i and node j are connected with an undirected edge.\nThe nodes are numbered from 0 to 21, and the edges are: (4,6) (8,0) (3,10) (14,17) (0,11) (1,4) (2,21) (16,15) (12,8) (16,19) (2,19) (10,17) (5,17) (7,8) (2,12) (15,18) (20,13) (3,11) (10,9) (21,20) (6,3)\nQ: Is there a cycle in this graph?\nA:', 
        'answer': 'No, there is no cycle in this graph.', 
        'difficulty': 'medium'}
    """
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "cycle-detection"
    prompts = prompt.split("\n")
    instruction = [
        "Task definition: determine if there is a cycle in this graph.",
        "Task definition: cycle in the undirected graph means that existing two different paths between two nodes. You must detect whether there exists a cycle in the graph.",
        "Task definition: determine if there is a cycle in this graph. This task is a binary classification and the answer should be 'Yes' or 'No'."
    ]
    edge_list = [(triple[1:-1].split(",")[0], triple[1:-1].split(",")[1]) for triple in prompts[1].split(", and the edges are: ")[1].split(" ")]
    node_start = int(prompts[1].split(", and the edges are: ")[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[1].split(", and the edges are: ")[0].split(" to ")[1])
    node_list = list(range(node_start, node_end + 1))
    gcl = cycle_graph_language(task_name, node_list, edge_list)
    query = prompts[2].strip()
    final_instruction = ["{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, i, query) for i in instruction]
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def cycle_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-cycle-detection"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction = cycle_instruction_unfaithful_answer(question, answer)
        if data_kind == "train":
            for instruction_prompt in instruction["instruction"]:
                final_data.append({
                    "task_name": task_name,
                    "idx": example_id,
                    "instruction": instruction_prompt,
                    "graph_language": instruction["graph_language"],
                    "graph": instruction["graph"],
                    "answer_positive": [answer], # 原始的graph没有修改，因此positive为原始的answer
                    "answer_negative": ["Yes, there is a cycle in this graph." if "No" in answer else "No, there is no cycle in this graph."],
                    "hallucination_type": "unfaithful_answer",
                    "answer_with_cot": [],
                    "difficulty": diff,
                    "from": "NLGraph",
                })
                example_id += 1
        else:
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"][0],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [answer], # 原始的graph没有修改，因此positive为原始的answer
                "answer_negative": ["Yes, there is a cycle in this graph." if "No" in answer else "No, there is no cycle in this graph."],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

cycle_train_data, cycle_test_data = load_task_data("cycle")
cycle_instruction_data_unfaithful_answer = {
    "train": cycle_dataset_unfaithful_answer(cycle_train_data),
    "test": cycle_dataset_unfaithful_answer(cycle_test_data, "test"),
}

100%|██████████| 959/959 [00:00<00:00, 18079.55it/s]


total number: 2877


100%|██████████| 191/191 [00:00<00:00, 39027.24it/s]

total number: 191





In [140]:
print(cycle_instruction_data_unfaithful_answer["test"][146]["instruction"])
print(cycle_instruction_data_unfaithful_answer["test"][111]["answer_positive"])
print(cycle_instruction_data_unfaithful_answer["test"][111]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="cycle-detection"] {
    node_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28];
    edge_list = [(19 <-> 23), (2 <-> 15), (11 <-> 1), (9 <-> 24), (28 <-> 0), (6 <-> 13), (0 <-> 7), (18 <-> 8), (9 <-> 19), (0 <-> 14), (7 <-> 20), (10 <-> 26), (15 <-> 13), (25 <-> 9), (21 <-> 16), (22 <-> 17), (21 <-> 10), (27 <-> 13), (4 <-> 21), (20 <-> 22), (9 <-> 5), (3 <-> 15), (26 <-> 27), (8 <-> 7), (14 <-> 12), (0 <-> 9), (4 <-> 23), (7 <-> 11)];
}
```
Task definition: determine if there is a cycle in this graph.
Q: Is there a cycle in this graph?
A:
['Yes, there is a cy

In [141]:
cycle_preference_data = {
    "train": cycle_instruction_data_unfaithful_answer["train"],
    "test": cycle_instruction_data_unfaithful_answer["test"],
}

### (3) Topological Sort

### (4) Shortest Path

In [142]:
def shpath_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["({} <-> {})[weight={}]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
#     triple_list = []

    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [143]:
def shpath_instruction_unfaithful_answer(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    """ 
    prompt:
    In an undirected graph, the nodes are numbered from 0 to 4, and the edges are:
    an edge between node 0 and node 2 with weight 3,
    an edge between node 0 and node 1 with weight 1,
    an edge between node 0 and node 4 with weight 1,
    an edge between node 1 and node 2 with weight 3,
    an edge between node 1 and node 3 with weight 2,
    an edge between node 1 and node 4 with weight 4,
    an edge between node 2 and node 3 with weight 2,
    an edge between node 2 and node 4 with weight 4,
    an edge between node 3 and node 4 with weight 2.
    Q: Give the shortest path from node 0 to node 3.
    A:
    数据结构：
    {
        'question': 'In an undirected graph, the nodes are numbered from 0 to 4, and the edges are:\nan edge between node 0 and node 2 with weight 3,\nan edge between node 0 and node 1 with weight 1,\nan edge between node 0 and node 4 with weight 1,\nan edge between node 1 and node 2 with weight 3,\nan edge between node 1 and node 3 with weight 2,\nan edge between node 1 and node 4 with weight 4,\nan edge between node 2 and node 3 with weight 2,\nan edge between node 2 and node 4 with weight 4,\nan edge between node 3 and node 4 with weight 2.\nQ: Give the shortest path from node 0 to node 3.\nA:', 
        'answer': 'The shortest path from node 0 to node 3 is 0,1,3 with a total weight of 3', 
        'difficulty': 'easy'}
    """
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "shortest-path"
    prompts = prompt.split("\n")
    instruction = "Task definition: find a shortest path between two nodes in the graph, and calculate the sum of the weights in the shortest path."
    edge_list = [
        (prompts[idx].split(" and node ")[0].replace("an edge between node ", ""), 
         prompts[idx].split(" and node ")[1].split(" with weight ")[1][:-1],
         prompts[idx].split(" and node ")[1].split(" with weight ")[0],
        ) for idx in range(1, len(prompts) - 2)]
    node_start = int(prompts[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[0].split(" to ")[1].split(", ")[0])
    node_list = list(range(node_start, node_end + 1))
    gcl = shpath_graph_language(task_name, node_list, edge_list)
    query = prompts[-2].strip()
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def shpath_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-shortest-path"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    K = 5 if data_kind == "train" else 1
    # 获得所有的结果，用于负采样
    all_answer = set()
    for ei, idx in enumerate(tqdm(example_idx_list)):
        answer = data[idx]["answer"]
        all_answer.add(answer)
    all_answer = list(all_answer)
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction = shpath_instruction_unfaithful_answer(question, answer)
        for k in range(K):
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [answer],
                "answer_negative": [all_answer[random.randint(0, len(all_answer) - 1)], answer.split(" of ")[0] + " of {}".format(random.randint(0, 20))],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

shpath_train_data, shpath_test_data = load_task_data("shortest_path")
shpath_instruction_data_unfaithful_answer = {
    "train": shpath_dataset_unfaithful_answer(shpath_train_data),
    "test": shpath_dataset_unfaithful_answer(shpath_test_data, "test"),
}

100%|██████████| 316/316 [00:00<00:00, 1387853.47it/s]
100%|██████████| 316/316 [00:00<00:00, 16681.56it/s]


total number: 1580


100%|██████████| 64/64 [00:00<00:00, 791845.00it/s]
100%|██████████| 64/64 [00:00<00:00, 21336.58it/s]

total number: 64





In [144]:
print(shpath_instruction_data_unfaithful_answer["test"][13]["instruction"])
print(shpath_instruction_data_unfaithful_answer["test"][13]["answer_positive"])
print(shpath_instruction_data_unfaithful_answer["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="shortest-path"] {
    node_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    edge_list = [(0 <-> 10)[weight=4], (0 <-> 12)[weight=4], (0 <-> 4)[weight=6], (1 <-> 5)[weight=3], (1 <-> 6)[weight=5], (1 <-> 13)[weight=8], (2 <-> 10)[weight=10], (2 <-> 7)[weight=3], (2 <-> 6)[weight=3], (3 <-> 9)[weight=3], (3 <-> 7)[weight=2], (4 <-> 12)[weight=1], (4 <-> 9)[weight=9], (4 <-> 7)[weight=10], (5 <-> 9)[weight=10], (5 <-> 7)[weight=10], (5 <-> 8)[weight=4], (6 <-> 13)[weight=9], (7 <-> 10)[weight=8], (7 <-> 8)[weight=5], (9 <-> 10)[weight=7], (9 <-> 11)[weight=1]];
}
```
Task definition: find a shortest path between two

In [145]:
def shpath_instruction_missing_graph(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    """ 
    prompt:
    In an undirected graph, the nodes are numbered from 0 to 4, and the edges are:
    an edge between node 0 and node 2 with weight 3,
    an edge between node 0 and node 1 with weight 1,
    an edge between node 0 and node 4 with weight 1,
    an edge between node 1 and node 2 with weight 3,
    an edge between node 1 and node 3 with weight 2,
    an edge between node 1 and node 4 with weight 4,
    an edge between node 2 and node 3 with weight 2,
    an edge between node 2 and node 4 with weight 4,
    an edge between node 3 and node 4 with weight 2.
    Q: Give the shortest path from node 0 to node 3.
    A:
    数据结构：
    {
        'question': 'In an undirected graph, the nodes are numbered from 0 to 4, and the edges are:\nan edge between node 0 and node 2 with weight 3,\nan edge between node 0 and node 1 with weight 1,\nan edge between node 0 and node 4 with weight 1,\nan edge between node 1 and node 2 with weight 3,\nan edge between node 1 and node 3 with weight 2,\nan edge between node 1 and node 4 with weight 4,\nan edge between node 2 and node 3 with weight 2,\nan edge between node 2 and node 4 with weight 4,\nan edge between node 3 and node 4 with weight 2.\nQ: Give the shortest path from node 0 to node 3.\nA:', 
        'answer': 'The shortest path from node 0 to node 3 is 0,1,3 with a total weight of 3', 
        'difficulty': 'easy'}
    """
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "shortest-path"
    prompts = prompt.split("\n")
    instruction = "Task definition: find a shortest path between two nodes in the graph, and calculate the sum of the weights in the shortest path."
    edge_list = [
        (prompts[idx].split(" and node ")[0].replace("an edge between node ", ""), 
         prompts[idx].split(" and node ")[1].split(" with weight ")[1][:-1],
         prompts[idx].split(" and node ")[1].split(" with weight ")[0],
        ) for idx in range(1, len(prompts) - 2)]
    node_start = int(prompts[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[0].split(" to ")[1].split(", ")[0])
    node_list = list(range(node_start, node_end + 1))

    query = prompts[-2].strip()
    target_nodes = query.replace("Q: Give the shortest path from node ", "").replace(".", "").split(" to node ")
    random_select_node = random.randint(0, 1)
    random_select_node = target_nodes[random_select_node]
    # print(random_select_node)
    edge_list = [i for i in edge_list if str(i[0]) != random_select_node and str(i[1]) != random_select_node]
    node_list = [i for i in node_list if str(i) != random_select_node]
    
    gcl = shpath_graph_language(task_name, node_list, edge_list)
    
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_node

def shpath_dataset_missing_graph(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-shortest-path"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    K = 5 if data_kind == "train" else 1
    # 获得所有的结果，用于负采样
    all_answer = set()
    for ei, idx in enumerate(tqdm(example_idx_list)):
        answer = data[idx]["answer"]
        all_answer.add(answer)
    all_answer = list(all_answer)
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction, target_node = shpath_instruction_missing_graph(question, answer)
        for k in range(K):
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                "answer_negative": [answer], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                "hallucination_type": "missing_graph_information",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

shpath_train_data, shpath_test_data = load_task_data("shortest_path")
shpath_instruction_data_missing_graph = {
    "train": shpath_dataset_missing_graph(shpath_train_data),
    "test": shpath_dataset_missing_graph(shpath_test_data, "test"),
}

100%|██████████| 316/316 [00:00<00:00, 766127.20it/s]
100%|██████████| 316/316 [00:00<00:00, 8474.64it/s]


total number: 1580


100%|██████████| 64/64 [00:00<00:00, 445906.07it/s]
100%|██████████| 64/64 [00:00<00:00, 8068.39it/s]

total number: 64





In [146]:
print(shpath_instruction_data_missing_graph["test"][13]["instruction"])
print(shpath_instruction_data_missing_graph["test"][13]["answer_positive"])
print(shpath_instruction_data_missing_graph["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="shortest-path"] {
    node_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
    edge_list = [(1 <-> 5)[weight=3], (1 <-> 6)[weight=5], (1 <-> 13)[weight=8], (2 <-> 10)[weight=10], (2 <-> 7)[weight=3], (2 <-> 6)[weight=3], (3 <-> 9)[weight=3], (3 <-> 7)[weight=2], (4 <-> 12)[weight=1], (4 <-> 9)[weight=9], (4 <-> 7)[weight=10], (5 <-> 9)[weight=10], (5 <-> 7)[weight=10], (5 <-> 8)[weight=4], (6 <-> 13)[weight=9], (7 <-> 10)[weight=8], (7 <-> 8)[weight=5], (9 <-> 10)[weight=7], (9 <-> 11)[weight=1]];
}
```
Task definition: find a shortest path between two nodes in the graph, and calculate the sum of the weights in the sho

In [147]:
shpath_preference_data = {
    "train": shpath_instruction_data_unfaithful_answer["train"] + shpath_instruction_data_missing_graph["train"],
    "test": shpath_instruction_data_unfaithful_answer["test"] + shpath_instruction_data_missing_graph["test"],
}

### (5) Maximum Flow

In [92]:
def flow_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["({} -> {})[capacity={}]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
#     triple_list = []

    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [148]:
def flow_instruction_unfaithful_answer(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "maximum-flow"
    prompts = prompt.split("\n")
    instruction = "Task definition: calculate the maximum flow between two nodes in the graph."
    edge_list = [
        (prompts[idx].split(" to node ")[0].replace("an edge from node ", ""), 
         prompts[idx].split(" to node ")[1].split(" with capacity ")[1][:-1],
         prompts[idx].split(" to node ")[1].split(" with capacity ")[0],
        ) for idx in range(1, len(prompts) - 2)]
    node_start = int(prompts[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[0].split(" to ")[1].split(", ")[0])
    node_list = list(range(node_start, node_end + 1))
    gcl = flow_graph_language(task_name, node_list, edge_list)
    query = prompts[-2].strip()
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def flow_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-maximum-flow"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    K = 5 if data_kind == "train" else 1
    # 获得所有的结果，用于负采样
    all_answer = set()
    for ei, idx in enumerate(tqdm(example_idx_list)):
        answer = data[idx]["answer"]
        all_answer.add(answer)
    all_answer = list(all_answer)
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction = flow_instruction_unfaithful_answer(question, answer)
        for k in range(K):
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [answer],
                "answer_negative": [all_answer[random.randint(0, len(all_answer) - 1)], answer[:-1].split(" of ")[0] + " is {}.".format(random.randint(0, 20))],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

flow_train_data, flow_test_data = load_task_data("flow")
flow_instruction_data_unfaithful_answer = {
    "train": flow_dataset_unfaithful_answer(flow_train_data),
    "test": flow_dataset_unfaithful_answer(flow_test_data, "test"),
}

100%|██████████| 292/292 [00:00<00:00, 953102.54it/s]
100%|██████████| 292/292 [00:00<00:00, 6320.90it/s]


total number: 1460


100%|██████████| 58/58 [00:00<00:00, 641872.38it/s]
100%|██████████| 58/58 [00:00<00:00, 9312.11it/s]

total number: 58





In [150]:
print(flow_instruction_data_unfaithful_answer["test"][13]["instruction"])
print(flow_instruction_data_unfaithful_answer["test"][13]["answer_positive"])
print(flow_instruction_data_unfaithful_answer["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="maximum-flow"] {
    node_list = [0, 1, 2, 3, 4, 5];
    edge_list = [(0 -> 5)[capacity=9], (1 -> 2)[capacity=9], (2 -> 1)[capacity=6], (2 -> 4)[capacity=6], (2 -> 5)[capacity=8], (4 -> 3)[capacity=7], (4 -> 5)[capacity=3], (4 -> 2)[capacity=6], (5 -> 1)[capacity=4], (5 -> 4)[capacity=1], (5 -> 2)[capacity=4]];
}
```
Task definition: calculate the maximum flow between two nodes in the graph.
Q: What is the maximum flow from node 5 to node 3?
A:
['The maximum flow from node 5 to node 3 is 7.']
['The maximum flow from node 0 to node 7 is 7.', 'The maximum flow from node 5 to node 3 is 7 is 3.']


In [151]:
def flow_instruction_missing_graph(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "maximum-flow"
    prompts = prompt.split("\n")
    instruction = "Task definition: calculate the maximum flow between two nodes in the graph."
    edge_list = [
        (prompts[idx].split(" to node ")[0].replace("an edge from node ", ""), 
         prompts[idx].split(" to node ")[1].split(" with capacity ")[1][:-1],
         prompts[idx].split(" to node ")[1].split(" with capacity ")[0],
        ) for idx in range(1, len(prompts) - 2)]
    node_start = int(prompts[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[0].split(" to ")[1].split(", ")[0])
    node_list = list(range(node_start, node_end + 1))
    
    query = prompts[-2].strip()
    target_nodes = query.replace("Q: What is the maximum flow from node ", "").replace("?", "").split(" to node ")
    random_select_node = random.randint(0, 1)
    random_select_node = target_nodes[random_select_node]
    # print(random_select_node)
    edge_list = [i for i in edge_list if str(i[0]) != random_select_node and str(i[2]) != random_select_node]
    node_list = [i for i in node_list if str(i) != random_select_node]

    
    gcl = flow_graph_language(task_name, node_list, edge_list)
    
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_node

def flow_dataset_missing_graph(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-maximum-flow"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    K = 5 if data_kind == "train" else 1
    # 获得所有的结果，用于负采样
    all_answer = set()
    for ei, idx in enumerate(tqdm(example_idx_list)):
        answer = data[idx]["answer"]
        all_answer.add(answer)
    all_answer = list(all_answer)
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction, target_node = flow_instruction_missing_graph(question, answer)
        for k in range(K):
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                "answer_negative": [answer], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                "hallucination_type": "missing_graph_information",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

flow_train_data, flow_test_data = load_task_data("flow")
flow_instruction_data_missing_graph = {
    "train": flow_dataset_missing_graph(flow_train_data),
    "test": flow_dataset_missing_graph(flow_test_data, "test"),
}

100%|██████████| 292/292 [00:00<00:00, 1249731.40it/s]
100%|██████████| 292/292 [00:00<00:00, 10990.40it/s]


total number: 1460


100%|██████████| 58/58 [00:00<00:00, 476999.28it/s]
100%|██████████| 58/58 [00:00<00:00, 12436.46it/s]

total number: 58





In [152]:
print(flow_instruction_data_missing_graph["test"][13]["instruction"])
print(flow_instruction_data_missing_graph["test"][13]["answer_positive"])
print(flow_instruction_data_missing_graph["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="maximum-flow"] {
    node_list = [0, 1, 2, 4, 5];
    edge_list = [(0 -> 5)[capacity=9], (1 -> 2)[capacity=9], (2 -> 1)[capacity=6], (2 -> 4)[capacity=6], (2 -> 5)[capacity=8], (4 -> 5)[capacity=3], (4 -> 2)[capacity=6], (5 -> 1)[capacity=4], (5 -> 4)[capacity=1], (5 -> 2)[capacity=4]];
}
```
Task definition: calculate the maximum flow between two nodes in the graph.
Q: What is the maximum flow from node 5 to node 3?
A:
['Sorry, the graph does not exist node 3, so the question is unanswerable, you had better provide a correct graph.']
['The maximum flow from node 5 to node 3 is 7.']


In [153]:
def flow_instruction_conflict_graph(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "maximum-flow"
    prompts = prompt.split("\n")
    instruction = "Task definition: calculate the maximum flow between two nodes in the graph."
    edge_list = [
        (prompts[idx].split(" to node ")[0].replace("an edge from node ", ""), 
         prompts[idx].split(" to node ")[1].split(" with capacity ")[1][:-1],
         prompts[idx].split(" to node ")[1].split(" with capacity ")[0],
        ) for idx in range(1, len(prompts) - 2)]
    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        weight = random_select_edge[1]
        while True:
            rand_weight = random.randint(0, 10)
            if rand_weight != weight:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_weight, random_select_edge[2]), (random_select_edge[0], weight, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_weight, random_select_edge[2]))
    shuffle(edge_list)
    
    node_start = int(prompts[0].split(" to ")[0].split(" from ")[1])
    node_end = int(prompts[0].split(" to ")[1].split(", ")[0])
    node_list = list(range(node_start, node_end + 1))
    gcl = flow_graph_language(task_name, node_list, edge_list)
    query = prompts[-2].strip()
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def flow_dataset_conflict_graph(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-maximum-flow"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    K = 5 if data_kind == "train" else 1
    # 获得所有的结果，用于负采样
    all_answer = set()
    for ei, idx in enumerate(tqdm(example_idx_list)):
        answer = data[idx]["answer"]
        all_answer.add(answer)
    all_answer = list(all_answer)
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction, conflict_edge_list = flow_instruction_conflict_graph(question, answer)
        
        positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
        for conflict_edge in conflict_edge_list:
            positive_answer += "({} -> {})[capacity={}] is conflict with ({} -> {})[capacity={}]\n".format(
                conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
            )
        positive_answer += "so the question is unanswerable, you had better provide a correct graph."
        
        for k in range(K):
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [positive_answer],
                "answer_negative": [answer], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                "hallucination_type": "conflict_graph",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

flow_train_data, flow_test_data = load_task_data("flow")
flow_instruction_data_conflict_graph = {
    "train": flow_dataset_conflict_graph(flow_train_data),
    "test": flow_dataset_conflict_graph(flow_test_data, "test"),
}

100%|██████████| 292/292 [00:00<00:00, 959073.43it/s]
100%|██████████| 292/292 [00:00<00:00, 5629.78it/s]


total number: 1460


100%|██████████| 58/58 [00:00<00:00, 1121058.21it/s]
100%|██████████| 58/58 [00:00<00:00, 12353.10it/s]

total number: 58





In [154]:
print(flow_instruction_data_conflict_graph["test"][13]["instruction"])
print(flow_instruction_data_conflict_graph["test"][13]["answer_positive"])
print(flow_instruction_data_conflict_graph["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="maximum-flow"] {
    node_list = [0, 1, 2, 3, 4, 5];
    edge_list = [(4 -> 3)[capacity=7], (1 -> 2)[capacity=9], (4 -> 5)[capacity=3], (2 -> 4)[capacity=8], (4 -> 2)[capacity=4], (2 -> 1)[capacity=6], (5 -> 1)[capacity=4], (2 -> 5)[capacity=8], (5 -> 2)[capacity=4], (2 -> 4)[capacity=6], (4 -> 2)[capacity=6], (5 -> 4)[capacity=1], (0 -> 5)[capacity=9]];
}
```
Task definition: calculate the maximum flow between two nodes in the graph.
Q: What is the maximum flow from node 5 to node 3?
A:
['Sorry, the graph contains some conflict edges in the follow:\n(2 -> 4)[capacity=8] is conflict with (2 -> 4)[capacity=6]\n(4 -> 2)[capa

In [155]:
flow_preference_data = {
    "train": flow_instruction_data_unfaithful_answer["train"] + flow_instruction_data_missing_graph["train"] + flow_instruction_data_conflict_graph["train"],
    "test": flow_instruction_data_unfaithful_answer["test"] + flow_instruction_data_missing_graph["test"] + flow_instruction_data_conflict_graph["test"],
}

### (6) Bipartite Graph Matching

### (7) Hamilton Path

## 1.2 Graph Degree

In [101]:
def degree_graph_language(task_name: str, node_list: list, graph: list, target_node: str):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>
    target_node = <target_node>;\n}```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["({} <-> {})".format(triple[0], triple[1]) for triple in graph]) + "];"
#     triple_list = []

    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_node>", str(target_node))


In [156]:
def degree_instruction_unfaithful_answer(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    # 输入一个connectivity数据集，将其转换为degree问题
    """ 
    prompt:
    Determine if there is a path between two nodes in the graph. Note that (i,j) means that node i and node j are connected with an undirected edge.
    Graph: (0,12) (0,13) (0,2) (0,14) (0,23) (0,8) (0,1) (0,25) (0,27)
    Q: Is there a path between node 14 and node 2?
    A:
    数据结构：
    {
        'question': 'Determine if there is a path between two nodes in the graph. Note that (i,j) means that node i and node j are connected with an undirected edge.\nGraph: (0,1) (0,6) (0,21) (0,18) (1,24) (1,3) (3,24) (3,14) (4,6) (4,5) (4,23) (6,14) (6,21) (6,23) (7,11) (8,19) (8,13) (9,15) (9,19) (10,11) (10,12) (10,16) (10,17) (11,12) (11,16) (12,17) (13,15) (13,22) (14,23) (15,22) (16,17) (18,24) (18,21)\nQ: Is there a path between node 0 and node 19?\nA:',
        'answer': 'The answer is no.',
        'difficulty': 'medium'
    }
    """
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "degree-computing"
    prompts = prompt.split("\n")
    instruction = [
        "Task definition: calculate the degree of the target node in the graph.",
        "Task definition: the degree of the node in an undirected graph means the number of edges linked to the node. So the answer is an integer value of the count of the linked edge."
    ]
    edge_list = [(triple[1:-1].split(",")[0], triple[1:-1].split(",")[1]) for triple in prompts[1].replace("Graph: ", "").split(" ")]
    node_list = [node for node in sorted(list(set([int(node) for triple in edge_list for node in triple])))]
    target_node = node_list[random.randint(0, len(node_list) - 1)]
    gcl = degree_graph_language(task_name, node_list, edge_list, target_node)
    query = "Q: What's the degree of the target node?"
    
    # 构建邻接表
    adj_list = dict()
    for (s, e) in edge_list:
        if s not in adj_list.keys():
            adj_list[s] = set()
        if e not in adj_list.keys():
            adj_list[e] = set()
        adj_list[s].add(e)
        adj_list[e].add(s)
    # 计算每个节点的degree
    node_degree = {int(node): len(adjs) for node, adjs in adj_list.items()}
    # 随机挑选一个节点作为目标节点，用于构建query
    
    answer = "The degree of the target node {} is {}".format(target_node, node_degree[target_node])
    
    final_instruction = ["{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, i, query) for i in instruction]
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "answer": answer,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "target_node": target_node,
        }
    }

def degree_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-degree-computing"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction = degree_instruction_unfaithful_answer(question, answer)
        answer_positive = instruction["answer"]
        while True:
            answer_negative = random.randint(0, 10)
            if str(answer_negative) != answer_positive.split(" is ")[1]:
                answer_negative = answer_positive.split(" is ")[0] + " is " + str(answer_negative)
                break
        if data_kind == "train":
            for instruction_prompt in instruction["instruction"]:
                final_data.append({
                    "task_name": task_name,
                    "idx": example_id,
                    "instruction": instruction_prompt,
                    "graph_language": instruction["graph_language"],
                    "graph": instruction["graph"],
                    "answer_positive": [answer_positive],
                    "answer_negative": [answer_negative],
                    "hallucination_type": "unfaithful_answer",
                    "answer_with_cot": [],
                    "difficulty": diff,
                    "from": "NLGraph",
                })
                example_id += 1
        else:
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"][0],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [answer_positive],
                "answer_negative": [answer_negative],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

# 借用NLGraph connectivity的graph，用于构建Graph Degree任务
connectivity_train_data, connectivity_test_data = load_task_data("connectivity")

degree_instruction_data_unfaithful_answer = {
    "train": degree_dataset_unfaithful_answer(connectivity_train_data),
    "test": degree_dataset_unfaithful_answer(connectivity_test_data, "test"),
}

100%|██████████| 1861/1861 [00:00<00:00, 4179.04it/s]


total number: 3722


100%|██████████| 371/371 [00:00<00:00, 8345.02it/s]

total number: 371





In [157]:
print(degree_instruction_data_unfaithful_answer["test"][13]["instruction"])
print(degree_instruction_data_unfaithful_answer["test"][13]["answer_positive"])
print(degree_instruction_data_unfaithful_answer["test"][13]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="degree-computing"] {
    node_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34];
    edge_list = [(0 <-> 34), (0 <-> 4), (0 <-> 7), (0 <-> 28), (0 <-> 8), (0 <-> 15), (0 <-> 23), (0 <-> 13), (0 <-> 22), (0 <-> 32), (0 <-> 11), (0 <-> 18), (1 <-> 7), (1 <-> 2), (1 <-> 13), (1 <-> 27), (1 <-> 26), (1 <-> 9), (2 <-> 14), (2 <-> 31), (2 <-> 15), (2 <-> 23), (2 <-> 13), (2 <-> 9), (2 <-> 22), (2 <-> 17), (2 <-> 32), (2 <-> 18), (3 <-> 24), (4 <-> 7), (4 <-> 27), (4 <-> 17), (5 <-> 16), (5 <-> 12), (5 <-> 29), (5 <-> 20), (6 <-> 34), (6 <-> 7

In [158]:
def degree_instruction_missing_graph(prompt: str, answer: str, do_print: bool = False):
    # 配置 instruction prompt
    # 输入一个connectivity数据集，将其转换为degree问题
    """ 
    prompt:
    Determine if there is a path between two nodes in the graph. Note that (i,j) means that node i and node j are connected with an undirected edge.
    Graph: (0,12) (0,13) (0,2) (0,14) (0,23) (0,8) (0,1) (0,25) (0,27)
    Q: Is there a path between node 14 and node 2?
    A:
    数据结构：
    {
        'question': 'Determine if there is a path between two nodes in the graph. Note that (i,j) means that node i and node j are connected with an undirected edge.\nGraph: (0,1) (0,6) (0,21) (0,18) (1,24) (1,3) (3,24) (3,14) (4,6) (4,5) (4,23) (6,14) (6,21) (6,23) (7,11) (8,19) (8,13) (9,15) (9,19) (10,11) (10,12) (10,16) (10,17) (11,12) (11,16) (12,17) (13,15) (13,22) (14,23) (15,22) (16,17) (18,24) (18,21)\nQ: Is there a path between node 0 and node 19?\nA:',
        'answer': 'The answer is no.',
        'difficulty': 'medium'
    }
    """
    if do_print:
        print("==[original prompt]" + "="*31)
        print(prompt)
        print("==[graph language prompt]" + "="*26)
    
    task_name = "degree-computing"
    prompts = prompt.split("\n")
    instruction = [
        "Task definition: calculate the degree of the target node in the graph.",
        "Task definition: the degree of the node in an undirected graph means the number of edges linked to the node. So the answer is an integer value of the count of the linked edge."
    ]
    edge_list = [(triple[1:-1].split(",")[0], triple[1:-1].split(",")[1]) for triple in prompts[1].replace("Graph: ", "").split(" ")]
    node_list = [node for node in sorted(list(set([int(node) for triple in edge_list for node in triple])))]
    target_node = node_list[random.randint(0, len(node_list) - 1)]
    random_select_node = target_node
    # print(random_select_node)
    edge_list = [i for i in edge_list if i[0] != random_select_node and str(i[1]) != random_select_node]
    node_list = [i for i in node_list if i != random_select_node]

    gcl = degree_graph_language(task_name, node_list, edge_list, target_node)
    query = "Q: What's the degree of the target node?"

    
    # 构建邻接表
    adj_list = dict()
    for (s, e) in edge_list:
        if s not in adj_list.keys():
            adj_list[s] = set()
        if e not in adj_list.keys():
            adj_list[e] = set()
        adj_list[s].add(e)
        adj_list[e].add(s)
    # 计算每个节点的degree
    node_degree = {int(node): len(adjs) for node, adjs in adj_list.items()}
    # 随机挑选一个节点作为目标节点，用于构建query
    
    answer = "The degree of the target node {} is {}".format(target_node, node_degree[target_node])
    
    final_instruction = ["{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, i, query) for i in instruction]
    
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "answer": answer,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "target_node": target_node,
        }
    }, random_select_node

def degree_dataset_missing_graph(data: dict, data_kind: str="train"):
    task_name = "graph-structure-modeling-degree-computing"
    example_idx_list = data.keys()
    final_data = list()
    example_id = 0
    for ei, idx in enumerate(tqdm(example_idx_list)):
        question, answer, diff = data[idx]["question"], data[idx]["answer"], data[idx]["difficulty"]
        instruction, target_node = degree_instruction_missing_graph(question, answer)
        if data_kind == "train":
            for instruction_prompt in instruction["instruction"]:
                final_data.append({
                    "task_name": task_name,
                    "idx": example_id,
                    "instruction": instruction_prompt,
                    "graph_language": instruction["graph_language"],
                    "graph": instruction["graph"],
                    "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                    "answer_negative": [instruction["answer"]], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                    "hallucination_type": "missing_graph_information",
                    "answer_with_cot": [],
                    "difficulty": diff,
                    "from": "NLGraph",
                })
                example_id += 1
        else:
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"][0],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": ["Sorry, the graph does not exist node {}, so the question is unanswerable, you had better provide a correct graph.".format(target_node)],
                "answer_negative": [instruction["answer"]], # 因为graph被修改为错误的graph，因此原始对应的answer则是negative
                "hallucination_type": "missing_graph_information",
                "answer_with_cot": [],
                "difficulty": diff,
                "from": "NLGraph",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

# 借用NLGraph connectivity的graph，用于构建Graph Degree任务
connectivity_train_data, connectivity_test_data = load_task_data("connectivity")

degree_instruction_data_missing_graph = {
    "train": degree_dataset_missing_graph(connectivity_train_data),
    "test": degree_dataset_missing_graph(connectivity_test_data, "test"),
}

100%|██████████| 1861/1861 [00:00<00:00, 6279.69it/s]


total number: 3722


100%|██████████| 371/371 [00:00<00:00, 6689.62it/s]

total number: 371





In [159]:
print(degree_instruction_data_missing_graph["test"][123]["instruction"])
print(degree_instruction_data_missing_graph["test"][123]["answer_positive"])
print(degree_instruction_data_missing_graph["test"][123]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="degree-computing"] {
    node_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31];
    edge_list = [(1 <-> 11), (1 <-> 9), (1 <-> 19), (1 <-> 31), (1 <-> 20), (2 <-> 15), (2 <-> 11), (2 <-> 27), (2 <-> 29), (2 <-> 16), (2 <-> 31), (3 <-> 8), (3 <-> 4), (3 <-> 12), (3 <-> 18), (3 <-> 17), (3 <-> 10), (3 <-> 25), (3 <-> 31), (3 <-> 20), (4 <-> 30), (4 <-> 24), (4 <-> 19), (5 <-> 26), (5 <-> 23), (6 <-> 13), (7 <-> 15), (7 <-> 9), (7 <-> 30), (7 <-> 12), (7 <-> 16), (7 <-> 19), (7 <-> 10), (7 <-> 25), (8 <-> 15), (8 <-> 11), (8 <-> 30), (8 <-> 24), (8 <-> 10),

In [160]:
degree_preference_data = {
    "train": degree_instruction_data_unfaithful_answer["train"] + degree_instruction_data_missing_graph["train"],
    "test": degree_instruction_data_unfaithful_answer["test"] + degree_instruction_data_missing_graph["test"],
}

**merging all Graph Structure Modeling dataset**

In [166]:
GraphStructureModeling_preference_benchmark_dict = {
    "graph-structure-modeling-connectivity-detection": connectivity_preference_data,
    "graph-structure-modeling-cycle-detection": cycle_preference_data,
    "graph-structure-modeling-maximum-flow": flow_preference_data,
    "graph-structure-modeling-shortest-path": shpath_preference_data,
    "graph-structure-modeling-degree-computing": degree_preference_data
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [167]:
np.save("preference_dataset/graphstructuremodeling_preference_dataset.npy", GraphStructureModeling_preference_benchmark_dict)

# 二、Graph Language Modeling

In [169]:
import json
import os
from tqdm import tqdm
from random import shuffle
import random

## 2.1 Graph Caption Generation
任务定义：输入一个graph的Graph Language Prompt，生成一个文本，用于描述该graph；

### （1） Wikipedia + Wikidata5M
wikipedia包含大量的预训练语料，其对应的知识图谱为wikidata5M
对于每个文本$x$，均可以从wikidata5M中获取一个知识子图$g$，因此可以获得一个监督数据$(g, x)$，旨在设计指令让模型根据知识子图$g$来生成文本$x$

**加载wikidata5m知识图谱**

In [170]:
wikidata5m_path = "Wiki/wikidata5m"
entity_file = os.path.join(wikidata5m_path, "wikidata5m_entity.txt")
relation_file = os.path.join(wikidata5m_path, "wikidata5m_relation.txt")
triple_file = os.path.join(wikidata5m_path, "wikidata5m_all_triplet.txt")

In [171]:
def load_wikidata5m_entity_file(entity_file):
    entity_qid2names = dict() # 保存实体文件中，每个qid对应的所有可能的实体名称
    entity_name2qid = dict() # 保存实体文件中，每个实体名称对应的qid
    all_entity_name_list = list()
    print("loading entity ...")
    with open(entity_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        line = line.strip()
        entity_qid = line.split("\t")[0]
        entity_name_list = line.split("\t")[1:]
        all_entity_name_list.extend(entity_name_list)
        entity_qid2names[entity_qid] = entity_name_list
        for entity_name in entity_name_list:
            entity_name2qid[entity_name] = entity_qid
    return entity_qid2names, entity_name2qid, all_entity_name_list

entity_qid2names, entity_name2qid, all_entity_name_list = load_wikidata5m_entity_file(entity_file)

loading entity ...


100%|██████████| 4813491/4813491 [00:29<00:00, 161249.06it/s]


In [172]:
def load_wikidata5m_relation_file(relation_file):
    relation_pid2names = dict() # 保存关系文件中，每个pid对应的所有可能的关系名称
    relation_name2pid = dict() # 保存关系文件中，每个关系名称对应的pid
    print("loading relation ...")
    with open(relation_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        line = line.strip()
        relation_pid = line.split("\t")[0]
        relation_name_list = line.split("\t")[1:]
        relation_pid2names[relation_pid] = relation_name_list
        for relation_name in relation_name_list:
            relation_name2pid[relation_name] = relation_pid
    return relation_pid2names, relation_name2pid

relation_pid2names, relation_name2pid = load_wikidata5m_relation_file(relation_file)

loading relation ...


100%|██████████| 825/825 [00:00<00:00, 517776.57it/s]


In [173]:
# 加载wikidata5M知识图谱
def load_wikidata5m_triple(triple_file):
    triple_qpqlist = list() # 保存所有三元组（qid，pid，qid）
    entity_qid_adj = dict() # 保存每个实体qid对应的邻接表
    print("loading triple ...")
    with open(triple_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        head_qid, relation_pid, tail_qid = line.strip().split("\t")
        triple_qpqlist.append((head_qid, relation_pid, tail_qid))
        if head_qid not in entity_qid_adj.keys():
            entity_qid_adj[head_qid] = dict()
        if relation_pid not in entity_qid_adj[head_qid].keys():
            entity_qid_adj[head_qid][relation_pid] = list()
        entity_qid_adj[head_qid][relation_pid].append(tail_qid)
    return triple_qpqlist, entity_qid_adj
    
triple_qpqlist, entity_qid_adj = load_wikidata5m_triple(triple_file)

loading triple ...


100%|██████████| 21354359/21354359 [01:25<00:00, 248767.74it/s]


In [174]:
entity_qid_adj["Q326660"]

{'P1412': ['Q652'],
 'P20': ['Q220'],
 'P31': ['Q5'],
 'P19': ['Q9284'],
 'P166': ['Q731542'],
 'P27': ['Q38'],
 'P106': ['Q36180']}

**加载wikipedia语料并进行简单的预处理**

In [175]:
wikipedia_path = "Wiki/wikipedia_corpus"
# wikipedia_corpus_file = os.path.join(wikipedia_path, "wikicorpus_en_one_article_per_line.txt")
# print("loading wikipedia corpus ...")
# with open(wikipedia_corpus_file, "r", encoding="utf-8") as fr:
#     wikipedia_corpus = fr.readlines()
# print("finish loading")

In [176]:
def load_wikipedia_corpus(corpus):
    
    def text_clip(text):
        token_list = text.split(" ")
        if len(token_list) <= 320:
            return text
        cur_pos = 320
        for pos in range(320, 0, -1):
            # 寻找最后一个句子
            if len(token_list[pos]) > 0 and token_list[pos][-1] == '.':
                cur_pos = pos
                break
        return " ".join(token_list[:cur_pos + 1])
    data = list()
    for ei, line in enumerate(corpus):
#         if ei in prograss:
#             print("processing {}\%".format(ei/len(corpus)*100))
        text = line.strip()
        if len(text) < 100:
            # 过滤过短的文本
            continue
        text = text_clip(text) # 裁剪超长的文本
        data.append(text)
    return data

In [180]:
wikipedia_corpus = list()
for idx in tqdm(range(100)):
    with open(os.path.join(wikipedia_path, "wikipedia_corpus_{}.txt".format(idx)), "r", encoding="utf-8") as fr:
        line = fr.readlines()
    data = load_wikipedia_corpus(line)
    wikipedia_corpus.extend(data)

100%|██████████| 100/100 [02:28<00:00,  1.48s/it]


In [182]:
shuffle(wikipedia_corpus)

In [183]:
wikipedia_corpus_original = wikipedia_corpus
wikipedia_corpus = wikipedia_corpus[:200000]

In [184]:
wikipedia_corpus_lower = list()
for text in tqdm(wikipedia_corpus):
    wikipedia_corpus_lower.append("{}".format(text.lower()))

100%|██████████| 200000/200000 [00:01<00:00, 198691.36it/s]


**对语料进行词频统计，记录高频词，并从实体集合中剔除。（高频词被认为可能是非实体的常用词汇）**

In [83]:
word_freq = dict()
for text in tqdm(wikipedia_corpus_lower):
    word_list = text.strip().split()
    for word in word_list:
        if word not in word_freq.keys():
            word_freq[word] = 0
        word_freq[word] += 1

100%|██████████████████████████████| 5513023/5513023 [07:07<00:00, 12909.40it/s]


In [89]:
word_freq = sorted(word_freq.items(), key=lambda i: i[1])

AttributeError: 'list' object has no attribute 'items'

In [103]:
high_freq_word = [i[0] for i in list(reversed(word_freq))[:100]]

In [None]:
11442260

In [186]:
# all_entity_name_list[:100]
print("原始实体数量：{}".format(len(all_entity_name_list)))
new_all_entity_name_list = set()
for ent in tqdm(all_entity_name_list):
    if len(ent) <=5:
        continue
    if len(ent) / len(ent.split(" ")) <= 3:
        continue
    ent = ent.lower()
#     if ent in high_freq_word:
#         continue
    new_all_entity_name_list.add("{}".format(ent))
new_all_entity_name_list = list(new_all_entity_name_list)
print("筛减后实体数量：{}".format(len(new_all_entity_name_list)))

原始实体数量：24021703


100%|██████████| 24021703/24021703 [00:26<00:00, 919437.62it/s]

筛减后实体数量：11442276





**AC自动机存储所有实体**

In [187]:
import ahocorasick
ac = ahocorasick.Automaton()
idx = 0
for entity_name in tqdm(new_all_entity_name_list):
    ac.add_word(entity_name.lower(), (idx, entity_name.lower()))
    idx += 1
ac.make_automaton()

100%|██████████| 11442276/11442276 [00:36<00:00, 311100.27it/s]


**识别文本中的所有实体**

In [188]:
"streams" in new_all_entity_name_list
# entity_qid2names[entity_name2qidm["helicopter"]]

True

In [189]:
print(wikipedia_corpus_lower[0])
list(ac.iter_long(wikipedia_corpus_lower[0].lower()))

bolivaroscelis bolivarii is a species of praying mantis in the genus "bolivaroscelis" in the order mantodea. it is found in cameroon.


[(20, (2807291, 'boliva')),
 (36, (2854227, 'species')),
 (50, (4157457, 'ying ma')),
 (83, (175220, 'bolivaroscelis')),
 (106, (268562, 'mantodea')),
 (131, (8537210, 'cameroon'))]

In [190]:
wikipedia_corpus_entities = list()
for text in tqdm(wikipedia_corpus_lower):
    entity_list = list(ac.iter_long(text))
    text_entity_list = [i[1][1] for i in entity_list]
    # 有一些单词存在不全问题，例如“a schweizer 269c helicopter”识别出了“schweiz”，
    # 筛选出存在此问题的实体
    final_entity_list = set()
    unknown_entity_list = list()
    for ent in text_entity_list:
        if " {} ".format(ent) in " {} ".format(text) or "({})".format(ent) in " {} ".format(text) or "[{}]".format(ent) in " {} ".format(text) or " {}.".format(ent) in " {} ".format(text) or " {},".format(ent) in " {} ".format(text) or " {})".format(ent) in " {} ".format(text) or " {}]".format(ent) in " {} ".format(text)  or "({}".format(ent) in " {} ".format(text) or "[{}".format(ent) in " {} ".format(text):
            final_entity_list.add(ent)
            continue
        if " {}".format(ent) in " {} ".format(text):
            # 右侧不全
            start = text.find(ent)
            end = start + len(ent)
            while end <= len(text) - 1:
                if text[end] in [' ', ',', '.', ')', ']']:
                    break
                end += 1
            entity = text[start: end]
            final_entity_list.add(entity)
        elif "{} ".format(ent) in " {} ".format(text):
            # 左侧不全
            start = text.find(ent)
            end = start + len(ent)
            while start >= 0:
                if text[start] in [' ', ',', '.', '(', '[']:
                    break
                start -= 1
            entity = text[start + 1: end]
            final_entity_list.add(entity)
        else:
            # 两侧均不全，则说明不是实体，直接排除
            pass
#     print("text=", text)
#     print("unknown_entity_list=", unknown_entity_list)
#     print("final_entity_list=", list(final_entity_list))
#     print("="*50)
    wikipedia_corpus_entities.append(list(final_entity_list))


100%|██████████| 200000/200000 [02:12<00:00, 1513.06it/s]


In [191]:
wikipedia_corpus_entities[294]

['travelled',
 'rifles',
 'university',
 'colinton',
 'oil company',
 'matches',
 'edinburgh',
 'scottish',
 'college',
 'april 1917',
 'england',
 'victoria',
 'the son',
 'forrester',
 'the first world war',
 'first-class cricket',
 'suburb',
 'lieutenant',
 'high score',
 'assistant',
 'graduating',
 'emigrated',
 'employment',
 'rangoon',
 'george douglas',
 'rugby school',
 'oxford',
 'average',
 'educated',
 'british burma',
 'second',
 'may 1959',
 'australia']

**对句子构造知识子图**
根据给定的句子、句子对应的所有实体，构造知识子图。思路：
- 获取句子中所有实体，这些实体有些可能在知识图谱中不存在。
- 对于每个实体，获得其1-hop子图。
- 判断其1-hop邻接实体是否存在文本中，如果存在，则保存该三元组

In [192]:
def construct_subgraph(wikipedia_corpus, wikipedia_corpus_entities):
    wikpedia_corpus_triples = list()
    # for text, (entities) in tqdm(zip(wikipedia_corpus, wikipedia_corpus_entities)):
    for ei, text in enumerate(tqdm(wikipedia_corpus)):
        entities = wikipedia_corpus_entities[ei]
        ent_qid2name = {entity_name2qid[ent_name]:ent_name for ent_name in entities if ent_name in entity_name2qid.keys()}
        triple_list = list()
        for head_qid, head_name in ent_qid2name.items():
            if head_qid not in entity_qid_adj.keys():
                continue
            subgraph = entity_qid_adj[head_qid]
            for rel_pid in subgraph.keys():
                for tail_qid in subgraph[rel_pid]:
                    if tail_qid in ent_qid2name.keys() and tail_qid != head_qid:
                        if rel_pid in relation_pid2names.keys():
                            triple_list.append([head_name, relation_pid2names[rel_pid][0], ent_qid2name[tail_qid]])
        wikpedia_corpus_triples.append(triple_list)
    return wikpedia_corpus_triples

In [193]:
wikpedia_corpus_triples = construct_subgraph(wikipedia_corpus, wikipedia_corpus_entities)

100%|██████████| 200000/200000 [00:33<00:00, 5971.72it/s]


In [194]:
# print(wikipedia_corpus[294])
# print(wikpedia_corpus_triples[294])
# wikipedia_corpus_entities[294]
wikpedia_corpus_triples[294]

[['colinton', 'located in the administrative territorial entity', 'edinburgh'],
 ['edinburgh', 'located in the administrative territorial entity', 'scottish'],
 ['scottish', 'capital', 'edinburgh'],
 ['scottish', 'shares border with', 'england'],
 ['england', 'shares border with', 'scottish'],
 ['rangoon', 'capital of', 'british burma'],
 ['george douglas', 'country of citizenship', 'scottish'],
 ['british burma', 'capital', 'rangoon'],
 ['australia', 'participant of', 'the first world war']]

**构建graph caption generation数据集**

只保留全部符合下面的特征的样本
- 样本包含实体，且不超过30个；
- 三元组数量不能低于实体数量的25%；


In [215]:
wikipedia_corpus[10]

'Hervé Ebanda (born 14 February 1979 in Créteil, France) is a French footballer who played on the professional level for the French Ligue 2 US Créteil-Lusitanos football club during the 2000–2002 seasons.'

In [212]:
final_wikipedia_examples = list()
for ei, text in enumerate(tqdm(wikipedia_corpus)):
    entities = wikipedia_corpus_entities[ei]
    triples = wikpedia_corpus_triples[ei]
    if len(entities) == 0 or len(triples) == 0 or len(entities) > 30:
        continue
    if len(entities) < 10 and len(triples) < 2:
        continue
    if len(entities) >= 10 and len(triples) < len(entities) / 5:
        continue
    final_wikipedia_examples.append([
        text, entities, triples
    ])
print(len(final_wikipedia_examples))

100%|██████████| 200000/200000 [00:00<00:00, 834120.49it/s]

23421





In [213]:
shuffle(final_wikipedia_examples)
graphcaption_wikipedia_test_data = final_wikipedia_examples[:2000]
graphcaption_wikipedia_train_data = final_wikipedia_examples[2000:]

**通过graph language构建最终的instruction数据集**

In [214]:
def graphcaption_wikipedia_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [217]:
def graphcaption_wikipedia_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "wikipedia-knowledge-graph"
    instruction = "Task definition: given a knowledge graph with all entities and structure triples representing factual and commonsense knowledge. Please leverage this graph to generate an encyclopedia passage. Note that do not list all knowledge in a running account."
    edge_list = triples
    node_list = entities
    gcl = graphcaption_wikipedia_graph_language(task_name, node_list, edge_list)
    query = "Q: Please generate an encyclopedia passage for the knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphcaption_wikipedia_dataset_unfaithful_answer(all_data: dict, wikipedia_corpus: list):
    task_name = "graph-language-modeling-graph-caption-generation-wikipedia"
    final_data = list()
    for ei, data in enumerate(tqdm(all_data)):
        text, entities, triples = data[0], data[1], data[2]
        instruction = graphcaption_wikipedia_instruction_unfaithful_answer(text, entities, triples)
        answer_negative = list()
        for ek in range(4):
            cur_text = wikipedia_corpus[random.randint(0, len(wikipedia_corpus) - 1)]
            if cur_text != text:
                answer_negative.append(cur_text)
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [text],
            "answer_negative": answer_negative,
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Wikipedia-Wikidata5M",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphcaption_wikipedia_instruction_data_unfaithful_answer = {
    "train": graphcaption_wikipedia_dataset_unfaithful_answer(graphcaption_wikipedia_train_data, wikipedia_corpus),
    "test": graphcaption_wikipedia_dataset_unfaithful_answer(graphcaption_wikipedia_test_data, wikipedia_corpus),
}

100%|██████████| 21421/21421 [00:00<00:00, 48536.59it/s]


total number: 21421


100%|██████████| 2000/2000 [00:00<00:00, 47613.85it/s]

total number: 2000





In [219]:
print(graphcaption_wikipedia_instruction_data_unfaithful_answer["train"][122]["instruction"])
print(graphcaption_wikipedia_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(graphcaption_wikipedia_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="wikipedia-knowledge-graph"] {
    entity_list = ['football', 'matches', 'licensing', 'the stadium', 'spectators', 'bistritsa, sofia', 'season', 'bulgaria', 'the local', 'home ground', 'committee', 'seating', 'promotion', 'vitosha', 'acquire', 'capacity of', 'upcoming', 'announced', 'football club', 'septemvri sofia', 'currently', 'stadion', 'multi-purpose stadium', 'following'];
    triple_list = [("bistritsa, sofia" -> "bulgaria")[relation="country"], ("vitosha" -> "bulgaria")[relation="country"], ("septemvri sofia" -> "bulgaria")[relation="country"], ("septemvri sofia" -> "football")[relation="sport"], ("multi-purpose st

In [236]:
def graphcaption_wikipedia_instruction_conflict_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "wikipedia-knowledge-graph"
    instruction = "Task definition: given a knowledge graph with all entities and structure triples representing factual and commonsense knowledge. Please leverage this graph to generate an encyclopedia passage. Note that do not list all knowledge in a running account."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        relation = random_select_edge[1]
        while True:
            rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
            if rand_relation != relation:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
    shuffle(edge_list)
    # assert 1>2
    
    gcl = graphcaption_wikipedia_graph_language(task_name, node_list, edge_list)
    query = "Q: Please generate an encyclopedia passage for the knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_wikipedia_dataset_conflict_graph(all_data: dict, wikipedia_corpus: list):
    task_name = "graph-language-modeling-graph-caption-generation-wikipedia"
    final_data = list()
    for ei, data in enumerate(tqdm(all_data)):
        text, entities, triples = data[0], data[1], data[2]
        instruction, conflict_edge_list = graphcaption_wikipedia_instruction_conflict_graph(text, entities, triples)

        positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
        for conflict_edge in conflict_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] is conflict with (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
            )
        positive_answer += "so the question is unanswerable, you had better provide a correct graph."
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": [text],
            "hallucination_type": "conflict_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Wikipedia-Wikidata5M",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphcaption_wikipedia_instruction_data_conflict_graph = {
    "train": graphcaption_wikipedia_dataset_conflict_graph(graphcaption_wikipedia_train_data, wikipedia_corpus),
    "test": graphcaption_wikipedia_dataset_conflict_graph(graphcaption_wikipedia_test_data, wikipedia_corpus),
}

100%|██████████| 21421/21421 [00:01<00:00, 12118.16it/s]


total number: 21421


100%|██████████| 2000/2000 [00:00<00:00, 12489.65it/s]


total number: 2000


In [237]:
print(graphcaption_wikipedia_instruction_data_conflict_graph["test"][122]["instruction"])
print(graphcaption_wikipedia_instruction_data_conflict_graph["test"][122]["answer_positive"])
print(graphcaption_wikipedia_instruction_data_conflict_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="wikipedia-knowledge-graph"] {
    entity_list = ['the programme', 'the county', '11 april', 'august 2008', 'news programme', 'office', 'studio', 'uppsala county', 'sveriges television', 'programme', 'uppsala', 'stockholm county', 'television news', 'stockholm', 'uppland"', 'sweden', 'last broadcast', 'letters', 'södertälje', 'the last', 'counties in sweden', 'news programmes:'];
    triple_list = [("stockholm county" -> "sweden")[relation="country"], ("stockholm county" -> "stockholm")[relation="capital"], ("uppsala" -> "sweden")[relation="country"], ("uppsala county" -> "stockholm county")[relation="shares border with"], 

In [245]:
def graphcaption_wikipedia_instruction_unfactual_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "wikipedia-knowledge-graph"
    instruction = "Task definition: given a knowledge graph with all entities and structure triples representing factual and commonsense knowledge. Please leverage this graph to generate an encyclopedia passage. Note that do not list all knowledge in a running account."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    new_edge_list = list()
    for edge in edge_list:
        if random.random() > 0.7:
            random_select_num -= 1
            if random_select_num >= 0:
                random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
                relation = random_select_edge[1]
                while True:
                    rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
                    if rand_relation != relation:
                        break
                random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
                new_edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
        else:
            new_edge_list.append(edge)
    
    gcl = graphcaption_wikipedia_graph_language(task_name, node_list, new_edge_list)
    query = "Q: Please generate an encyclopedia passage for the knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_wikipedia_dataset_unfactual_graph(all_data: dict, wikipedia_corpus: list):
    task_name = "graph-language-modeling-graph-caption-generation-wikipedia"
    final_data = list()
    for ei, data in enumerate(tqdm(all_data)):
        text, entities, triples = data[0], data[1], data[2]
        instruction, wrong_edge_list = graphcaption_wikipedia_instruction_unfactual_graph(text, entities, triples)

        positive_answer = "Sorry, the graph contains some wrong knowledge in the follow:\n"
        for wrong_edge in wrong_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] should be corrected as (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                wrong_edge[0][0], wrong_edge[0][2], wrong_edge[0][1],
                wrong_edge[1][0], wrong_edge[1][2], wrong_edge[1][1]
            )
        positive_answer += "based on the corrected graph, the answer can be:\n{}".format(text)
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": [text],
            "hallucination_type": "unfactual_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Wikipedia-Wikidata5M",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphcaption_wikipedia_instruction_data_unfactual_graph = {
    "train": graphcaption_wikipedia_dataset_unfactual_graph(graphcaption_wikipedia_train_data, wikipedia_corpus),
    "test": graphcaption_wikipedia_dataset_unfactual_graph(graphcaption_wikipedia_test_data, wikipedia_corpus),
}

100%|██████████| 21421/21421 [00:01<00:00, 13258.71it/s]


total number: 21421


100%|██████████| 2000/2000 [00:00<00:00, 13357.80it/s]

total number: 2000





In [239]:
print(graphcaption_wikipedia_instruction_data_unfactual_graph["test"][122]["instruction"])
print(graphcaption_wikipedia_instruction_data_unfactual_graph["test"][122]["answer_positive"])
print(graphcaption_wikipedia_instruction_data_unfactual_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="wikipedia-knowledge-graph"] {
    entity_list = ['the programme', 'the county', '11 april', 'august 2008', 'news programme', 'office', 'studio', 'uppsala county', 'sveriges television', 'programme', 'uppsala', 'stockholm county', 'television news', 'stockholm', 'uppland"', 'sweden', 'last broadcast', 'letters', 'södertälje', 'the last', 'counties in sweden', 'news programmes:'];
    triple_list = [("uppsala county" -> "sweden")[relation="public holiday"], ("stockholm county" -> "stockholm")[relation="capital"], ("uppsala" -> "sweden")[relation="country"], ("uppsala county" -> "stockholm county")[relation="shares border wit

In [246]:
graphcaption_wikipedia_preference_data = {
    "train": graphcaption_wikipedia_instruction_data_unfaithful_answer["train"] + graphcaption_wikipedia_instruction_data_conflict_graph["train"] + graphcaption_wikipedia_instruction_data_unfactual_graph["train"],
    "test": graphcaption_wikipedia_instruction_data_unfaithful_answer["test"] + graphcaption_wikipedia_instruction_data_conflict_graph["test"] + graphcaption_wikipedia_instruction_data_unfactual_graph["test"],
}
# 构造为如下格式
GraphCaptionGeneration_wikipedia_preference_benchmark_dict = {
    "graph-language-modeling-graph-caption-generation-wikipedia": graphcaption_wikipedia_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [247]:
np.save("preference_dataset/graphlanguagemodeling_graphcaptiongeneration_wikipedia_preference_dataset.npy", GraphCaptionGeneration_wikipedia_preference_benchmark_dict)

### （2）WebNLG

- 数据集huggingface地址：https://huggingface.co/datasets/web_nlg
- 数据集下载：
> wget https://gitlab.com/shimorina/webnlg-dataset/-/archive/587fa698bec705efbefe72a235a6019c2b9b8b6c/webnlg-dataset-587fa698bec705efbefe72a235a6019c2b9b8b6c.zip
> 
> 选择release_v1数据，地址：data/WebNLG/webnlg-dataset-587fa698bec705efbefe72a235a6019c2b9b8b6c/release_v1/json
- 数据集说明：给定一个三元组，生成一个文本

数据集结构：
```
{'2017_test_category': '',
 'category': 'Politician',
 'eid': 'Id10',
 'lex': {'comment': ['good', 'good', 'good'],
         'lid': ['Id1', 'Id2', 'Id3'],
         'text': ['World War II had Chiang Kai-shek as a commander and United States Army soldier Abner W. Sibal.',
                  'Abner W. Sibal served in the United States Army during the Second World War and during that war Chiang Kai-shek was one of the commanders.',
                  'Abner W. Sibal, served in the United States Army and fought in World War II, one of the commanders of which, was Chiang Kai-shek.']},
 'modified_triple_sets': {'mtriple_set': [['Abner_W._Sibal | battle | World_War_II',
                                           'World_War_II | commander | Chiang_Kai-shek',
                                           'Abner_W._Sibal | militaryBranch | United_States_Army']]},
 'original_triple_sets': {'otriple_set': [['Abner_W._Sibal | battles | World_War_II', 'World_War_II | commander | Chiang_Kai-shek', 'Abner_W._Sibal | branch | United_States_Army'],
                                          ['Abner_W._Sibal | militaryBranch | United_States_Army',
                                           'Abner_W._Sibal | battles | World_War_II',
                                           'World_War_II | commander | Chiang_Kai-shek']]},
 'shape': '(X (X) (X (X)))',
 'shape_type': 'mixed',
 'size': 3}
```

In [244]:
def graphcaption_webnlg_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [249]:
def graphcaption_webnlg_instruction_conflict_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "dbpedia-knowledge-graph"
    instruction = "Task definition: given a graph with one triple with two entities and a relation, generate a verbalization for this triple."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        relation = random_select_edge[1]
        while True:
            rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
            if rand_relation != relation:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
    shuffle(edge_list)
    
    gcl = graphcaption_webnlg_graph_language(task_name, node_list, edge_list)
    query = "Q: Please generate a verbalization for the triple in the graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_webnlg_dataset_conflict_graph(data: dict):
    task_name = "graph-language-modeling-graph-caption-generation-webnlg"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        for idx in example.keys():
            example_data = example[idx]
            category = example_data["category"]
            text = example_data["lexicalisations"][0]["lex"]
            texts = [i["lex"] for i in example_data["lexicalisations"]]
            triples = [(triple["subject"].replace("_", " "), triple["property"].replace("_", " "), triple["object"].replace("_", " ")) for triple in example_data["modifiedtripleset"]]
            entities = set()
            for triple in triples:
                entities.add(triple[0])
                entities.add(triple[2])
            entities = list(entities)
            instruction, conflict_edge_list = graphcaption_webnlg_instruction_conflict_graph(text, entities, triples)

            positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
            for conflict_edge in conflict_edge_list:
                positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] is conflict with (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                    conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                    conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
                )
            positive_answer += "so the question is unanswerable, you had better provide a correct graph."
        
            final_data.append({
                "task_name": task_name,
                "idx": ei,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [positive_answer],
                "answer_negative": texts,
                "hallucination_type": "conflict_graph_information",
                "answer_with_cot": [],
                "difficulty": "medium",
                "from": "WebNLG",
            })
    print("total number: {}".format(len(final_data)))
    return final_data

webnlg_data_dir = "WebNLG"
with open(os.path.join(webnlg_data_dir, "webnlg_release_v1.json"), "r", encoding="utf-8") as fr:
    webnlg_data = json.load(fr)
shuffle(webnlg_data)
webnlg_test_data = webnlg_data["entries"][:2000]
webnlg_train_data = webnlg_data["entries"][2000:]
graphcaption_webnlg_instruction_data_conflict_graph = {
    "train": graphcaption_webnlg_dataset_conflict_graph(webnlg_train_data),
    "test": graphcaption_webnlg_dataset_conflict_graph(webnlg_test_data),
}

100%|██████████| 12237/12237 [00:00<00:00, 16079.39it/s]


total number: 12237


100%|██████████| 2000/2000 [00:00<00:00, 27110.66it/s]

total number: 2000





In [250]:
print(graphcaption_webnlg_instruction_data_conflict_graph["test"][122]["instruction"])
print(graphcaption_webnlg_instruction_data_conflict_graph["test"][122]["answer_positive"])
print(graphcaption_webnlg_instruction_data_conflict_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="dbpedia-knowledge-graph"] {
    entity_list = ['Amsterdam Airport Schiphol', 'Amsterdam'];
    triple_list = [("Amsterdam Airport Schiphol" -> "Amsterdam")[relation="cityServed"], ("Amsterdam Airport Schiphol" -> "Amsterdam")[relation="suffers from"]];
}
```
Task definition: given a graph with one triple with two entities and a relation, generate a verbalization for this triple.
Q: Please generate a verbalization for the triple in the graph.
A:
['Sorry, the graph contains some conflict edges in the follow:\n("Amsterdam Airport Schiphol" -> "Amsterdam")[relation="suffers from"] is conflict with ("Amsterdam Airport Schiphol"

In [255]:
def graphcaption_webnlg_instruction_unfactual_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "dbpedia-knowledge-graph"
    instruction = "Task definition: given a graph with one triple with two entities and a relation, generate a verbalization for this triple."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    new_edge_list = list()
    for edge in edge_list:
        if random.random() > 0.7:
            random_select_num -= 1
            if random_select_num >= 0:
                random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
                relation = random_select_edge[1]
                while True:
                    rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
                    if rand_relation != relation:
                        break
                random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
                new_edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
        else:
            new_edge_list.append(edge)
    
    gcl = graphcaption_webnlg_graph_language(task_name, node_list, new_edge_list)
    query = "Q: Please generate a verbalization for the triple in the graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_webnlg_dataset_unfactual_graph(data: dict):
    task_name = "graph-language-modeling-graph-caption-generation-webnlg"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        for idx in example.keys():
            example_data = example[idx]
            category = example_data["category"]
            text = example_data["lexicalisations"][0]["lex"]
            texts = [i["lex"] for i in example_data["lexicalisations"]]
            triples = [(triple["subject"].replace("_", " "), triple["property"].replace("_", " "), triple["object"].replace("_", " ")) for triple in example_data["modifiedtripleset"]]
            entities = set()
            for triple in triples:
                entities.add(triple[0])
                entities.add(triple[2])
            entities = list(entities)
            instruction, wrong_edge_list = graphcaption_webnlg_instruction_unfactual_graph(text, entities, triples)

            if len(wrong_edge_list) == 0:
                continue
            positive_answer = "Sorry, the graph contains some wrong knowledge in the follow:\n"
            for wrong_edge in wrong_edge_list:
                positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] should be corrected as (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                    wrong_edge[0][0], wrong_edge[0][2], wrong_edge[0][1],
                    wrong_edge[1][0], wrong_edge[1][2], wrong_edge[1][1]
                )
            positive_answer += "based on the corrected graph, the answer can be:\n{}".format(text)
        
            final_data.append({
                "task_name": task_name,
                "idx": ei,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [positive_answer],
                "answer_negative": texts,
                "hallucination_type": "unfactual_graph_information",
                "answer_with_cot": [],
                "difficulty": "medium",
                "from": "WebNLG",
            })
    print("total number: {}".format(len(final_data)))
    return final_data

webnlg_data_dir = "WebNLG"
with open(os.path.join(webnlg_data_dir, "webnlg_release_v1.json"), "r", encoding="utf-8") as fr:
    webnlg_data = json.load(fr)
shuffle(webnlg_data)
webnlg_test_data = webnlg_data["entries"][:2000]
webnlg_train_data = webnlg_data["entries"][2000:]
graphcaption_webnlg_instruction_data_unfactual_graph = {
    "train": graphcaption_webnlg_dataset_unfactual_graph(webnlg_train_data),
    "test": graphcaption_webnlg_dataset_unfactual_graph(webnlg_test_data),
}

100%|██████████| 12237/12237 [00:00<00:00, 29394.31it/s]


total number: 7900


100%|██████████| 2000/2000 [00:00<00:00, 60749.60it/s]

total number: 616





In [256]:
print(graphcaption_webnlg_instruction_data_unfactual_graph["test"][122]["instruction"])
print(graphcaption_webnlg_instruction_data_unfactual_graph["test"][122]["answer_positive"])
print(graphcaption_webnlg_instruction_data_unfactual_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="dbpedia-knowledge-graph"] {
    entity_list = ['Al Anderson (NRBQ band)', 'United States'];
    triple_list = [("Al Anderson (NRBQ band)" -> "United States")[relation="precision"]];
}
```
Task definition: given a graph with one triple with two entities and a relation, generate a verbalization for this triple.
Q: Please generate a verbalization for the triple in the graph.
A:
['Sorry, the graph contains some wrong knowledge in the follow:\n("Al Anderson (NRBQ band)" -> "United States")[relation="precision"] should be corrected as ("Al Anderson (NRBQ band)" -> "United States")[relation="birthPlace"]\nbased on the corrected g

In [257]:
# 构造为如下格式
graphcaption_webnlg_preference_data = {
    "train": graphcaption_webnlg_instruction_data_conflict_graph["train"] + graphcaption_webnlg_instruction_data_unfactual_graph["train"],
    "test": graphcaption_webnlg_instruction_data_conflict_graph["test"] + graphcaption_webnlg_instruction_data_unfactual_graph["test"],
}
GraphCaptionGeneration_WebNLG_preference_benchmark_dict = {
    "graph-language-modeling-graph-caption-generation-webnlg": graphcaption_webnlg_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [259]:
np.save("preference_dataset/graphlanguagemodeling_graphcaptiongeneration_webnlg_preference_dataset.npy", GraphCaptionGeneration_WebNLG_preference_benchmark_dict)


### （3）Agenda

数据集下载：https://github.com/rikdz/GraphWriter/blob/master/data/unprocessed.tar.gz

样例：
```
{
    "title": "Continuous Markov Random Fields for Robust Stereo Estimation .",
    "entities": [
      "middlebury high resolution imagery",
      "slanted plane mrf methods",
      "slanted 3d planes",
      "occlusion boundaries",
      "slanted-plane model",
      "kitti dataset",
      "hybrid mrf",
      "random variables",
      "inference"
    ],
    "types": "<material> <method> <otherscientificterm> <otherscientificterm> <method> <material> <method> <otherscientificterm> <task>",
    "relations": [
      "slanted 3d planes -- CONJUNCTION -- occlusion boundaries",
      "slanted-plane model -- COMPARE -- slanted plane mrf methods",
      "middlebury high resolution imagery -- EVALUATE-FOR -- slanted-plane model"
    ],
    "abstract": "in this paper we present a novel <method_4> which reasons jointly about <otherscientificterm_3> as well as depth . we formulate the problem as one of <task_8> in a <method_6> composed of both continuous -lrb- i.e. , <otherscientificterm_2> -rrb- and discrete -lrb- i.e. , <otherscientificterm_3> -rrb- <otherscientificterm_7> . this allows us to define potentials encoding the ownership of the pixels that compose the boundary between segments , as well as potentials encoding which junctions are physically possible . our <method_4> outperforms the state-of-the-art on <material_0> -lsb- 1 -rsb- as well as in the more challenging <material_5> -lsb- 2 -rsb- , while being more efficient than existing <method_1> , taking on average 2 minutes to perform <task_8> on high resolution imagery .",
    "abstract_og": "in this paper we present a novel slanted-plane model which reasons jointly about occlusion boundaries as well as depth . we formulate the problem as one of inference in a hybrid mrf composed of both continuous -lrb- i.e. , slanted 3d planes -rrb- and discrete -lrb- i.e. , occlusion boundaries -rrb- random variables . this allows us to define potentials encoding the ownership of the pixels that compose the boundary between segments , as well as potentials encoding which junctions are physically possible . our slanted-plane model outperforms the state-of-the-art on middlebury high resolution imagery -lsb- 1 -rsb- as well as in the more challenging kitti dataset -lsb- 2 -rsb- , while being more efficient than existing slanted plane mrf methods , taking on average 2 minutes to perform inference on high resolution imagery ."
  },
```

In [260]:
import re

In [261]:
agenda_data_dir = "Agenda/"

In [262]:
def load_agenda_data(data_dir):
    with open(os.path.join(data_dir, "train.json"), "r", encoding="utf-8") as fr:
        train_data = json.load(fr)
    with open(os.path.join(data_dir, "test.json"), "r", encoding="utf-8") as fr:
        test_data = json.load(fr)
    print("train_data_num: ", len(train_data))
    print("test_data_num: ", len(test_data))
    return train_data, test_data
    
agenda_train_data, agenda_test_data = load_agenda_data(agenda_data_dir)

train_data_num:  38720
test_data_num:  1000


**构建instruction数据集**

In [263]:
def graphcaption_agenda_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))

In [264]:
def graphcaption_agenda_instruction_conflict_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "agenda-graph"
    instruction = "Task definition: given a scientific title and a corresponding knowledge graph that expresses this publication, generate an abstract for this title."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        relation = random_select_edge[1]
        while True:
            rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
            if rand_relation != relation:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
    shuffle(edge_list)
    
    gcl = graphcaption_agenda_graph_language(task_name, node_list, edge_list)
    query = "Q: Please generate an abstract for the publication based on the title and the knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_agenda_dataset_conflict_graph(data: dict):
    task_name = "graph-language-modeling-graph-caption-generation-agenda"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        title = example["title"]
        entities = example["entities"]
        triples = [(i.split(" -- ")[0], i.split(" -- ")[1], i.split(" -- ")[2])for i in example["relations"]]
        abstract = example["abstract_og"].replace("-lrb-", "").replace("-rrb-", "")
        abstract = re.sub(r" -lsb- [0-9]+ -rsb- ", "", abstract)
        abstract = abstract.replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?").replace(" ;", ";")
        instruction, conflict_edge_list = graphcaption_agenda_instruction_conflict_graph(abstract, entities, triples)

        positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
        for conflict_edge in conflict_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] is conflict with (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
            )
        positive_answer += "so the question is unanswerable, you had better provide a correct graph."
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": [abstract],
            "hallucination_type": "conflict_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Agenda",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphcaption_agenda_instruction_data_conflict_graph = {
    "train": graphcaption_agenda_dataset_conflict_graph(agenda_train_data),
    "test": graphcaption_agenda_dataset_conflict_graph(agenda_test_data),
}

100%|██████████| 38720/38720 [00:03<00:00, 12457.48it/s]


total number: 38720


100%|██████████| 1000/1000 [00:00<00:00, 12811.93it/s]

total number: 1000





In [265]:
print(graphcaption_agenda_instruction_data_conflict_graph["test"][122]["instruction"])
print(graphcaption_agenda_instruction_data_conflict_graph["test"][122]["answer_positive"])
print(graphcaption_agenda_instruction_data_conflict_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="agenda-graph"] {
    entity_list = ['perceptually-shaped side-informed data hiding scheme', 'logarithmic quantization algorithm', 'generalized version', 'embedding power'];
    triple_list = [("perceptually-shaped side-informed data hiding scheme" -> "logarithmic quantization algorithm")[relation="type of kinship"], ("perceptually-shaped side-informed data hiding scheme" -> "logarithmic quantization algorithm")[relation="HYPONYM-OF"]];
}
```
Task definition: given a scientific title and a corresponding knowledge graph that expresses this publication, generate an abstract for this title.
Q: Please generate an abstract for t

In [266]:
def graphcaption_agenda_instruction_unfactual_graph(text: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "agenda-graph"
    instruction = "Task definition: given a scientific title and a corresponding knowledge graph that expresses this publication, generate an abstract for this title."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    new_edge_list = list()
    for edge in edge_list:
        if random.random() > 0.7:
            random_select_num -= 1
            if random_select_num >= 0:
                random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
                relation = random_select_edge[1]
                while True:
                    rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
                    if rand_relation != relation:
                        break
                random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
                new_edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
        else:
            new_edge_list.append(edge)
    
    gcl = graphcaption_agenda_graph_language(task_name, node_list, new_edge_list)
    query = "Q: Please generate an abstract for the publication based on the title and the knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphcaption_agenda_dataset_unfactual_graph(data: dict):
    task_name = "graph-language-modeling-graph-caption-generation-agenda"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        title = example["title"]
        entities = example["entities"]
        triples = [(i.split(" -- ")[0], i.split(" -- ")[1], i.split(" -- ")[2])for i in example["relations"]]
        abstract = example["abstract_og"].replace("-lrb-", "").replace("-rrb-", "")
        abstract = re.sub(r" -lsb- [0-9]+ -rsb- ", "", abstract)
        abstract = abstract.replace(" ,", ",").replace(" .", ".").replace(" !", "!").replace(" ?", "?").replace(" ;", ";")
        instruction, wrong_edge_list = graphcaption_agenda_instruction_unfactual_graph(abstract, entities, triples)

        if len(wrong_edge_list) == 0:
            continue
        positive_answer = "Sorry, the graph contains some wrong knowledge in the follow:\n"
        for wrong_edge in wrong_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] should be corrected as (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                wrong_edge[0][0], wrong_edge[0][2], wrong_edge[0][1],
                wrong_edge[1][0], wrong_edge[1][2], wrong_edge[1][1]
            )
        positive_answer += "based on the corrected graph, the answer can be:\n{}".format(text)
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": [abstract],
            "hallucination_type": "unfactual_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Agenda",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphcaption_agenda_instruction_data_unfactual_graph = {
    "train": graphcaption_agenda_dataset_unfactual_graph(agenda_train_data),
    "test": graphcaption_agenda_dataset_unfactual_graph(agenda_test_data),
}

100%|██████████| 38720/38720 [00:02<00:00, 17982.17it/s]


total number: 26828


100%|██████████| 1000/1000 [00:00<00:00, 18529.19it/s]

total number: 687





In [267]:
print(graphcaption_agenda_instruction_data_unfactual_graph["test"][122]["instruction"])
print(graphcaption_agenda_instruction_data_unfactual_graph["test"][122]["answer_positive"])
print(graphcaption_agenda_instruction_data_unfactual_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="agenda-graph"] {
    entity_list = ['computing word semantics', 'iterative reinforcement approach', 'summary and keywords', 'single document', 'knowledge-based approach', 'document summarization', 'keyword extraction', 'keywords'];
    triple_list = [("document summarization" -> "keyword extraction")[relation="CONJUNCTION"], ("iterative reinforcement approach" -> "summary and keywords")[relation="USED-FOR"], ("iterative reinforcement approach" -> "knowledge-based approach")[relation="Jila"], ("iterative reinforcement approach" -> "knowledge-based approach")[relation="COMPARE"]];
}
```
Task definition: given a scientific ti

In [268]:
# 构造为如下格式
graphcaption_agenda_preference_data = {
    "train": graphcaption_agenda_instruction_data_conflict_graph["train"] + graphcaption_agenda_instruction_data_unfactual_graph["train"],
    "test": graphcaption_agenda_instruction_data_conflict_graph["test"] + graphcaption_agenda_instruction_data_unfactual_graph["test"],
}
GraphCaptionGeneration_Agenda_preference_benchmark_dict = {
    "graph-language-modeling-graph-caption-generation-agenda": graphcaption_agenda_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [269]:
np.save("preference_dataset/graphlanguagemodeling_graphcaptiongeneration_agenda_preference_dataset.npy", GraphCaptionGeneration_Agenda_preference_benchmark_dict)


### （4）GenWiki

- 数据集地址：https://github.com/zhijing-jin/genwiki
- 数据集格式：
```
    {
        "text": "It has been <ENT_0> the permanent collection of the <ENT_1> <ENT_0> <ENT_2> since <ENT_4> , acquired through the <ENT_3> .",
        "entities": [
            "in",
            "Museum of Modern Art",
            "New York City",
            "Lillie P. Bliss Bequest",
            "1941"
        ],
        "graph": [
            [
                "The Starry Night",
                "city",
                "New York City"
            ]
        ],
        "id_long": {
            "wikipage": "The_Starry_Night",
            "text_paragraph_index": 0,
            "text_sentence_index_start": 2,
            "text_sentence_index_end": 3,
            "graph_set_index": 0
        },
        "id_short": "[\"The_Starry_Night\", 0, [0, 2, 3]]"
    },

```

备注：给数据集也可以用于后续的text-to-graph任务graph construction modeling

### （5）EventNarrative
- 数据集地址：https://www.kaggle.com/datasets/acolas1/eventnarration/
- 数据集描述：https://paperswithcode.com/dataset/eventnarrative

### （6）XAlign
- 数据集地址：https://github.com/tushar117/XAlign/blob/main/datasets/v2.0.zip
- 数据集描述：https://paperswithcode.com/dataset/xalign

## 2.2 Graph Question Answering

### FreeBase预加载

- Graph Question Answering默认为KBQA任务，目前市面上KBQA主要以Freebase为知识库，因此首先先加载一个相对比较完整的Freebase知识库以备用；
- 某些数据集没有提供知识库，例如WebQuestion、GrailQA，那么可以使用这里预加载的Freebase；
- 有些数据集已经提供了原作者处理过的知识库，例如PathQuestion，或者有的数据集不需要使用Freebase，那么知识库则以当前的数据集为准。当然也可以辅助使用这里预加载的知识库。

In [270]:
freebase_dir = "FreeBase/"

**加载freebase知识库**

In [271]:
def load_freebase(freebase_dir):
    import pickle
    with open(os.path.join(freebase_dir, "rel2id.pickle"), "rb") as fr:
        relation_name2id = pickle.load(fr)
    # relation_name2id = {rel_name.lower(): rel_id for rel_name, rel_id in relation_name2id.items()}
    relation_id2name = {rel_id: rel_name for rel_name, rel_id in relation_name2id.items()}
    with open(os.path.join(freebase_dir, "entity_name.pickle"), "rb") as fr:
        entity_name = pickle.load(fr)
    with open(os.path.join(freebase_dir, "ent2id.pickle"), "rb") as fr:
        entity_name2id = pickle.load(fr)
    # entity_name2id = {ent_name.lower(): ent_id for ent_name, ent_id in entity_name2id.items()}
    entity_id2name = {ent_id: ent_name for ent_name, ent_id in entity_name2id.items()}
    ent_type_ary = np.load(os.path.join(freebase_dir, "ent_type_ary.npy"))
    subgraph_2hop_triples = np.load(os.path.join(freebase_dir, "subgraph_2hop_triples.npy"))
    entity_id2adj = dict()
    for (head_id, rel_id, tail_id) in tqdm(subgraph_2hop_triples):
        if head_id not in entity_id2adj.keys():
            entity_id2adj[head_id] = dict()
        if rel_id not in entity_id2adj[head_id].keys():
            entity_id2adj[head_id][rel_id] = list()
        entity_id2adj[head_id][rel_id].append(tail_id)
    

    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    
    # for node_id, node_adj in tqdm(entity_id2adj.items()):
    #     if node_id not in node2adj_2hop_triples.keys():
    #         node2adj_2hop_triples[node_id] = set()
    #         node2adj_2hop_nodes[node_id] = set()
    #     for head_1hop_rel, head_1hop_adj in node_adj.items():
    #         for head_1hop_tail in head_1hop_adj:
    #             node2adj_2hop_nodes[node_id].add(head_1hop_tail)
    #             node2adj_2hop_triples[node_id].add((node_id, head_1hop_rel, head_1hop_tail))
    #             if head_1hop_tail in entity_id2adj.keys():
    #                 for head_2hop_rel, head_2hop_adj in entity_id2adj[head_1hop_tail].items():
    #                     for head_2hop_tail in head_2hop_adj:
    #                         node2adj_2hop_nodes[node_id].add(head_2hop_tail)
    #                         node2adj_2hop_triples[node_id].add((head_1hop_tail, head_2hop_rel, head_2hop_tail))
                            
    
    return {
        "relation_name2id": relation_name2id,
        "relation_id2name": relation_id2name,
        "entity_name": entity_name,
        "entity_name2id": entity_name2id,
        "entity_id2name": entity_id2name,
        "ent_type_ary": ent_type_ary,
        "entity_id2adj": entity_id2adj,
        # "node2adj_2hop_triples": node2adj_2hop_triples,
    }
freebase = load_freebase(freebase_dir)

100%|██████████| 105948364/105948364 [04:48<00:00, 367341.98it/s]


In [83]:
list(freebase["entity_name"][0].values())[:10]

[True, False, True, False, True, True, False, True, True, True]

In [674]:
freebase["entity_name2id"]["m.0b787yg"]

23279064

In [675]:
freebase["entity_id2adj"][23279064]

{15513: [10162844,
  27387772,
  21785737,
  24281657,
  11934025,
  30030937,
  19637215,
  24205112,
  29943758,
  14290979,
  14269424,
  28396668,
  23023280,
  15424158,
  27224867,
  11289722,
  20461802,
  9376696,
  1827698,
  27001071,
  29540455,
  14414104,
  9140164,
  19697547,
  16261459,
  17747249,
  30854104,
  27755709,
  22771619,
  7178859,
  11609721,
  11208152,
  9474079,
  27238957,
  4629420,
  13384814,
  15954149,
  15704785,
  10426813,
  6275896,
  29396444,
  16690749,
  11273264,
  26363196,
  17630438,
  8336213,
  10385138,
  1926890,
  22741642,
  7660550,
  25352060,
  10384406,
  20593522,
  13805180,
  34403315,
  5077092,
  10163397,
  27607596,
  8269811,
  5098369,
  25139979,
  16427188,
  3244685,
  3572803,
  26497203,
  7358186,
  29724560,
  34337983,
  18982485,
  5421523,
  5184877,
  15751015,
  17443734,
  19409660,
  29082393,
  16831617,
  3672081,
  21773856,
  8334829,
  4556234,
  7276216,
  31843639,
  15229456,
  29335018,
  15613

In [720]:
freebase["relation_id2name"][5579294] # 代表同类别的意思，例如“m.0b787yg”的英文名称是set_designer或scenic_designer

KeyError: 5579294

In [722]:
freebase["entity_id2name"][22618417]

'Set Designer'

In [717]:
freebase["relation_id2name"][15850]

'type.object.name'

### （1）PathQuestion

- 数据集地址：https://github.com/zmtkeke/IRN/tree/master/PathQuestion
- 给定一个问题，和一个3hop子图，从子图中寻找一个subject到answer的路径，并返回子图中的一个实体作为答案；
- 测试集1000个样本，其余作为训练集。
- PathQuestion已经提供了一个Freebase13知识库，因此这里无需使用预加载的FreeBase。

In [272]:
pathquestion_data_dir = "PathQuestion/"

In [273]:
def load_freebase13(data_dir):
    # 加载知识库
    with open(os.path.join(data_dir, "Freebase13.txt"), "r", encoding="utf-8") as fr:
        kb_lines = fr.readlines()
    
    entity_name2adj = dict() # 1-hop
    for line in kb_lines:
        line = line.strip()
        head_name, rel_name, tail_name = line.split("\t")
        if head_name not in entity_name2adj.keys():
            entity_name2adj[head_name] = dict()
        if rel_name not in entity_name2adj[head_name].keys():
            entity_name2adj[head_name][rel_name] = list()
        entity_name2adj[head_name][rel_name].append(tail_name)

    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    node2adj_3hop_triples = dict() # 3-hop子图中的所有三元组
    node2adj_3hop_nodes = dict() # 3-hop子图中的所有节点
    
    for node_name, node_adj in entity_name2adj.items():
        if node_name not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_name] = set()
            node2adj_2hop_nodes[node_name] = set()
        if node_name not in node2adj_3hop_triples.keys():
            node2adj_3hop_triples[node_name] = set()
            node2adj_3hop_nodes[node_name] = set()
        for head_1hop_rel, head_1hop_adj in node_adj.items():
            for head_1hop_tail in head_1hop_adj:
                node2adj_2hop_nodes[node_name].add(head_1hop_tail)
                node2adj_2hop_triples[node_name].add((node_name, head_1hop_rel, head_1hop_tail))
                if head_1hop_tail in entity_name2adj.keys():
                    for head_2hop_rel, head_2hop_adj in entity_name2adj[head_1hop_tail].items():
                        for head_2hop_tail in head_2hop_adj:
                            node2adj_2hop_nodes[node_name].add(head_2hop_tail)
                            node2adj_2hop_triples[node_name].add((head_1hop_tail, head_2hop_rel, head_2hop_tail))
                            if head_2hop_tail in entity_name2adj.keys():
                                for head_3hop_rel, head_3hop_adj in entity_name2adj[head_2hop_tail].items():
                                    for head_3hop_tail in head_3hop_adj:
                                        node2adj_3hop_nodes[node_name].add(head_3hop_tail)
                                        node2adj_3hop_triples[node_name].add((head_2hop_tail, head_3hop_rel, head_3hop_tail))
                                        
    return entity_name2adj, node2adj_2hop_triples, node2adj_2hop_nodes, node2adj_3hop_triples, node2adj_3hop_nodes
freebase13_entity_name_1hop_adj, freebase13_entity_name_2hop_adj_triples, freebase13_entity_name_2hop_adj_nodes, \
freebase13_entity_name_3hop_adj_triples, freebase13_entity_name_3hop_adj_nodes = load_freebase13(pathquestion_data_dir)

In [274]:
pathquestion_kb_data_dic = {
    "PQ-2H.txt": "2H-kb.txt",
    "PQ-3H.txt": "3H-kb.txt",
    "PQL-2H.txt": "PQL2-KB.txt",
    "PQL-3H.txt": "PQL3-KB.txt",
    "PQL-3H_more.txt": "PQL3-KB.txt",
}

In [275]:
def load_pathquestion_data(data_dir, pathquestion_kb_data_dic):
    examples = list()
    for example_file, kb_file in tqdm(pathquestion_kb_data_dic.items()):
        # 加载数据集
        with open(os.path.join(data_dir, example_file), "r", encoding="utf-8") as fr:
            data_lines = fr.readlines()
        print(len(data_lines))
        for line in data_lines:
            question, answer_set, reason_path  = line.strip().split("\t")[0], line.strip().split("\t")[1], line.strip().split("\t")[2]
            answer_list = list(set(answer_set.replace("(", "/").replace("/)", "").split("/")))
            reason_path = reason_path.split("#<end>")[0].split("#")
            # print(reason_path)
            evidence_triples = list()
            for i in range(0, len(reason_path) - 1, 2):
                evidence_triples.append((reason_path[i], reason_path[i + 1], reason_path[i + 2]))
            subject = evidence_triples[0][0]

            try:
                adj_triples = list(freebase13_entity_name_3hop_adj_triples[subject])
                shuffle(adj_triples)
                adj_triples = adj_triples[:25]
                adj_triples = set(adj_triples).union(set(evidence_triples))
                adj_triples = list(adj_triples)
    
                all_nodes = set()
                for triple in adj_triples:
                    all_nodes.add(triple[0])
                    all_nodes.add(triple[2])
            
                examples.append({
                    "question": question,
                    "answer": answer_list,
                    "evidence_triples": evidence_triples,
                    "entities": list(all_nodes),
                    "entity_2adj_triples": adj_triples,
                })
            except:
                continue
    return examples
pathquestion_data = load_pathquestion_data(pathquestion_data_dir, pathquestion_kb_data_dic)            

 20%|██        | 1/5 [00:00<00:00,  9.04it/s]

1908
5198


100%|██████████| 5/5 [00:00<00:00, 12.30it/s]

1594
1031
2062





In [276]:
len(pathquestion_data)

7106

In [277]:
shuffle(pathquestion_data)
pathquestion_test_data, pathquestion_train_data = pathquestion_data[:1000], pathquestion_data[1000:]

In [278]:
pathquestion_data[120]

{'question': "who is the place of death of daughter of henry_iv_holy_roman_emperor 's parents ?",
 'answer': ['liege'],
 'evidence_triples': [('henry_iv_holy_roman_emperor',
   'parents',
   'henry_iii_holy_roman_emperor'),
  ('henry_iii_holy_roman_emperor', 'children', 'henry_iv_holy_roman_emperor'),
  ('henry_iv_holy_roman_emperor', 'place_of_death', 'liege')],
 'entities': ['dysentery',
  'roman_catholic_church',
  'quedlinburg_abbey',
  'holy_roman_emperor',
  'liege',
  'germany',
  'female',
  'gisela_of_swabia',
  'male',
  'henry_iii_holy_roman_emperor',
  'goslar',
  'conrad_ii_holy_roman_emperor',
  'adelheid_ii_abbess_of_quedlinburg',
  'henry_iv_holy_roman_emperor',
  'agnes_de_poitou'],
 'entity_2adj_triples': [('henry_iv_holy_roman_emperor', 'gender', 'female'),
  ('gisela_of_swabia', 'gender', 'female'),
  ('adelheid_ii_abbess_of_quedlinburg', 'parents', 'agnes_de_poitou'),
  ('henry_iii_holy_roman_emperor', 'children', 'henry_iv_holy_roman_emperor'),
  ('gisela_of_swabi

**构建instruction数据集**

In [279]:
def graphqa_pathquestion_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [282]:
def graphqa_pathquestion_instruction_unfaithful_answer(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-base"
    instruction = "Task definition: given a question and factual knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities
    gcl = graphqa_pathquestion_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_pathquestion_dataset_unfaithful_answer(data: dict, data_kind: str="train"):
    task_name = "graph-language-modeling-graph-question-answering-pathquestion"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    example_id = 0
    K = 5 if data_kind == "train" else 1
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction = graphqa_pathquestion_instruction_unfaithful_answer(question, answer, entities, triples)
        for k in range(K): # 数据增强
            while True:
                answer_negative = entities[random.randint(0, len(entities) - 1)]
                if answer_negative != answer:
                    break
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": answer,
                "answer_negative": [answer_negative],
                "hallucination_type": "unfaithful_answer",
                "answer_with_cot": [evidence_triples],
                "difficulty": "simple",
                "from": "PathQuestion",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_pathquestion_instruction_data_unfaithful_answer = {
    "train": graphqa_pathquestion_dataset_unfaithful_answer(pathquestion_train_data),
    "test": graphqa_pathquestion_dataset_unfaithful_answer(pathquestion_test_data, "test"),
}

100%|██████████| 6106/6106 [00:00<00:00, 25857.18it/s]


total number: 30530


100%|██████████| 1000/1000 [00:00<00:00, 37186.18it/s]

total number: 1000





In [283]:
print(graphqa_pathquestion_instruction_data_unfaithful_answer["test"][122]["instruction"])
print(graphqa_pathquestion_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(graphqa_pathquestion_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-base"] {
    entity_list = ['san_francisco_state_university', 'john_carradine', 'scottish_american', 'film_director', 'rabbi', 'film_producer', 'actor', 'male', 'hollywood', 'erotic_asphyxiation', 'bangkok', 'hanging', 'united_states', 'david_carradine'];
    triple_list = [("john_carradine" -> "united_states")[relation="nationality"], ("david_carradine" -> "bangkok")[relation="place_of_death"], ("david_carradine" -> "film_director")[relation="profession"], ("david_carradine" -> "john_carradine")[relation="parents"], ("david_carradine" -> "hanging")[relation="cause_of_death"], ("david_carradine" -> "male

In [291]:
def graphqa_pathquestion_instruction_conflict_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-base"
    instruction = "Task definition: given a question and factual knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        relation = random_select_edge[1]
        while True:
            rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
            if rand_relation != relation:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
    shuffle(edge_list)
    
    gcl = graphqa_pathquestion_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphqa_pathquestion_dataset_conflict_graph(data: dict, data_kind: str="train"):
    task_name = "graph-language-modeling-graph-question-answering-pathquestion"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    example_id = 0
    K = 5 if data_kind == "train" else 1
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction, conflict_edge_list = graphqa_pathquestion_instruction_conflict_graph(question, answer, entities, triples)

        positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
        for conflict_edge in conflict_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] is conflict with (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
            )
        positive_answer += "so the question is unanswerable, you had better provide a correct graph."
        
        for k in range(K): # 数据增强
            
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": [positive_answer],
                "answer_negative": answer,
                "hallucination_type": "conflict_graph_information",
                "answer_with_cot": [evidence_triples],
                "difficulty": "simple",
                "from": "PathQuestion",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_pathquestion_instruction_data_conflict_graph = {
    "train": graphqa_pathquestion_dataset_conflict_graph(pathquestion_train_data),
    "test": graphqa_pathquestion_dataset_conflict_graph(pathquestion_test_data, "test"),
}

100%|██████████| 6106/6106 [00:00<00:00, 10674.42it/s]


total number: 30530


100%|██████████| 1000/1000 [00:00<00:00, 11191.53it/s]

total number: 1000





In [292]:
print(graphqa_pathquestion_instruction_data_conflict_graph["test"][122]["instruction"])
print(graphqa_pathquestion_instruction_data_conflict_graph["test"][122]["answer_positive"])
print(graphqa_pathquestion_instruction_data_conflict_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-base"] {
    entity_list = ['san_francisco_state_university', 'john_carradine', 'scottish_american', 'film_director', 'rabbi', 'film_producer', 'actor', 'male', 'hollywood', 'erotic_asphyxiation', 'bangkok', 'hanging', 'united_states', 'david_carradine'];
    triple_list = [("david_carradine" -> "film_director")[relation="profession"], ("david_carradine" -> "hanging")[relation="cause_of_death"], ("david_carradine" -> "film_producer")[relation="profession"], ("david_carradine" -> "john_carradine")[relation="parents"], ("david_carradine" -> "united_states")[relation="nationality"], ("david_carradine" -> "s

In [299]:
def graphqa_pathquestion_instruction_missing_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-base"
    instruction = "Task definition: given a question and factual knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities

    # 从node_list和edge_list中删除answer实体，构造不存在正确答案的graph
    node_list = [i for i in node_list if i not in answer]
    edge_list = [i for i in edge_list if i[0] not in answer and i[2] not in answer]
    
    gcl = graphqa_pathquestion_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_pathquestion_dataset_missing_graph(data: dict, data_kind: str="train"):
    task_name = "graph-language-modeling-graph-question-answering-pathquestion"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    example_id = 0
    K = 5 if data_kind == "train" else 1
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction = graphqa_pathquestion_instruction_missing_graph(question, answer, entities, triples)

        positive_answer = ["Based on the world knowledge, the correct answer to the question is \"{}\", but the answer does not exist in the graph.".format(i) for i in answer]
            
        
        for k in range(K): # 数据增强
            
            final_data.append({
                "task_name": task_name,
                "idx": example_id,
                "instruction": instruction["instruction"],
                "graph_language": instruction["graph_language"],
                "graph": instruction["graph"],
                "answer_positive": positive_answer,
                "answer_negative": answer,
                "hallucination_type": "missing_graph_information",
                "answer_with_cot": [evidence_triples],
                "difficulty": "simple",
                "from": "PathQuestion",
            })
            example_id += 1
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_pathquestion_instruction_data_missing_graph = {
    "train": graphqa_pathquestion_dataset_missing_graph(pathquestion_train_data),
    "test": graphqa_pathquestion_dataset_missing_graph(pathquestion_test_data, "test"),
}

100%|██████████| 6106/6106 [00:00<00:00, 25686.84it/s]


total number: 30530


100%|██████████| 1000/1000 [00:00<00:00, 34174.49it/s]

total number: 1000





In [300]:
print(graphqa_pathquestion_instruction_data_missing_graph["test"][122]["instruction"])
print(graphqa_pathquestion_instruction_data_missing_graph["test"][122]["answer_positive"])
print(graphqa_pathquestion_instruction_data_missing_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-base"] {
    entity_list = ['san_francisco_state_university', 'john_carradine', 'scottish_american', 'film_director', 'rabbi', 'film_producer', 'actor', 'male', 'hollywood', 'erotic_asphyxiation', 'bangkok', 'hanging', 'david_carradine'];
    triple_list = [("david_carradine" -> "film_director")[relation="profession"], ("david_carradine" -> "hanging")[relation="cause_of_death"], ("david_carradine" -> "film_producer")[relation="profession"], ("david_carradine" -> "john_carradine")[relation="parents"], ("david_carradine" -> "san_francisco_state_university")[relation="institution"], ("david_carradine" -> "r

In [303]:
# 构造为如下格式
graphqa_pathquestion_preference_data = {
    "train": graphqa_pathquestion_instruction_data_unfaithful_answer["train"] + graphqa_pathquestion_instruction_data_conflict_graph["train"] + graphqa_pathquestion_instruction_data_missing_graph["train"],
    "test": graphqa_pathquestion_instruction_data_unfaithful_answer["test"] + graphqa_pathquestion_instruction_data_conflict_graph["test"] + graphqa_pathquestion_instruction_data_missing_graph["test"],
}
GraphQuestionAnswering_PathQuestion_preference_benchmark_dict = {
    "graph-language-modeling-graph-question-answering-pathquestion": graphqa_pathquestion_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [304]:
np.save("preference_dataset/graphlanguagemodeling_graphquestionanswering_pathquestion_preference_dataset.npy", GraphQuestionAnswering_PathQuestion_preference_benchmark_dict)


### （2）WC2014

- 数据集地址：https://github.com/zmtkeke/IRN/tree/master/WC2014
- 给定一个问题，和一个3hop子图，从子图中寻找一个subject到answer的路径，并返回子图中的一个实体作为答案；
- 测试集1000个样本，其余作为训练集。

In [284]:
wc2014_data_dir = "WC2014/"

In [285]:
def load_football_kb(data_dir):
    # 加载知识库
    with open(os.path.join(data_dir, "WC2014.txt"), "r", encoding="utf-8") as fr:
        kb_lines = fr.readlines()
    
    entity_name2adj = dict() # 1-hop
    for line in kb_lines:
        line = line.strip()
        head_name, rel_name, tail_name = line.split("\t")
        if head_name not in entity_name2adj.keys():
            entity_name2adj[head_name] = dict()
        if rel_name not in entity_name2adj[head_name].keys():
            entity_name2adj[head_name][rel_name] = list()
        entity_name2adj[head_name][rel_name].append(tail_name)

    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    node2adj_3hop_triples = dict() # 3-hop子图中的所有三元组
    node2adj_3hop_nodes = dict() # 3-hop子图中的所有节点
    
    for node_name, node_adj in entity_name2adj.items():
        if node_name not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_name] = set()
            node2adj_2hop_nodes[node_name] = set()
        if node_name not in node2adj_3hop_triples.keys():
            node2adj_3hop_triples[node_name] = set()
            node2adj_3hop_nodes[node_name] = set()
        for head_1hop_rel, head_1hop_adj in node_adj.items():
            for head_1hop_tail in head_1hop_adj:
                node2adj_2hop_nodes[node_name].add(head_1hop_tail)
                node2adj_2hop_triples[node_name].add((node_name, head_1hop_rel, head_1hop_tail))
                if head_1hop_tail in entity_name2adj.keys():
                    for head_2hop_rel, head_2hop_adj in entity_name2adj[head_1hop_tail].items():
                        for head_2hop_tail in head_2hop_adj:
                            node2adj_2hop_nodes[node_name].add(head_2hop_tail)
                            node2adj_2hop_triples[node_name].add((head_1hop_tail, head_2hop_rel, head_2hop_tail))
                            if head_2hop_tail in entity_name2adj.keys():
                                for head_3hop_rel, head_3hop_adj in entity_name2adj[head_2hop_tail].items():
                                    for head_3hop_tail in head_3hop_adj:
                                        node2adj_3hop_nodes[node_name].add(head_3hop_tail)
                                        node2adj_3hop_triples[node_name].add((head_2hop_tail, head_3hop_rel, head_3hop_tail))
                                        
    return entity_name2adj, node2adj_2hop_triples, node2adj_2hop_nodes, node2adj_3hop_triples, node2adj_3hop_nodes
wc2014_entity_name_1hop_adj, wc2014_entity_name_2hop_adj_triples, wc2014_entity_name_2hop_adj_nodes, \
wc2014_entity_name_3hop_adj_triples, wc2014_entity_name_3hop_adj_nodes = load_football_kb(wc2014_data_dir)

In [435]:
# pathquestion_kb_data_dic = {
#     "PQ-2H.txt": "2H-kb.txt",
#     "PQ-3H.txt": "3H-kb.txt",
#     "PQL-2H.txt": "PQL2-KB.txt",
#     "PQL-3H.txt": "PQL3-KB.txt",
#     "PQL-3H_more.txt": "PQL3-KB.txt",
# }

In [286]:
def load_wc2014_data(data_dir):
    examples = list()
    with open(os.path.join(data_dir, "WC-P1.txt"), "r", encoding="utf-8") as fr:
        data_lines = fr.readlines()
    
    for line in tqdm(data_lines):
        question, answer_set, reason_path  = line.strip().split("\t")[0], line.strip().split("\t")[1], line.strip().split("\t")[2]
        answer_list = list(set(answer_set.replace("(", "/").replace("/)", "").split("/")))
        reason_path = reason_path.split("#<end>")[0].split("#")
        # print(reason_path)
        evidence_triples = list()
        for i in range(0, len(reason_path) - 1, 2):
            evidence_triples.append((reason_path[i], reason_path[i + 1], reason_path[i + 2]))
        subject = evidence_triples[0][0]

        try:
            adj_triples = list(wc2014_entity_name_3hop_adj_triples[subject])
            shuffle(adj_triples)
            adj_triples = adj_triples[:25]
            adj_triples = set(adj_triples).union(set(evidence_triples))
            adj_triples = list(adj_triples)

            all_nodes = set()
            for triple in adj_triples:
                all_nodes.add(triple[0])
                all_nodes.add(triple[2])
        
            examples.append({
                "question": question,
                "answer": answer_list,
                "evidence_triples": evidence_triples,
                "entities": list(all_nodes),
                "entity_2adj_triples": adj_triples,
            })
        except:
            continue
    return examples
wc2014_data = load_wc2014_data(wc2014_data_dir)            

100%|██████████| 6482/6482 [00:03<00:00, 2144.76it/s]


In [287]:
shuffle(wc2014_data)
wc2014_test_data, wc2014_train_data = wc2014_data[:1000], wc2014_data[1000:]

**构建instruction数据集**

In [288]:
def graphqa_wc2014_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [289]:
def graphqa_wc2014_instruction_unfaithful_answer(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "football-worldcup-knowledge-base"
    instruction = "Task definition: given a question and a foot world cup knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities
    gcl = graphqa_wc2014_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_wc2014_dataset_unfaithful_answer(data: dict):
    task_name = "graph-language-modeling-graph-question-answering-wc2014"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction = graphqa_wc2014_instruction_unfaithful_answer(question, answer, entities, triples)

        while True:
            answer_negative = entities[random.randint(0, len(entities) - 1)]
            if answer_negative != answer:
                break
                
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": answer,
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [evidence_triples],
            "difficulty": "simple",
            "from": "WC2014",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_wc2014_instruction_data_unfaithful_answer = {
    "train": graphqa_wc2014_dataset_unfaithful_answer(wc2014_train_data),
    "test": graphqa_wc2014_dataset_unfaithful_answer(wc2014_test_data),
}

100%|██████████| 5482/5482 [00:00<00:00, 33113.20it/s]


total number: 5482


100%|██████████| 1000/1000 [00:00<00:00, 41531.46it/s]

total number: 1000





In [290]:
print(graphqa_wc2014_instruction_data_unfaithful_answer["test"][122]["instruction"])
print(graphqa_wc2014_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(graphqa_wc2014_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="football-worldcup-knowledge-base"] {
    entity_list = ['RUI_PATRICIO', 'Georginio_WIJNALDUM', 'Alex_WILKINSON', 'Arjen_ROBBEN', 'Sporting_CP', 'Diego_LUGANO', 'Portugal', 'Kevin_Prince_BOATENG', 'Konstantinos_MITROGLOU', 'Antoine_GRIEZMANN', 'Juan_ZUNIGA', 'Glen_JOHNSON', 'JUNG_Sungryong', 'Midfielder', 'Nabil_BENTALEB', 'Forward', 'LEE_Bumyoung', 'Eliaquim_MANGALA', 'Diego_REYES', 'Goalkeeper', 'Benjamin_MOUKANDJO', 'Gianluigi_BUFFON', 'Jorge_FUCILE', 'Nicolas_LOMBAERTS', 'Miroslav_KLOSE', 'Islam_SLIMANI', 'Roman_SHIROKOV', '13', 'Emir_SPAHIC', 'Jose_CUBERO', 'Defender'];
    triple_list = [("Defender" -> "Nicolas_LOMBAE

In [294]:
def graphqa_wc2014_instruction_conflict_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "football-worldcup-knowledge-base"
    instruction = "Task definition: given a question and a foot world cup knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities

    # 随机挑选1～3个边，每条边分别复制一份，并修改权重，使得输入的graph存在权重冲突的边
    random_select_num = random.randint(1, min(3, len(edge_list)))
    random_select_edge_list = list()
    for i in range(random_select_num):
        random_select_edge = edge_list[random.randint(0, len(edge_list) - 1)]
        relation = random_select_edge[1]
        while True:
            rand_relation = list(relation_name2pid.keys())[random.randint(0, len(relation_name2pid) - 1)]
            if rand_relation != relation:
                break
        random_select_edge_list.append(((random_select_edge[0], rand_relation, random_select_edge[2]), (random_select_edge[0], relation, random_select_edge[2])))
        edge_list.append((random_select_edge[0], rand_relation, random_select_edge[2]))
    shuffle(edge_list)
    
    gcl = graphqa_wc2014_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }, random_select_edge_list

def graphqa_wc2014_dataset_conflict_graph(data: dict):
    task_name = "graph-language-modeling-graph-question-answering-wc2014"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction, conflict_edge_list = graphqa_wc2014_instruction_conflict_graph(question, answer, entities, triples)

        positive_answer = "Sorry, the graph contains some conflict edges in the follow:\n"
        for conflict_edge in conflict_edge_list:
            positive_answer += "(\"{}\" -> \"{}\")[relation=\"{}\"] is conflict with (\"{}\" -> \"{}\")[relation=\"{}\"]\n".format(
                conflict_edge[0][0], conflict_edge[0][2], conflict_edge[0][1],
                conflict_edge[1][0], conflict_edge[1][2], conflict_edge[1][1]
            )
        positive_answer += "so the question is unanswerable, you had better provide a correct graph."
                
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": answer,
            "hallucination_type": "conflict_graph_information",
            "answer_with_cot": [evidence_triples],
            "difficulty": "simple",
            "from": "WC2014",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_wc2014_instruction_data_conflict_graph = {
    "train": graphqa_wc2014_dataset_conflict_graph(wc2014_train_data),
    "test": graphqa_wc2014_dataset_conflict_graph(wc2014_test_data),
}

100%|██████████| 5482/5482 [00:00<00:00, 10974.84it/s]


total number: 5482


100%|██████████| 1000/1000 [00:00<00:00, 11376.36it/s]

total number: 1000





In [295]:
print(graphqa_wc2014_instruction_data_conflict_graph["test"][122]["instruction"])
print(graphqa_wc2014_instruction_data_conflict_graph["test"][122]["answer_positive"])
print(graphqa_wc2014_instruction_data_conflict_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="football-worldcup-knowledge-base"] {
    entity_list = ['RUI_PATRICIO', 'Georginio_WIJNALDUM', 'Alex_WILKINSON', 'Arjen_ROBBEN', 'Sporting_CP', 'Diego_LUGANO', 'Portugal', 'Kevin_Prince_BOATENG', 'Konstantinos_MITROGLOU', 'Antoine_GRIEZMANN', 'Juan_ZUNIGA', 'Glen_JOHNSON', 'JUNG_Sungryong', 'Midfielder', 'Nabil_BENTALEB', 'Forward', 'LEE_Bumyoung', 'Eliaquim_MANGALA', 'Diego_REYES', 'Goalkeeper', 'Benjamin_MOUKANDJO', 'Gianluigi_BUFFON', 'Jorge_FUCILE', 'Nicolas_LOMBAERTS', 'Miroslav_KLOSE', 'Islam_SLIMANI', 'Roman_SHIROKOV', '13', 'Emir_SPAHIC', 'Jose_CUBERO', 'Defender'];
    triple_list = [("Forward" -> "Antoine_GRIEZMA

In [301]:
def graphqa_wc2014_instruction_missing_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "football-worldcup-knowledge-base"
    instruction = "Task definition: given a question and a foot world cup knowledge graph, find an entity and a reasoning path in the graph to answer the question."
    edge_list = triples
    node_list = entities

    # 从node_list和edge_list中删除answer实体，构造不存在正确答案的graph
    node_list = [i for i in node_list if i not in answer]
    edge_list = [i for i in edge_list if i[0] not in answer and i[2] not in answer]
    
    gcl = graphqa_wc2014_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_wc2014_dataset_missing_graph(data: dict):
    task_name = "graph-language-modeling-graph-question-answering-wc2014"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        answer = example["answer"]
        entities = example["entities"]
        triples = example["entity_2adj_triples"]
        evidence_triples = example["evidence_triples"]
        instruction = graphqa_wc2014_instruction_missing_graph(question, answer, entities, triples)

        positive_answer = ["Based on the world knowledge, the correct answer to the question is \"{}\", but the answer does not exist in the graph.".format(i) for i in answer]
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [positive_answer],
            "answer_negative": answer,
            "hallucination_type": "missing_graph_information",
            "answer_with_cot": [evidence_triples],
            "difficulty": "simple",
            "from": "WC2014",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_wc2014_instruction_data_missing_graph = {
    "train": graphqa_wc2014_dataset_missing_graph(wc2014_train_data),
    "test": graphqa_wc2014_dataset_missing_graph(wc2014_test_data),
}

100%|██████████| 5482/5482 [00:00<00:00, 28201.03it/s]


total number: 5482


100%|██████████| 1000/1000 [00:00<00:00, 36522.71it/s]

total number: 1000





In [302]:
print(graphqa_wc2014_instruction_data_missing_graph["test"][122]["instruction"])
print(graphqa_wc2014_instruction_data_missing_graph["test"][122]["answer_positive"])
print(graphqa_wc2014_instruction_data_missing_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="football-worldcup-knowledge-base"] {
    entity_list = ['RUI_PATRICIO', 'Georginio_WIJNALDUM', 'Alex_WILKINSON', 'Arjen_ROBBEN', 'Sporting_CP', 'Diego_LUGANO', 'Kevin_Prince_BOATENG', 'Konstantinos_MITROGLOU', 'Antoine_GRIEZMANN', 'Juan_ZUNIGA', 'Glen_JOHNSON', 'JUNG_Sungryong', 'Midfielder', 'Nabil_BENTALEB', 'Forward', 'LEE_Bumyoung', 'Eliaquim_MANGALA', 'Diego_REYES', 'Goalkeeper', 'Benjamin_MOUKANDJO', 'Gianluigi_BUFFON', 'Jorge_FUCILE', 'Nicolas_LOMBAERTS', 'Miroslav_KLOSE', 'Islam_SLIMANI', 'Roman_SHIROKOV', '13', 'Emir_SPAHIC', 'Jose_CUBERO', 'Defender'];
    triple_list = [("Forward" -> "Antoine_GRIEZMANN")[relatio

In [307]:
# 构造为如下格式
graphqa_wc2014_preference_data = {
    "train": graphqa_wc2014_instruction_data_unfaithful_answer["train"] + graphqa_wc2014_instruction_data_conflict_graph["train"] + graphqa_wc2014_instruction_data_missing_graph["train"],
    "test": graphqa_wc2014_instruction_data_unfaithful_answer["test"] + graphqa_wc2014_instruction_data_conflict_graph["test"] + graphqa_wc2014_instruction_data_missing_graph["test"],
}
GraphQuestionAnswering_WC2014_preference_benchmark_dict = {
    "graph-language-modeling-graph-question-answering-wc2014": graphqa_wc2014_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [308]:
np.save("preference_dataset/graphlanguagemodeling_graphquestionanswering_wc2014_preference_dataset.npy", GraphQuestionAnswering_WC2014_preference_benchmark_dict)


### （3）GrailQA

- 数据集：https://dl.orangedox.com/WyaCpL/
- 使用预加载的FreeBase

数据集格式：
```
{
	"qid": 3202959008000, 
	"question": "what is the role of opera designer gig who designed the telephone / the medium?", 
	"answer": [
		{"answer_type": "Entity", "answer_argument": "m.0b787yg", "entity_name": "Set Designer"}
	], 
	"function": "none", 
	"num_node": 3, 
	"num_edge": 2, 
	"graph_query": {
		"nodes": [
			{"nid": 0, "node_type": "class", "id": "opera.opera_designer_role", "class": "opera.opera_designer_role", "friendly_name": "Opera Designer Role", "question_node": 1, "function": "none"}, 
			{"nid": 1, "node_type": "class", "id": "opera.opera_designer_gig", "class": "opera.opera_designer_gig", "friendly_name": "Opera Designer Gig", "question_node": 0, "function": "none"}, 
			{"nid": 2, "node_type": "entity", "id": "m.0pm2fgf", "class": "opera.opera_production", "friendly_name": "The Telephone / The Medium", "question_node": 0, "function": "none"}
		], 
		"edges": [
			{"start": 1, "end": 0, "relation": "opera.opera_designer_gig.design_role", "friendly_name": "Design Role"}, 
			{"start": 2, "end": 1, "relation": "opera.opera_production.designers", "friendly_name": "Designers"}
		]
	}, 
"sparql_query": "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> PREFIX : <http://rdf.freebase.com/ns/> \nSELECT (?x0 AS ?value) WHERE {\nSELECT DISTINCT ?x0  WHERE { \n?x0 :type.object.type :opera.opera_designer_role . \n?x1 :type.object.type :opera.opera_designer_gig . \nVALUES ?x2 { :m.0pm2fgf } \n?x1 :opera.opera_designer_gig.design_role ?x0 . \n?x2 :opera.opera_production.designers ?x1 . \nFILTER ( ?x0 != ?x1 && ?x0 != ?x2 && ?x1 != ?x2  )\n}\n}", "domains": ["opera"], "level": "i.i.d.", "s_expression": "(AND opera.opera_designer_role (JOIN (R opera.opera_designer_gig.design_role) (JOIN (R opera.opera_production.designers) m.0pm2fgf)))"
}
```

In [309]:
grailqa_data_dir = "GrailQA/"

In [313]:
def load_grailqa_data(data_dir):
    with open(os.path.join(data_dir, "grailqa_v1.0_train.json"), "r", encoding="utf-8") as fr:
        train_data = json.loads(fr.readlines()[0])
    with open(os.path.join(data_dir, "grailqa_v1.0_dev.json"), "r", encoding="utf-8") as fr:
        test_data = json.loads(fr.readlines()[0])

    return train_data, test_data

grailqa_train_data, grailqa_test_data = load_grailqa_data(grailqa_data_dir)
shuffle(grailqa_train_data)
grailqa_train_data = grailqa_train_data[:10000]

In [312]:
len(grailqa_train_data)

44337

**构建instruction数据集**

In [311]:
def graphqa_grailqa_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [316]:
def graphqa_grailqa_instruction_missing_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-base"
    instruction = "Task definition: given a question and answer the question."
    edge_list = triples
    node_list = entities
    gcl = graphqa_grailqa_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_grailqa_dataset_missing_graph(data: dict):
    task_name = "graph-language-modeling-graph-question-answering-grailqa"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        try:
            answer = [i["entity_name"] for i in example["answer"]]
            answer_id = [freebase["entity_name2id"][i["answer_argument"]] for i in example["answer"]]
        except:
            continue
        evidence_1hop_triples = set()
        evidence_2hop_triples = list()
        graph_query = example["graph_query"]
        
        entities = [i["friendly_name"] for i in example["graph_query"]["nodes"]]
        entity_id = [i["id"] for i in example["graph_query"]["nodes"]]
        triples = list()
        # 检索Freebase中的子图
        for ent_id in entity_id:
            if ent_id in freebase["entity_name2id"].keys():
                ent_id = freebase["entity_name2id"][ent_id]
            if ent_id in freebase["entity_id2adj"].keys():
                try:
                    head_name = freebase["entity_id2name"][freebase["entity_id2adj"][ent_id][15850][0]]
                except:
                    continue
                adj = freebase["entity_id2adj"][ent_id]
                for rel_id, tail_ids in adj.items():
                    rel_name = freebase["relation_id2name"][rel_id].split(".")[-1].replace("_", " ")
                    for tail_id in tail_ids:
                        if tail_id in freebase["entity_id2adj"].keys():
                            try:
                                tail_name = freebase["entity_id2name"][freebase["entity_id2adj"][tail_id][15850][0]]
                            except:
                                continue
                            # tail_name = freebase["entity_id2name"][tail_id]
                            triples.append((head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")))
                            if tail_id in answer_id:
                                evidence_1hop_triples.add((head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")))
                            # 2-hop
                            # adj_2hop = freebase["entity_id2adj"][tail_id]
                            # for rel_2hop_id, tail_2hop_ids in adj_2hop.items():
                            #     rel_2hop_name = freebase["relation_id2name"][rel_2hop_id].split(".")[-1].replace("_", " ")
                            #     for tail_2hop_id in tail_2hop_ids:
                            #         if tail_2hop_id in freebase["entity_id2adj"].keys():
                            #             try:
                            #                 tail_2hop_name = freebase["entity_id2name"][freebase["entity_id2adj"][tail_2hop_id][15850][0]]
                            #             except:
                            #                 # print(tail_2hop_id)
                            #                 continue
                            #             # tail_name = freebase["entity_id2name"][tail_id]
                                        
                            #             triples.append((tail_name.replace("_", " "), rel_2hop_name, tail_2hop_name.replace("_", " ")))
                            #             if tail_2hop_id in answer_id:
                            #                 evidence_2hop_triples.extend([
                            #                     (head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")),
                            #                     (tail_name.replace("_", " "), rel_2hop_name, tail_2hop_name.replace("_", " "))
                            #                 ])
        
        evidence_1hop_triples = list(evidence_1hop_triples)
        evidence_triples = evidence_1hop_triples[:1]
        if len(evidence_2hop_triples) > 0:
            evidence_triples.extend(evidence_2hop_triples[0])
        has_no_graph = False
        if len(triples) == 0:
            has_no_graph = True
        if len(evidence_1hop_triples) == 0:
            has_no_graph = True
        
        shuffle(triples)
        triples = triples[:26]
        triples.extend([(entities[i["start"]], i["friendly_name"], entities[i["end"]]) for i in example["graph_query"]["edges"]])
        triples.extend(evidence_triples)
        triples = list(set(triples))
        shuffle(triples)
        # if answer[0] not in entities:
        #     continue
        # print(triples)
        # for entity in entities:
        #     triples += freebase13_entity_name_3hop_adj_triples[entity.lower().replace(" ", "_")] if entity in freebase13_entity_name_3hop_adj_triples.keys() else []
        # triples = list(set(triples))
        entities = list()
        for triple in triples:
            entities.append(triple[0])
            entities.append(triple[2])
        entities = list(set(entities))

        if len(triples) < 2:
            continue

        has_answer_in_graph = False
        for ans in answer:
            if ans in entities:
                has_answer_in_graph = True
                break

        instruction = graphqa_grailqa_instruction_missing_graph(question, answer, entities, triples)

        if has_no_graph:
            answer_positive = ["I cannot answer the question directly because there are no graphs. However, based on the world knowledge, the correct answer to the question is \"{}\"".format(i) for i in answer]
        else:
            if has_answer_in_graph:
                # 该数据集天然存在unanswerable graph，本函数用于构造不存在answer的graph，如果本身数据集中graph中有answer，则不属于负样本构建范畴，则剔除
                continue
            else:
                answer_positive = ["Based on the world knowledge, the correct answer to the question is \"{}\", but the answer does not exist in the graph.".format(i) for i in answer]
            
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [answer_positive],
            "answer_negative": answer,
            "hallucination_type": "missing_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "GrailQA",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_grailqa_instruction_data_missing_graph = {
    "train": graphqa_grailqa_dataset_missing_graph(grailqa_train_data),
    "test": graphqa_grailqa_dataset_missing_graph(grailqa_test_data),
}

100%|██████████| 10000/10000 [02:03<00:00, 81.15it/s]


total number: 2593


100%|██████████| 6763/6763 [00:38<00:00, 174.41it/s]

total number: 1214





In [317]:
print(graphqa_grailqa_instruction_data_missing_graph["test"][122]["instruction"])
print(graphqa_grailqa_instruction_data_missing_graph["test"][122]["answer_positive"])
print(graphqa_grailqa_instruction_data_missing_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-base"] {
    entity_list = ['Gracefulness is to the body what understanding is to the mind.', 'Quotation Subject', 'Quotation', 'Angel of Remembrance'];
    triple_list = [("Quotation" -> "Quotation Subject")[relation="Subjects"], ("Gracefulness is to the body what understanding is to the mind." -> "Quotation Subject")[relation="Subjects"], ("Quotation" -> "Angel of Remembrance")[relation="Spoken by character (if from fictional work)"]];
}
```
Task definition: given a question and answer the question.
Q: the subject of the quote: gracefulness is to the body what understanding is to the mind. spoken by an

100%|██████████| 44337/44337 [15:57<00:00, 46.32it/s]  


total number: 14716


100%|██████████| 6763/6763 [00:39<00:00, 172.71it/s]

total number: 1535





In [321]:
# 构造为如下格式
graphqa_grailqa_preference_data = {
    "train": graphqa_grailqa_instruction_data_missing_graph["train"],
    "test": graphqa_grailqa_instruction_data_missing_graph["test"],
}
GraphQuestionAnswering_GrailQA_preference_benchmark_dict = {
    "graph-language-modeling-graph-question-answering-grailqa": graphqa_grailqa_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [319]:
np.save("preference_dataset/graphlanguagemodeling_graphquestionanswering_grailqa_preference_dataset.npy", GraphQuestionAnswering_GrailQA_preference_benchmark_dict)


### （4）WebQuestions

<!-- - 采用https://github.com/RUCAIBox/StructGPT 提供的Freebase，下载地址为：https://drive.google.com/drive/folders/11_2pqU_MhEtmxpp3zfK_8EJ1bbQzsnfJ
- WebQuestions数据集下载地址：
> "train": "https://worksheets.codalab.org/rest/bundles/0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/",
>
> "test": "https://worksheets.codalab.org/rest/bundles/0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/",

 -->

数据集格式：
```
{
	"answers": ["Portugal national football team"], 
	"answer_ids": ["/m/02rqxc"], 
	"outSeq": "which of the sports teams for whom cristiano ronaldo played was founded most recently ?", 
	"qId": 20990, 
	"inGraph": {
		"g_node_names": {
			"/m/02sf29t": "none", 
			"/m/02rqxc": "Portugal national football team", 
			"/m/02xt6q": "Cristiano Ronaldo", "1914": "1914"
		}, 
		"g_edge_types": {
			"/sports/sports_team_roster/team": "/sports/sports_team_roster/team", 
			"/sports/pro_athlete/teams": "/sports/pro_athlete/teams", 
			"/sports/sports_team/founded": "/sports/sports_team/founded"
		}, 
		"g_adj": {
			"/m/02sf29t": {
				"/m/02rqxc": "/sports/sports_team_roster/team"
			}, 
			"/m/02xt6q": {
				"/m/02sf29t": "/sports/pro_athlete/teams"
			}, 
			"/m/02rqxc": {
				"1914": "/sports/sports_team/founded"
			}
		}
	}
}
```

In [322]:
webquestions_dir = "WebQuestions/"

In [323]:
def load_webquestion(webquestions_dir):
    with open(os.path.join(webquestions_dir, "train.json"), "r", encoding="utf-8") as fr:
        train_data = [json.loads(i) for i in fr.readlines()]
    with open(os.path.join(webquestions_dir, "test.json"), "r", encoding="utf-8") as fr:
        test_data = [json.loads(i) for i in fr.readlines()]
    return train_data, test_data
webquestions_train_data, webquestions_test_data = load_webquestion(webquestions_dir)

In [326]:
webquestions_train_data = webquestions_train_data[:5000]

**构建instruction数据集**

In [325]:
def graphqa_webquestions_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [327]:
def graphqa_webquestions_instruction_missing_graph(question: str, answer: str, entities: list, triples: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-base"
    instruction = "Task definition: given a question and a corresponding knowledge graph, and find an entity in the graph and answer the question."
    edge_list = triples
    node_list = entities
    gcl = graphqa_webquestions_graph_language(task_name, node_list, edge_list)
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", answer)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

def graphqa_webquestions_dataset_missing_graph(data: dict):
    task_name = "graph-language-modeling-graph-question-answering-webquestions"
    final_data = list()
    # if "entries" in data.keys():
    #     data = data["entries"]
    for ei, example in enumerate(tqdm(data)):
        question = example["outSeq"]
        try:
            answer = [i for i in example["answers"]]
            answer_id = [freebase["entity_name2id"][i[1:].replace("/", ".")] for i in example["answer_ids"]]
        except:
            continue
        if len(answer) == 0:
            continue
        evidence_1hop_triples = set()
        # entities = [i for i in example["inGraph"]["g_node_names"].values()]
        entity_id = [i[1:].replace("/", ".") for i in example["inGraph"]["g_node_names"].keys()]
        triples = list()
        # 检索Freebase中的子图
        for ent_id in entity_id:
            if ent_id in freebase["entity_name2id"].keys():
                ent_id = freebase["entity_name2id"][ent_id]
            if ent_id in freebase["entity_id2adj"].keys():
                try:
                    head_name = freebase["entity_id2name"][freebase["entity_id2adj"][ent_id][15850][0]]
                except:
                    continue
                adj = freebase["entity_id2adj"][ent_id]
                for rel_id, tail_ids in adj.items():
                    rel_name = freebase["relation_id2name"][rel_id].split(".")[-1].replace("_", " ")
                    for tail_id in tail_ids:
                        if tail_id in freebase["entity_id2adj"].keys():
                            try:
                                tail_name = freebase["entity_id2name"][freebase["entity_id2adj"][tail_id][15850][0]]
                            except:
                                continue
                            # tail_name = freebase["entity_id2name"][tail_id]
                            triples.append((head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")))
                            if tail_id in answer_id:
                                evidence_1hop_triples.add((head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")))
                            # 2-hop
                            # adj_2hop = freebase["entity_id2adj"][tail_id]
                            # for rel_2hop_id, tail_2hop_ids in adj_2hop.items():
                            #     rel_2hop_name = freebase["relation_id2name"][rel_2hop_id].split(".")[-1].replace("_", " ")
                            #     for tail_2hop_id in tail_2hop_ids:
                            #         if tail_2hop_id in freebase["entity_id2adj"].keys():
                            #             try:
                            #                 tail_2hop_name = freebase["entity_id2name"][freebase["entity_id2adj"][tail_2hop_id][15850][0]]
                            #             except:
                            #                 # print(tail_2hop_id)
                            #                 continue
                            #             # tail_name = freebase["entity_id2name"][tail_id]
                                        
                            #             triples.append((tail_name.replace("_", " "), rel_2hop_name, tail_2hop_name.replace("_", " ")))
                            #             if tail_2hop_id in answer_id:
                            #                 evidence_2hop_triples.extend([
                            #                     (head_name.replace("_", " "), rel_name, tail_name.replace("_", " ")),
                            #                     (tail_name.replace("_", " "), rel_2hop_name, tail_2hop_name.replace("_", " "))
                            #                 ])
        
        evidence_1hop_triples = list(evidence_1hop_triples)
        evidence_triples = evidence_1hop_triples[:1]
        # if len(evidence_2hop_triples) > 0:
        #     evidence_triples.extend(evidence_2hop_triples[0])
        has_no_graph = False
        if len(triples) == 0:
            has_no_graph = True
        if len(evidence_1hop_triples) == 0:
            has_no_graph = True
        shuffle(triples)
        triples = triples[:26]
        for head_entity_id, adj in example["inGraph"]["g_adj"].items():
            try:
                head_name = freebase["entity_id2name"][freebase["entity_id2adj"][head_entity_id][15850][0]]
            except:
                continue
            for tail_entity_id, rel_name in adj.items():
                try:
                    tail_name = freebase["entity_id2name"][freebase["entity_id2adj"][head_entity_id][15850][0]]
                except:
                    continue
                rel_name = rel_name.split("/")[-1]
                triples.append((head_name, rel_name, tail_name))
        
        entities = list()
        for triple in triples:
            entities.append(triple[0])
            entities.append(triple[2])
        entities = list(set(entities))

        if len(triples) < 4:
            continue

        has_answer_in_graph = False
        for ans in answer:
            if ans in entities:
                has_answer_in_graph = True
                break

        instruction = graphqa_webquestions_instruction_missing_graph(question, answer, entities, triples)
        
        if has_no_graph:
            answer_positive = ["I cannot answer the question directly because there are no graphs. However, based on the world knowledge, the correct answer to the question is \"{}\"".format(i) for i in answer]
        else:
            if has_answer_in_graph:
                # 该数据集天然存在unanswerable graph，本函数用于构造不存在answer的graph，如果本身数据集中graph中有answer，则不属于负样本构建范畴，则剔除
                continue
            else:
                answer_positive = ["Based on the world knowledge, the correct answer to the question is \"{}\", but the answer does not exist in the graph.".format(i) for i in answer]
            
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [answer_positive],
            "answer_negative": answer,
            "hallucination_type": "missing_graph_information",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "WebQuestions",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphqa_webquestions_instruction_data_missing_graph = {
    "train": graphqa_webquestions_dataset_missing_graph(webquestions_train_data),
    "test": graphqa_webquestions_dataset_missing_graph(webquestions_test_data),
}

100%|██████████| 5000/5000 [16:04<00:00,  5.18it/s]  


total number: 2101


100%|██████████| 2000/2000 [02:49<00:00, 11.83it/s]

total number: 859





In [328]:
print(graphqa_webquestions_instruction_data_missing_graph["test"][122]["instruction"])
print(graphqa_webquestions_instruction_data_missing_graph["test"][122]["answer_positive"])
print(graphqa_webquestions_instruction_data_missing_graph["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-base"] {
    entity_list = ['Vanderbilt Orbis', 'Vanderbilt University College of Arts and Science', "Vanderbilt Commodores men's basketball", 'Vanderbilt University School of Medicine', 'Vanderbilt Commodores football', 'Vanderbilt Commodores baseball', 'Cornelius Vanderbilt', 'Alpha Delta Pi', 'Vanderbilt University School of Engineering', 'Private university', 'Vanderbilt University Law School', 'Black', 'Vanderbilt University', 'International tuition', 'Blair School of Music', 'Peabody College', 'Nashville', 'Vanderbilt University Divinity School', 'United States of America'];
    triple_list = [("Va

In [330]:
# 构造为如下格式
graphqa_webquestions_preference_data = {
    "train": graphqa_webquestions_instruction_data_missing_graph["train"],
    "test": graphqa_webquestions_instruction_data_missing_graph["test"],
}
GraphQuestionAnswering_WebQuestions_preference_benchmark_dict = {
    "graph-language-modeling-graph-question-answering-webquestions": graphqa_webquestions_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [331]:
np.save("preference_dataset/graphlanguagemodeling_graphquestionanswering_webquestions_preference_dataset.npy", GraphQuestionAnswering_WebQuestions_preference_benchmark_dict)


### （5）WikiTableQuestions（TableQA）

数据地址：https://ppasupat.github.io/WikiTableQuestions/

该数据集为TableQA，可将Table cell转换为三元组形式。
- 对于一个表格，第一行为表头，接下来是每一行表元素
- 首先按照列来统计，将元素种类最多的列，作为主列
- 对于每一行，构建一个graph，由主列对应的元素作为头实体，并与当前行的其他列元素尾实体
- 假设有ABCDE共五列，以D为主列，那么每一行元素可以构建一个graph：(D->A, D->B, D->C, D->E)。

## 2.3 Graph Node Classification

In [339]:
from scipy.sparse import csr_matrix

### （1）CoRA
The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.

```
{'name': 'cora',
 'order': 2708, # 共2708个节点
 'size': 5429, # 共5429条边
 'node_feature_number': 1433, # 节点的特征向量长度是1433
 'node_class_number': 7, # 节点类别有7个
 'is_directed': True,
 'is_weighted': False
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://aistudio.baidu.com/projectdetail/2246479?shared=1

- 自行划分训练集和测试集：训练集1708个节点，测试集1000个节点。
- 每个节点的特征向量为1433长度的one-hot表征，以稀疏矩阵类型存储
- 由于节点特征只是向量，并非文字，其一般可用于GNN，但无法用于节点描述，因此不予使用节点特征；
- 构造instruction数据集时，graph language中只添加目标节点编号、目标节点的邻居子图、邻居子图中每个节点的类别；
- 划分训练集和测试集时候，确保节点不存在重叠，但允许邻居子图中的节点出现重叠，因为不涉及到模型参数的更新，因此不会存在数据泄露问题。

In [332]:
cora_data_dir = "CoRA/"

**处理CoRA数据集**

In [333]:
def load_cora(data_dir):
    import pickle
    with open(os.path.join(data_dir, "cora"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
cora_data = load_cora(cora_data_dir)

In [334]:
print(cora_data.keys())
print(cora_data["data_profile"])
print(len(cora_data["nodes"].keys()))
print(cora_data["nodes"][109323])
print(cora_data["nodes"][109323]['features'].toarray().shape)
print(sum([i for i in cora_data["nodes"][109323]['features'].toarray()[0]]))
print(len(cora_data["links"].keys()))
print(cora_data["links"])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'cora', 'order': 2708, 'size': 5429, 'node_feature_number': 1433, 'node_class_number': 7, 'is_directed': True, 'is_weighted': False}
2708
{'features': <1x1433 sparse matrix of type '<class 'numpy.float32'>'
	with 1433 stored elements in Compressed Sparse Row format>, 'label': 'Probabilistic_Methods'}
(1, 1433)
21.0
5429
{(35, 1033): 1, (35, 103482): 1, (35, 103515): 1, (35, 1050679): 1, (35, 1103960): 1, (35, 1103985): 1, (35, 1109199): 1, (35, 1112911): 1, (35, 1113438): 1, (35, 1113831): 1, (35, 1114331): 1, (35, 1117476): 1, (35, 1119505): 1, (35, 1119708): 1, (35, 1120431): 1, (35, 1123756): 1, (35, 1125386): 1, (35, 1127430): 1, (35, 1127913): 1, (35, 1128204): 1, (35, 1128227): 1, (35, 1128314): 1, (35, 1128453): 1, (35, 1128945): 1, (35, 1128959): 1, (35, 1128985): 1, (35, 1129018): 1, (35, 1129027): 1, (35, 1129573): 1, (35, 1129683): 1, (35, 1129778): 1, (35, 1130847): 1, (35, 1130856): 1, (35, 1131116): 1, (35, 1131360): 

In [335]:
def cora_generate_features(cora_data):
    ### 1. 首先获得每个节点的标签，并统计所有可能的类别
    node_list = list(cora_data["nodes"].keys())
    node2label = dict()
    classes = set()
    for node_id, info in tqdm(cora_data["nodes"].items()):
        label = info["label"].replace("_", " ")
        node2label[node_id] = label
        classes.add(label)
    ### 2. 对于每个节点，获得其2-hop邻接子图
    node2adj = dict() # 1-hop子图
    for (head_id, tail_id), _ in tqdm(cora_data["links"].items()):
        if head_id not in node2adj.keys():
            node2adj[head_id] = list()
        node2adj[head_id].append(tail_id)
    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    for node_id, adj in tqdm(node2adj.items()):
        if node_id not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_id] = set()
            node2adj_2hop_nodes[node_id] = {node_id}
        for onehop_tail_id in adj:
            node2adj_2hop_triples[node_id].add((node_id, onehop_tail_id))
            node2adj_2hop_nodes[node_id].add(onehop_tail_id)
            if onehop_tail_id in node2adj.keys():
                for twohop_tail_id in node2adj[onehop_tail_id]:
                    # if twohop_tail_id == node_id:
                    #     continue
                    node2adj_2hop_triples[node_id].add((onehop_tail_id, twohop_tail_id))
                    node2adj_2hop_nodes[node_id].add(twohop_tail_id)
    ### 3. 划分训练集/测试集
    # 剔除掉可能不存在于graph中的节点
    new_node_list = list()
    for node_id in node_list:
        if node_id in node2adj_2hop_triples.keys():
            new_node_list.append(node_id)
    shuffle(new_node_list)
    test_node_list, train_node_list = new_node_list[:1000], new_node_list[1000:]
    print("train num: {}".format(len(train_node_list)))
    print("test num: {}".format(len(test_node_list)))
    return train_node_list, test_node_list, {
        "node2adj_2hop_triples": node2adj_2hop_triples,
        "node2adj_2hop_nodes": node2adj_2hop_nodes,
        "node2label": node2label,
        "classes": classes
    }
cora_train_node_list, cora_test_node_list, cora_features = cora_generate_features(cora_data)

100%|██████████| 2708/2708 [00:00<00:00, 694392.32it/s]
100%|██████████| 5429/5429 [00:00<00:00, 879252.31it/s]
100%|██████████| 1565/1565 [00:00<00:00, 103670.20it/s]

train num: 565
test num: 1000





In [336]:
print(cora_features["node2adj_2hop_triples"][109323])
print(cora_features["node2adj_2hop_nodes"][109323])
print(cora_features["node2label"][109323])
print(cora_features["node2label"][137849])
print(cora_features["node2label"][17242])
print(cora_features["node2label"][1115291])
print(cora_features["classes"])

{(137849, 17242), (137849, 1115291), (109323, 137849)}
{137849, 17242, 109323, 1115291}
Probabilistic Methods
Probabilistic Methods
Probabilistic Methods
Probabilistic Methods
{'Probabilistic Methods', 'Theory', 'Genetic Algorithms', 'Case Based', 'Rule Learning', 'Reinforcement Learning', 'Neural Networks'}


**构造instruction数据集**

In [337]:
def nodecls_cora_graph_language(task_name: str, node_list: list, graph: list, target_node, feature):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    publication_node_list = <node_list>;
    publication_node_feature = <feature>;
    target_publication_node = <target_node>;
    citation_triple_list = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"paper_{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"paper_{}\" -> \"paper_{}\")".format(triple[0], triple[1]) for triple in graph]) + "];"
    feature = "[" + ", ".join(["\"paper_{}\".category=\"{}\"".format(node, label) for node, label in feature.items()]) + "]"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_node>", "\"paper_{}\"".format(str(target_node))).replace("<feature>", feature)


In [451]:
def nodecls_cora_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False, target_node=None, feature=None):
    # 配置 instruction prompt
    task_name = "cora-scientific-publications-citation-graph"
    instruction = "Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of seven categories, such as 'Rule Learning', 'Genetic Algorithms', 'Theory', 'Case Based', 'Neural Networks', 'Reinforcement Learning' and 'Probabilistic Methods'."
    edge_list = triples
    node_list = entities
    gcl = nodecls_cora_graph_language(task_name, node_list, edge_list, target_node, feature)
    query = "Q: Please classify the target scientific publication."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "node_feature": feature,
        }
    }

def nodecls_cora_dataset_unfaithful_answer(data: list, node2label: dict, node2adj_2hop_nodes: dict, node2adj_2hop_triples: dict, classes: dict):
    task_name = "graph-language-modeling-graph-node-cls-cora"
    final_data = list()
    
    for ei, node_id in enumerate(tqdm(data)):
        label = node2label[node_id]
        node_list = node2adj_2hop_nodes[node_id]
        triple_list = node2adj_2hop_triples[node_id]
        feature = {node: node2label[node] for node in node_list if node != node_id}
        instruction = nodecls_cora_instruction_unfaithful_answer(label, node_list, triple_list, target_node=node_id, feature=feature)

        answer_positive = label
        answer_negative_candidates = ['Rule Learning', 'Genetic Algorithms', 'Theory', 'Case Based', 'Neural Networks', 'Reinforcement Learning', 'Probabilistic Methods']
        while True:
            answer_negative = answer_negative_candidates[random.randint(0, len(answer_negative_candidates) - 1)]
            if answer_negative != answer_positive:
                break
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [answer_positive],
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "CoRA",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

nodecls_cora_instruction_data_unfaithful_answer = {
    "train": nodecls_cora_dataset_unfaithful_answer(data=cora_train_node_list, **cora_features),
    "test": nodecls_cora_dataset_unfaithful_answer(data=cora_test_node_list, **cora_features),
}

100%|██████████| 565/565 [00:00<00:00, 16438.21it/s]


total number: 565


100%|██████████| 1000/1000 [00:00<00:00, 21255.49it/s]

total number: 1000





In [452]:
print(nodecls_cora_instruction_data_unfaithful_answer["test"][122]["instruction"])
print(nodecls_cora_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(nodecls_cora_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="cora-scientific-publications-citation-graph"] {
    publication_node_list = ["paper_166420", "paper_1119751"];
    publication_node_feature = ["paper_1119751".category="Theory"];
    target_publication_node = "paper_166420";
    citation_triple_list = [("paper_166420" -> "paper_1119751")];
}
```
Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of seven categories, such as 'Rule Learning', 'Genetic Algorithms', 'Theory', 'Case Based', 'Neural Networks', 'Reinforcement Learning' and 'Probabilistic Methods'.
Q: Please classify the targ

In [453]:
# 构造为如下格式
nodecls_cora_preference_data = {
    "train": nodecls_cora_instruction_data_unfaithful_answer["train"],
    "test": nodecls_cora_instruction_data_unfaithful_answer["test"],
}
GraphNodeCLS_CoRA_preference_benchmark_dict = {
    "graph-language-modeling-graph-nodel-cls-cora": nodecls_cora_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [454]:
np.save("preference_dataset/graphlanguagemodeling_graphnodecls_cora_preference_dataset.npy", GraphNodeCLS_CoRA_preference_benchmark_dict)


### （2）CiteSeer

The CiteSeer dataset consists of 3312 scientific publications classified into one of six classes. The citation network consists of 4732 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 3703 unique words.

```
{'name': 'citeseer',
 'order': 3312, # 节点总数3312
 'size': 4715, # 边总数4715
 'node_feature_number': 3703, # 节点特征向量长度3703
 'node_class_number': 6, # 节点类别数量
 'is_directed': True,
 'is_weighted': False
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://paperswithcode.com/dataset/citeseer

- 自行划分训练集和测试集：训练集个节点，测试集1000个节点。
- 每个节点的特征向量为3703长度的one-hot表征，以稀疏矩阵类型存储
- 由于节点特征只是向量，并非文字，其一般可用于GNN，但无法用于节点描述，因此不予使用节点特征；
- 构造instruction数据集时，graph language中只添加目标节点编号、目标节点的邻居子图、邻居子图中每个节点的类别；
- 划分训练集和测试集时候，确保节点不存在重叠，但允许邻居子图中的节点出现重叠，因为不涉及到模型参数的更新，因此不会存在数据泄露问题。

In [345]:
citeseer_data_dict = "CiteSeer/"

**加载CiteSeer数据集**

In [346]:
def load_citeseer(data_dir):
    import pickle
    with open(os.path.join(data_dir, "citeseer"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
citeseer_data = load_citeseer(citeseer_data_dict)

In [347]:
citeseer_data["data_profile"]

{'name': 'citeseer',
 'order': 3312,
 'size': 4715,
 'node_feature_number': 3703,
 'node_class_number': 6,
 'is_directed': True,
 'is_weighted': False}

In [621]:
print(citeseer_data.keys())
print(citeseer_data["data_profile"])
print(len(citeseer_data["nodes"].keys()))
print(citeseer_data["nodes"][123])
print(citeseer_data["nodes"][123]['features'].toarray().shape)
print(sum([i for i in citeseer_data["nodes"][123]['features'].toarray()[0]]))
print(len(citeseer_data["links"].keys()))
print(citeseer_data["links"])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'citeseer', 'order': 3312, 'size': 4715, 'node_feature_number': 3703, 'node_class_number': 6, 'is_directed': True, 'is_weighted': False}
3312
{'features': <1x3703 sparse matrix of type '<class 'numpy.float32'>'
	with 3703 stored elements in Compressed Sparse Row format>, 'label': 'ML'}
(1, 3703)
43.0
4715
{(0, 0): 1, (0, 99): 1, (0, 111): 1, (0, 381): 1, (0, 415): 1, (0, 514): 1, (0, 585): 1, (0, 690): 1, (0, 691): 1, (0, 783): 1, (0, 784): 1, (0, 954): 1, (1, 153): 1, (1, 732): 1, (1, 1937): 1, (2177, 2903): 1, (1011, 1034): 1, (1011, 2028): 1, (1011, 2029): 1, (2179, 2200): 1, (2, 962): 1, (1012, 2181): 1, (1012, 2031): 1, (1013, 1417): 1, (1013, 1427): 1, (1013, 1545): 1, (1013, 2024): 1, (1014, 1059): 1, (1014, 1365): 1, (1014, 1378): 1, (1015, 1236): 1, (1015, 1439): 1, (1015, 1440): 1, (1015, 2093): 1, (2183, 2677): 1, (2183, 845): 1, (2183, 3156): 1, (1016, 2138): 1, (3, 634): 1, (2184, 2306): 1, (2184, 2356): 1, (2185, 2539

In [348]:
def citeseer_generate_features(citeseer_data):
    ### 1. 首先获得每个节点的标签，并统计所有可能的类别
    node_list = list(citeseer_data["nodes"].keys())
    node2label = dict()
    classes = set()
    for node_id, info in tqdm(citeseer_data["nodes"].items()):
        label = info["label"].replace("_", " ")
        node2label[node_id] = label
        classes.add(label)
    ### 2. 对于每个节点，获得其2-hop邻接子图
    node2adj = dict() # 1-hop子图
    for (head_id, tail_id), _ in tqdm(citeseer_data["links"].items()):
        if head_id not in node2adj.keys():
            node2adj[head_id] = list()
        node2adj[head_id].append(tail_id)
    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    for node_id, adj in tqdm(node2adj.items()):
        if node_id not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_id] = set()
            node2adj_2hop_nodes[node_id] = {node_id}
        for onehop_tail_id in adj:
            node2adj_2hop_triples[node_id].add((node_id, onehop_tail_id))
            node2adj_2hop_nodes[node_id].add(onehop_tail_id)
            if onehop_tail_id in node2adj.keys():
                for twohop_tail_id in node2adj[onehop_tail_id]:
                    # if twohop_tail_id == node_id:
                    #     continue
                    node2adj_2hop_triples[node_id].add((onehop_tail_id, twohop_tail_id))
                    node2adj_2hop_nodes[node_id].add(twohop_tail_id)
    ### 3. 划分训练集/测试集
    # 剔除掉可能不存在于graph中的节点
    new_node_list = list()
    for node_id in node_list:
        if node_id in node2adj_2hop_triples.keys():
            new_node_list.append(node_id)
    shuffle(new_node_list)
    test_node_list, train_node_list = new_node_list[:1000], new_node_list[1000:]
    print("train num: {}".format(len(train_node_list)))
    print("test num: {}".format(len(test_node_list)))
    return train_node_list, test_node_list, {
        "node2adj_2hop_triples": node2adj_2hop_triples,
        "node2adj_2hop_nodes": node2adj_2hop_nodes,
        "node2label": node2label,
        "classes": classes
    }
citeseer_train_node_list, citeseer_test_node_list, citeseer_features = citeseer_generate_features(citeseer_data)

100%|██████████| 3312/3312 [00:00<00:00, 732251.06it/s]
100%|██████████| 4715/4715 [00:00<00:00, 842865.08it/s]
100%|██████████| 1951/1951 [00:00<00:00, 161558.25it/s]

train num: 951
test num: 1000





In [569]:
citeseer_features["classes"]

{'AI', 'Agents', 'DB', 'HCI', 'IR', 'ML'}

**构建instruction数据集**

In [349]:
def nodecls_citeseer_graph_language(task_name: str, node_list: list, graph: list, target_node, feature):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    publication_node_list = <node_list>;
    publication_node_feature = <feature>;
    target_publication_node = <target_node>;
    citation_triple_list = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"paper_{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"paper_{}\" -> \"paper_{}\")".format(triple[0], triple[1]) for triple in graph]) + "];"
    feature = "[" + ", ".join(["\"paper_{}\".category=\"{}\"".format(node, label) for node, label in feature.items()]) + "]"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_node>", "\"paper_{}\"".format(str(target_node))).replace("<feature>", feature)


In [448]:
def nodecls_citeseer_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False, target_node=None, feature=None):
    # 配置 instruction prompt
    task_name = "citeseer-scientific-publications-citation-graph"
    instruction = "Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of six categories, such as 'AI', 'Agents', 'DB', 'HCI', 'IR', and 'ML'."
    edge_list = triples
    node_list = entities
    gcl = nodecls_citeseer_graph_language(task_name, node_list, edge_list, target_node, feature)
    query = "Q: Please classify the target scientific publication."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "node_feature": feature,
        }
    }

def nodecls_citeseer_dataset_unfaithful_answer(data: list, node2label: dict, node2adj_2hop_nodes: dict, node2adj_2hop_triples: dict, classes: dict):
    task_name = "graph-language-modeling-graph-node-cls-citeseer"
    final_data = list()
    
    for ei, node_id in enumerate(tqdm(data)):
        label = node2label[node_id]
        node_list = node2adj_2hop_nodes[node_id]
        triple_list = node2adj_2hop_triples[node_id]
        feature = {node: node2label[node] for node in node_list if node != node_id}
        instruction = nodecls_citeseer_instruction_unfaithful_answer(label, node_list, triple_list, target_node=node_id, feature=feature)

        answer_positive = label
        answer_negative_candidates = ['AI', 'Agents', 'DB', 'HCI', 'IR', 'ML']
        while True:
            answer_negative = answer_negative_candidates[random.randint(0, len(answer_negative_candidates) - 1)]
            if answer_negative != answer_positive:
                break
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [answer_positive],
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "CiteSeer",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

nodecls_citeseer_instruction_data_unfaithful_answer = {
    "train": nodecls_citeseer_dataset_unfaithful_answer(data=citeseer_train_node_list, **citeseer_features),
    "test": nodecls_citeseer_dataset_unfaithful_answer(data=citeseer_test_node_list, **citeseer_features),
}

100%|██████████| 951/951 [00:00<00:00, 26497.73it/s]


total number: 951


100%|██████████| 1000/1000 [00:00<00:00, 27239.63it/s]

total number: 1000





In [351]:
print(nodecls_citeseer_instruction_data_unfaithful_answer["test"][122]["instruction"])
print(nodecls_citeseer_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(nodecls_citeseer_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="citeseer-scientific-publications-citation-graph"] {
    publication_node_list = ["paper_3259", "paper_2477"];
    publication_node_feature = ["paper_2477".category="ML"];
    target_publication_node = "paper_3259";
    citation_triple_list = [("paper_3259" -> "paper_2477")];
}
```
Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of six categories, such as 'AI', 'Agents', 'DB', 'HCI', 'IR', and 'ML'.
Q: Please classify the target scientific publication.
A:
['HCI']
['AI']


In [449]:
# 构造为如下格式
nodecls_citeseer_preference_data = {
    "train": nodecls_citeseer_instruction_data_unfaithful_answer["train"],
    "test": nodecls_citeseer_instruction_data_unfaithful_answer["test"],
}
GraphNodeCLS_CiteSeer_preference_benchmark_dict = {
    "graph-language-modeling-graph-nodel-cls-citeseer": nodecls_citeseer_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [450]:
np.save("preference_dataset/graphlanguagemodeling_graphnodecls_citeseer_preference_dataset.npy", GraphNodeCLS_CiteSeer_preference_benchmark_dict)


### （3）PubMed

The Pubmed dataset consists of 19717 scientific publications from PubMed database pertaining to diabetes classified into one of three classes. The citation network consists of 44338 links. Each publication in the dataset is described by a TF/IDF weighted word vector from a dictionary which consists of 500 unique words.

```
{'name': 'pubmed',
 'order': 19717, # 节点数量
 'size': 44324, # 边数量
 'node_feature_number': 500, # 每个节点的表征向量长度
 'node_class_number': 3, # 节点类别数量
 'is_directed': True,
 'is_weighted': False
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://paperswithcode.com/dataset/pubmed

- 自行划分训练集和测试集：训练集500个节点，测试集1000个节点。
- 每个节点的特征向量为3703长度的one-hot表征，以稀疏矩阵类型存储
- 由于节点特征只是向量，并非文字，其一般可用于GNN，但无法用于节点描述，因此不予使用节点特征；
- 构造instruction数据集时，graph language中只添加目标节点编号、目标节点的邻居子图、邻居子图中每个节点的类别；
- 划分训练集和测试集时候，确保节点不存在重叠，但允许邻居子图中的节点出现重叠，因为不涉及到模型参数的更新，因此不会存在数据泄露问题。

In [354]:
pubmed_data_dict = "PubMed/"

**加载PubMed数据集**

In [355]:
def load_pubmed(data_dir):
    import pickle
    with open(os.path.join(data_dir, "pubmed"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
pubmed_data = load_pubmed(pubmed_data_dict)

In [630]:
pubmed_data["data_profile"]

{'name': 'pubmed',
 'order': 19717,
 'size': 44324,
 'node_feature_number': 500,
 'node_class_number': 3,
 'is_directed': True,
 'is_weighted': False}

In [631]:
print(pubmed_data.keys())
print(pubmed_data["data_profile"])
print(len(pubmed_data["nodes"].keys()))
print(pubmed_data["nodes"][123])
print(pubmed_data["nodes"][123]['features'].toarray().shape)
print(sum([i for i in pubmed_data["nodes"][123]['features'].toarray()[0]]))
print(len(pubmed_data["links"].keys()))
print(pubmed_data["links"])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'pubmed', 'order': 19717, 'size': 44324, 'node_feature_number': 500, 'node_class_number': 3, 'is_directed': True, 'is_weighted': False}
19717
{'features': <1x500 sparse matrix of type '<class 'numpy.float32'>'
	with 500 stored elements in Compressed Sparse Row format>, 'label': '0'}
(1, 500)
2.0965993646532297
44324
{(0, 1378): 1, (0, 1544): 1, (0, 6092): 1, (0, 7636): 1, (0, 14442): 1, (1, 2943): 1, (1, 8359): 1, (1, 10199): 1, (2, 10471): 1, (2, 11485): 1, (2, 15572): 1, (3, 8249): 1, (4, 14044): 1, (5, 1312): 1, (5, 12968): 1, (6, 767): 1, (6, 2128): 1, (6, 2216): 1, (6, 3150): 1, (6, 3509): 1, (6, 4464): 1, (6, 6572): 1, (6, 6697): 1, (6, 7296): 1, (6, 7335): 1, (6, 7691): 1, (6, 8661): 1, (6, 8981): 1, (6, 9232): 1, (6, 10265): 1, (6, 12098): 1, (6, 13655): 1, (6, 13656): 1, (6, 16720): 1, (6, 17284): 1, (6, 18121): 1, (6, 18614): 1, (7, 1568): 1, (7, 1588): 1, (7, 2019): 1, (7, 2343): 1, (7, 4058): 1, (7, 5564): 1, (7, 6242):

In [356]:
def pubmed_generate_features(pubmed_data):
    ### 1. 首先获得每个节点的标签，并统计所有可能的类别
    node_list = list(pubmed_data["nodes"].keys())
    node2label = dict()
    classes = set()
    for node_id, info in tqdm(pubmed_data["nodes"].items()):
        label = info["label"].replace("_", " ")
        node2label[node_id] = "class_{}".format(label)
        classes.add("class_{}".format(label))
    ### 2. 对于每个节点，获得其2-hop邻接子图
    node2adj = dict() # 1-hop子图
    for (head_id, tail_id), _ in tqdm(pubmed_data["links"].items()):
        if head_id not in node2adj.keys():
            node2adj[head_id] = list()
        node2adj[head_id].append(tail_id)
    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    for node_id, adj in tqdm(node2adj.items()):
        if node_id not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_id] = set()
            node2adj_2hop_nodes[node_id] = {node_id}
        for onehop_tail_id in adj:
            node2adj_2hop_triples[node_id].add((node_id, onehop_tail_id))
            node2adj_2hop_nodes[node_id].add(onehop_tail_id)
            if onehop_tail_id in node2adj.keys():
                for twohop_tail_id in node2adj[onehop_tail_id]:
                    # if twohop_tail_id == node_id:
                    #     continue
                    node2adj_2hop_triples[node_id].add((onehop_tail_id, twohop_tail_id))
                    node2adj_2hop_nodes[node_id].add(twohop_tail_id)
    ### 3. 划分训练集/测试集
    # 剔除掉可能不存在于graph中的节点
    new_node_list = list()
    for node_id in node_list:
        if node_id in node2adj_2hop_triples.keys():
            new_node_list.append(node_id)
    shuffle(new_node_list)
    test_node_list, train_node_list = new_node_list[:2000], new_node_list[2000:]
    print("train num: {}".format(len(train_node_list)))
    print("test num: {}".format(len(test_node_list)))
    return train_node_list, test_node_list, {
        "node2adj_2hop_triples": node2adj_2hop_triples,
        "node2adj_2hop_nodes": node2adj_2hop_nodes,
        "node2label": node2label,
        "classes": classes
    }
pubmed_train_node_list, pubmed_test_node_list, pubmed_features = pubmed_generate_features(pubmed_data)

100%|██████████| 19717/19717 [00:00<00:00, 492760.99it/s]
100%|██████████| 44324/44324 [00:00<00:00, 1047518.41it/s]
100%|██████████| 12971/12971 [00:00<00:00, 110562.60it/s]

train num: 10971
test num: 2000





In [633]:
pubmed_features["classes"]

{'class_0', 'class_1', 'class_2'}

**构建instruction数据集**

In [357]:
def nodecls_pubmed_graph_language(task_name: str, node_list: list, graph: list, target_node, feature):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    publication_node_list = <node_list>;
    publication_node_feature = <feature>;
    target_publication_node = <target_node>;
    citation_triple_list = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"paper_{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"paper_{}\" -> \"paper_{}\")".format(triple[0], triple[1]) for triple in graph]) + "];"
    feature = "[" + ", ".join(["\"paper_{}\".category=\"{}\"".format(node, label) for node, label in feature.items()]) + "]"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_node>", "\"paper_{}\"".format(str(target_node))).replace("<feature>", feature)


In [438]:
def nodecls_pubmed_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False, target_node=None, feature=None):
    # 配置 instruction prompt
    task_name = "pubmed-scientific-publications-citation-graph"
    instruction = "Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of three categories, such as 'class_0', 'class_1' and 'class_2'."
    edge_list = triples
    node_list = entities
    gcl = nodecls_pubmed_graph_language(task_name, node_list, edge_list, target_node, feature)
    query = "Q: Please classify the target scientific publication."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "node_feature": feature,
        }
    }

def nodecls_pubmed_dataset_unfaithful_answer(data: list, node2label: dict, node2adj_2hop_nodes: dict, node2adj_2hop_triples: dict, classes: dict):
    task_name = "graph-language-modeling-graph-node-cls-pubmed"
    final_data = list()
    
    for ei, node_id in enumerate(tqdm(data)):
        label = node2label[node_id]
        node_list = node2adj_2hop_nodes[node_id]
        triple_list = node2adj_2hop_triples[node_id]
        feature = {node: node2label[node] for node in node_list if node != node_id}
        instruction = nodecls_pubmed_instruction_unfaithful_answer(label, node_list, triple_list, target_node=node_id, feature=feature)

        answer_positive = label
        answer_negative_candidates = ['class_0', 'class_1', 'class_2']
        while True:
            answer_negative = answer_negative_candidates[random.randint(0, len(answer_negative_candidates) - 1)]
            if answer_negative != answer_positive:
                break
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [answer_positive],
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "PubMed",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

nodecls_pubmed_instruction_data_unfaithful_answer = {
    "train": nodecls_pubmed_dataset_unfaithful_answer(data=pubmed_train_node_list, **pubmed_features),
    "test": nodecls_pubmed_dataset_unfaithful_answer(data=pubmed_test_node_list, **pubmed_features),
}

100%|██████████| 10971/10971 [00:00<00:00, 18694.61it/s]


total number: 10971


100%|██████████| 2000/2000 [00:00<00:00, 18952.85it/s]

total number: 2000





In [437]:
print(nodecls_pubmed_instruction_data_unfaithful_answer["test"][545]["instruction"])
print(nodecls_pubmed_instruction_data_unfaithful_answer["test"][545]["answer_positive"])
print(nodecls_pubmed_instruction_data_unfaithful_answer["test"][545]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="pubmed-scientific-publications-citation-graph"] {
    publication_node_list = ["paper_14464", "paper_17152", "paper_8966", "paper_16135", "paper_14096", "paper_14609", "paper_19091", "paper_7445", "paper_18069", "paper_10647", "paper_16664", "paper_18043", "paper_16796", "paper_14750", "paper_19615", "paper_12960", "paper_16164", "paper_5797", "paper_12582", "paper_4775", "paper_12967", "paper_17956", "paper_13614", "paper_19632", "paper_14001", "paper_14513", "paper_16307", "paper_19120", "paper_19255", "paper_15672", "paper_15040", "paper_15434", "paper_12491", "paper_19276", "paper_17230", "paper_14036", "paper_16469", 

In [439]:
# 构造为如下格式
nodecls_pubmed_preference_data = {
    "train": nodecls_pubmed_instruction_data_unfaithful_answer["train"],
    "test": nodecls_pubmed_instruction_data_unfaithful_answer["test"],
}
GraphNodeCLS_PubMed_preference_benchmark_dict = {
    "graph-language-modeling-graph-nodel-cls-pubmed": nodecls_pubmed_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [440]:
np.save("preference_dataset/graphlanguagemodeling_graphnodecls_pubmed_preference_dataset.npy", GraphNodeCLS_PubMed_preference_benchmark_dict)


### （4）OGBN-ArXiv

- OGB（斯坦福）评测基准官网：https://ogb.stanford.edu/
- 数据集描述：https://ogb.stanford.edu/docs/nodeprop/#ogbn-arxiv
- 手动下载地址：http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip，保存到OGB/OGBN-ArXiv；
- 在OGB/OGBN-ArXiv目录下，下载paper title & abstract信息：https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz

数据构造：
- 训练集采样30k，测试集采样3k；
- 每个样本选择2-hop子图，每跳最多采样5个节点，最多25个三元组；
- 再instruction中，每个样本类别采样10个；

In [362]:
import json
import os
from tqdm import tqdm
from random import shuffle
import random
import numpy as np
import datasets
from datasets import load_dataset
import csv
import gzip

In [363]:
from ogb.nodeproppred import NodePropPredDataset


dataset = NodePropPredDataset(name="ogbn-arxiv")

split_idx = dataset.get_idx_split()
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
graph, label = dataset[0] # graph: library-agnostic graph object

ModuleNotFoundError: No module named 'ogb'

In [620]:
graph

{'edge_index': array([[104447,  15858, 107156, ...,  45118,  45118,  45118],
        [ 13091,  47283,  69161, ..., 162473, 162537,  72717]]),
 'edge_feat': None,
 'node_feat': array([[-0.057943, -0.05253 , -0.072603, ...,  0.173364, -0.172796,
         -0.140059],
        [-0.1245  , -0.070665, -0.325202, ...,  0.068524, -0.372111,
         -0.301036],
        [-0.080242, -0.023328, -0.183787, ...,  0.109919,  0.117589,
         -0.139883],
        ...,
        [-0.22053 , -0.036568, -0.402199, ...,  0.11336 , -0.161393,
         -0.145171],
        [-0.138236,  0.040885, -0.251811, ..., -0.08929 , -0.041253,
         -0.376132],
        [-0.029875,  0.268417, -0.161124, ...,  0.120807,  0.077647,
         -0.091018]], dtype=float32),
 'node_year': array([[2013],
        [2015],
        [2014],
        ...,
        [2020],
        [2020],
        [2020]]),
 'num_nodes': 169343}

In [364]:
ogbn_arxiv_data_dir = "OGB/OGBN-ArXiv/"

In [365]:
def load_ogbn_arxiv(data_dir):
    # 读取title abstract
    with open(os.path.join(data_dir, "titleabs.tsv"), "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    paper_id2titleabs = dict()
    for line in tqdm(lines):
        paper_id, title, abstract = line.strip().split("\t")
        paper_id2titleabs[paper_id] = {
            "title": title,
            "abstract": abstract,
        }
    # 读取nodeid2paperid
    node_id2paper_id = dict()
    with open(os.path.join(data_dir, "mapping", "nodeidx2paperid.csv"), "r", encoding="utf-8") as fr:
        lines = fr.readlines()
        for line in tqdm(lines[1:]):
            line = line.strip()
            node_id, paper_id = line.split(",")
            node_id2paper_id[node_id] = paper_id
    # print(node_id2pape_rid.keys())
    # 获取labelid2arxivcategory
    label_id2name = dict()
    with open(os.path.join(data_dir, "mapping", "labelidx2arxivcategeory.csv"), "r", encoding="utf-8") as fr:
        lines = fr.readlines()
        for line in tqdm(lines[1:]):
            line = line.strip()
            label_id, label_name = line.split(",")
            label_id2name[label_id] = label_name.replace(" ", ".")
    # 读取边信息
    edge_list = list()
    with gzip.open(os.path.join(data_dir, "raw", "edge.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for row in tqdm(reader):
            edge_list.append((row[0], row[1]))

    # 根据边信息，构建一个2-hop子图
    node2adj = dict() # 1-hop子图
    node2adj_triples = dict()
    for head_id, tail_id in tqdm(edge_list):
        if head_id not in node2adj.keys():
            node2adj[head_id] = list()
            node2adj_triples[head_id] = list()
        node2adj[head_id].append(tail_id)
        node2adj_triples[head_id].append((head_id, tail_id))
    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    for node_id, adj in tqdm(node2adj.items()):
        if node_id not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_id] = set()
            node2adj_2hop_nodes[node_id] = {node_id}
        adj_random = adj
        shuffle(adj_random)
        adj_random = adj_random[:6] # 只采样5个1-hop邻居
        for onehop_tail_id in adj_random:
            node2adj_2hop_triples[node_id].add((node_id, onehop_tail_id))
            node2adj_2hop_nodes[node_id].add(onehop_tail_id)
            if onehop_tail_id in node2adj.keys():
                tail_adj_random = node2adj[onehop_tail_id]
                shuffle(tail_adj_random)
                tail_adj_random = tail_adj_random[:3]
                for twohop_tail_id in tail_adj_random:
                    # if twohop_tail_id == node_id:
                    #     continue
                    node2adj_2hop_triples[node_id].add((onehop_tail_id, twohop_tail_id))
                    node2adj_2hop_nodes[node_id].add(twohop_tail_id)
    
    # 读取节点的标签
    node_id2label_id = dict()
    with gzip.open(os.path.join(data_dir, "raw", "node-label.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            node_id2label_id[str(ei)] = row[0]
    # 读取节点对应的年份
    node_id2year = dict()
    with gzip.open(os.path.join(data_dir, "raw", "node_year.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            node_id2year[str(ei)] = row[0]
    
    # 读取训练集、验证集、测试集对应的node id索引
    train_idx, valid_idx, test_idx = list(), list(), list()
    with gzip.open(os.path.join(data_dir, "split", "time", "train.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            train_idx.append(row[0])
    with gzip.open(os.path.join(data_dir, "split", "time", "valid.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            valid_idx.append(row[0])
    with gzip.open(os.path.join(data_dir, "split", "time", "test.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            test_idx.append(row[0])

    # 训练集、测试集分别采样20k和2k
    shuffle(train_idx)
    shuffle(test_idx)
    train_idx = train_idx[:30000]
    test_idx = test_idx[:3000]
    
    def process(data):
        examples = list()
        for node_id in tqdm(data):
            paper_id = node_id2paper_id[node_id]
            paper_year = node_id2year[node_id]
            paper_label = label_id2name[node_id2label_id[node_id]]
            paper_title, paper_abstract = paper_id2titleabs[paper_id]["title"], paper_id2titleabs[paper_id]["abstract"]
            if node_id not in node2adj_2hop_triples.keys():
                continue
            node_paper_adj = node2adj_2hop_triples[node_id]
            adj_node_dict = dict()
            # print(node_paper_adj)
            for (head_node_id, tail_node_id) in node_paper_adj:
                head_paper_id, tail_paper_id = node_id2paper_id[head_node_id], node_id2paper_id[tail_node_id]
                if head_paper_id != paper_id and head_paper_id not in adj_node_dict.keys():
                    adj_node_dict[head_paper_id] = {
                        "year": node_id2year[head_node_id],
                        "title": paper_id2titleabs[head_paper_id]["title"],
                        # "abstract": paper_id2titleabs[head_paper_id]["abstract"],
                        "label": label_id2name[node_id2label_id[head_node_id]],
                    }
                if tail_paper_id != paper_id and tail_paper_id not in adj_node_dict.keys():
                    adj_node_dict[tail_paper_id] = {
                        "year": node_id2year[tail_node_id],
                        "title": paper_id2titleabs[tail_paper_id]["title"],
                        # "abstract": paper_id2titleabs[tail_paper_id]["abstract"],
                        "label": label_id2name[node_id2label_id[tail_node_id]],
                    }
            examples.append({
                "paper_id": paper_id,
                "paper_year": paper_year,
                "paper_label": paper_label,
                "paper_title": paper_title,
                "paper_abstract": paper_abstract,
                "paper_adj_triple": [(node_id2paper_id[node[0]], node_id2paper_id[node[1]]) for node in node_paper_adj],
                "adj_node_dict": adj_node_dict,
            })
        return examples
    
    return process(train_idx), process(valid_idx), process(test_idx), label_id2name
    
    
ogbnarxiv_train_data, ogbnarxiv_valid_data, ogbnarxiv_test_data, ogbnarxiv_label_id2name = load_ogbn_arxiv(ogbn_arxiv_data_dir)



100%|██████████| 179719/179719 [00:00<00:00, 441774.14it/s]
100%|██████████| 169343/169343 [00:00<00:00, 1945188.96it/s]
100%|██████████| 40/40 [00:00<00:00, 652809.96it/s]
1166243it [00:00, 1536676.45it/s]
100%|██████████| 1166243/1166243 [00:00<00:00, 1860715.65it/s]
100%|██████████| 151903/151903 [00:03<00:00, 38709.10it/s]
169343it [00:00, 1420074.90it/s]
169343it [00:00, 1391661.16it/s]
90941it [00:00, 1831476.09it/s]
29799it [00:00, 1753870.38it/s]
48603it [00:00, 1833614.48it/s]
100%|██████████| 30000/30000 [00:00<00:00, 44562.73it/s]
100%|██████████| 29799/29799 [00:00<00:00, 30620.78it/s]
100%|██████████| 3000/3000 [00:00<00:00, 24362.59it/s]


In [85]:
ogbnarxiv_train_data[0]

{'paper_id': '2294627763',
 'paper_year': '2016',
 'paper_label': 'arxiv.cs.ai',
 'paper_title': 'dimension coupling optimal active learning of halfspaces via query synthesis',
 'paper_abstract': 'In this paper, we consider the problem of actively learning a linear classifier through query synthesis where the learner can construct artificial queries in order to estimate the true decision boundaries. This problem has recently gained a lot of interest in automated science and adversarial reverse engineering for which only heuristic algorithms are known. In such applications, queries can be constructed de novo to elicit information (e.g., automated science) or to evade detection with minimal cost (e.g., adversarial reverse engineering). #R##N#We develop a general framework, called dimension coupling (DC), that 1) reduces a d-dimensional learning problem to d-1 low-dimensional sub-problems, 2) solves each sub-problem efficiently, and 3) appropriately aggregates the results and outputs a li

In [675]:
# with gzip.open(os.path.join(ogbn_arxiv_data_dir, "split", "time", "train.csv.gz"), "rt") as fr:
#     reader = csv.reader(fr, delimiter=',')
#     for ei, row in enumerate(tqdm(reader)):
#         print(row)

**构建instruction数据集**

In [366]:
def nodecls_ogbnarxiv_graph_language(
    task_name: str, node_list: list, graph: list, target_node, feature, 
    target_title, target_abstract, target_year,
):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    publication_node_list = <node_list>;
    publication_node_feature = <feature>;
    citation_triple_list = <triple_list>;
    target_publication_node = <target_node>;
    <target_node>.title = \"<target_title>\";
    <target_node>.abstract = \"<target_abstract>\";
    <target_node>.year = \"<target_year>\";\n}\n```"""
    node_list = "[" + ", ".join(["\"paper_{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"paper_{}\" -> \"paper_{}\")".format(triple[0], triple[1]) for triple in graph]) + "]"
    feature = "[" + ", ".join(["\"paper_{}\".feature=\"{}\"".format(node, label) for node, label in feature.items()]) + "]"
    
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))\
    .replace("<target_node>", "\"paper_{}\"".format(str(target_node))).replace("<feature>", feature)\
    .replace("<target_title>", target_title).replace("<target_abstract>", target_abstract).replace("<target_year>", target_year)

In [441]:
def nodecls_ogbnarxiv_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False, target_node=None, 
                                  feature=None, target_title=None, target_abstract=None, target_year=None):
    # 标签太多，进行下采样10个label
    random_label_list = list(ogbnarxiv_label_id2name.values())
    shuffle(random_label_list)
    random_label_list = random_label_list[:9] + [text]
    random_label_list = list(set(random_label_list))
    shuffle(random_label_list)
    
    # 配置 instruction prompt
    task_name = "arxiv-scientific-publications-citation-graph"
    instruction = "Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of {}.".format(", ".join(list(random_label_list)))
    edge_list = triples
    node_list = entities
    gcl = nodecls_ogbnarxiv_graph_language(task_name, node_list, edge_list, target_node, feature, target_title, target_abstract, target_year)
    query = "Q: Please classify the target scientific publication."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "node_feature": feature,
        }
    }

def nodecls_ogbnarxiv_dataset_unfaithful_answer(data: list):
    task_name = "graph-language-modeling-graph-node-cls-ogbn-arxiv"
    final_data = list()
    
    for ei, example in enumerate(tqdm(data)):
        paper_label = example["paper_label"]
        paper_id = example["paper_id"]
        paper_year = example["paper_year"]
        paper_title = example["paper_title"]
        paper_abstract = example["paper_abstract"]
        paper_adj_triple = example["paper_adj_triple"]
        adj_node_dict = example["adj_node_dict"]

        
        node_list = [paper_id] + list(adj_node_dict.keys())
        triple_list = paper_adj_triple
        feature = adj_node_dict
        instruction = nodecls_ogbnarxiv_instruction_unfaithful_answer(
            paper_label, node_list, triple_list, target_node=paper_id, feature=feature,
            target_title=paper_title, target_abstract=paper_abstract, target_year=paper_year,
        )

        answer_positive = paper_label
        answer_negative_candidates = list(ogbnarxiv_label_id2name.values())
        while True:
            answer_negative = answer_negative_candidates[random.randint(0, len(answer_negative_candidates) - 1)]
            if answer_negative != answer_positive:
                break
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [paper_label],
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "OGBN-ArXiv",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

nodecls_ogbnarxiv_instruction_data_unfaithful_answer = {
    "train": nodecls_ogbnarxiv_dataset_unfaithful_answer(ogbnarxiv_train_data),
    "test": nodecls_ogbnarxiv_dataset_unfaithful_answer(ogbnarxiv_test_data),
}

100%|██████████| 25272/25272 [00:01<00:00, 15772.60it/s]


total number: 25272


100%|██████████| 2960/2960 [00:00<00:00, 11825.46it/s]

total number: 2960





In [368]:
print(nodecls_ogbnarxiv_instruction_data_unfaithful_answer["test"][122]["instruction"])
print(nodecls_ogbnarxiv_instruction_data_unfaithful_answer["test"][122]["answer_positive"])
print(nodecls_ogbnarxiv_instruction_data_unfaithful_answer["test"][122]["answer_negative"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="arxiv-scientific-publications-citation-graph"] {
    publication_node_list = ["paper_2971171734", "paper_2170857652", "paper_2891945150", "paper_2619228097", "paper_2954405932", "paper_2155929351", "paper_2950622308", "paper_2588334501", "paper_2951617294", "paper_2788708648", "paper_2952198716", "paper_1793987952", "paper_2951840971", "paper_2251147786", "paper_2951737564", "paper_2953180101", "paper_1509146554", "paper_2000213460"];
    publication_node_feature = ["paper_2170857652".feature="{'year': '2014', 'title': 'the effect of wording on message propagation topic and author controlled natural experiments on twitter'

In [442]:
# 构造为如下格式
nodecls_ogbnarxiv_preference_data = {
    "train": nodecls_ogbnarxiv_instruction_data_unfaithful_answer["train"],
    "test": nodecls_ogbnarxiv_instruction_data_unfaithful_answer["test"],
}
GraphNodeCLS_OGBNArXiv_preference_benchmark_dict = {
    "graph-language-modeling-graph-nodel-cls-ogbn-arxiv": nodecls_ogbnarxiv_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [443]:
np.save("preference_dataset/graphlanguagemodeling_graphnodecls_ogbnarxiv_preference_dataset.npy", GraphNodeCLS_OGBNArXiv_preference_benchmark_dict)


### （5）OGBN-Product

- 数据集描述：https://ogb.stanford.edu/docs/nodeprop/#ogbn-products
- 手动下载：https://snap.stanford.edu/ogb/data/nodeproppred/products.zip

数据构造：
- 训练集采样20k，测试集采样2k；
- 每个样本选择2-hop子图，每跳最多采样5个节点，最多25个三元组；
- 再instruction中，每个样本类别采样10个；

In [371]:
import json
import os
from tqdm import tqdm
from random import shuffle
import random
import numpy as np
import datasets
from datasets import load_dataset
import csv
import gzip

In [372]:
ogbn_products_data_dir = "OGB/OGBN-Products/"

In [373]:
def load_ogbn_products(data_dir):
    # 读取title abstract
    # with open(os.path.join(data_dir, "titleabs.tsv"), "r", encoding="utf-8") as fr:
    #     lines = fr.readlines()
    # paper_id2titleabs = dict()
    # for line in tqdm(lines):
    #     paper_id, title, abstract = line.strip().split("\t")
    #     paper_id2titleabs[paper_id] = {
    #         "title": title,
    #         "abstract": abstract,
    #     }
    # 读取nodeid2paperid
    node_id2paper_id = dict()
    # with open(os.path.join(data_dir, "mapping", "nodeidx2asin.csv.gz"), "r", encoding="utf-8") as fr:
    #     lines = fr.readlines()
    with gzip.open(os.path.join(data_dir, "mapping", "nodeidx2asin.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for row in tqdm(reader):
            node_id, paper_id = row[0], row[1]
            node_id2paper_id[node_id] = paper_id
    # print(node_id2pape_rid.keys())
    # 获取labelid2arxivcategory
    label_id2name = dict()
    # with open(os.path.join(data_dir, "mapping", "labelidx2arxivcategeory.csv"), "r", encoding="utf-8") as fr:
    #     lines = fr.readlines()
    with gzip.open(os.path.join(data_dir, "mapping", "labelidx2productcategory.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for row in tqdm(reader):
            label_id, label_name = row[0], row[1]
            label_id2name[label_id] = label_name.replace(" ", ".")
    # 读取边信息
    edge_list = list()
    with gzip.open(os.path.join(data_dir, "raw", "edge.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for row in tqdm(reader):
            edge_list.append((row[0], row[1]))

    # 根据边信息，构建一个2-hop子图
    node2adj = dict() # 1-hop子图
    node2adj_triples = dict()
    for head_id, tail_id in tqdm(edge_list):
        if head_id not in node2adj.keys():
            node2adj[head_id] = list()
            node2adj_triples[head_id] = list()
        node2adj[head_id].append(tail_id)
        node2adj_triples[head_id].append((head_id, tail_id))
    node2adj_2hop_triples = dict() # 2-hop子图中的所有三元组
    node2adj_2hop_nodes = dict() # 2-hop子图中的所有节点
    for node_id, adj in tqdm(node2adj.items()):
        if node_id not in node2adj_2hop_triples.keys():
            node2adj_2hop_triples[node_id] = set()
            node2adj_2hop_nodes[node_id] = {node_id}
        adj_random = adj
        shuffle(adj_random)
        adj_random = adj_random[:5] # 只采样5个1-hop邻居
        for onehop_tail_id in adj_random:
            node2adj_2hop_triples[node_id].add((node_id, onehop_tail_id))
            node2adj_2hop_nodes[node_id].add(onehop_tail_id)
            if onehop_tail_id in node2adj.keys():
                tail_adj_random = node2adj[onehop_tail_id]
                shuffle(tail_adj_random)
                tail_adj_random = tail_adj_random[:5]
                for twohop_tail_id in tail_adj_random:
                    # if twohop_tail_id == node_id:
                    #     continue
                    node2adj_2hop_triples[node_id].add((onehop_tail_id, twohop_tail_id))
                    node2adj_2hop_nodes[node_id].add(twohop_tail_id)
    
    # 读取节点的标签
    node_id2label_id = dict()
    with gzip.open(os.path.join(data_dir, "raw", "node-label.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            node_id2label_id[str(ei)] = row[0]
    # # 读取节点对应的年份
    # node_id2year = dict()
    # with gzip.open(os.path.join(data_dir, "raw", "node_year.csv.gz"), "rt") as fr:
    #     reader = csv.reader(fr, delimiter=',')
    #     for ei, row in enumerate(tqdm(reader)):
    #         node_id2year[str(ei)] = row[0]
    
    # 读取训练集、验证集、测试集对应的node id索引
    train_idx, valid_idx, test_idx = list(), list(), list()
    with gzip.open(os.path.join(data_dir, "split", "sales_ranking", "train.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            train_idx.append(row[0])
    with gzip.open(os.path.join(data_dir, "split", "sales_ranking", "valid.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            valid_idx.append(row[0])
    with gzip.open(os.path.join(data_dir, "split", "sales_ranking", "test.csv.gz"), "rt") as fr:
        reader = csv.reader(fr, delimiter=',')
        for ei, row in enumerate(tqdm(reader)):
            test_idx.append(row[0])

    # 训练集、测试集分别采样20k和2k
    shuffle(train_idx)
    shuffle(test_idx)
    train_idx = train_idx[:20000]
    test_idx = test_idx[:2000]
    
    def process(data):
        examples = list()
        for node_id in tqdm(data):
            paper_id = node_id2paper_id[node_id]
            # paper_year = node_id2year[node_id]
            paper_label = label_id2name[node_id2label_id[node_id]]
            # paper_title, paper_abstract = paper_id2titleabs[paper_id]["title"], paper_id2titleabs[paper_id]["abstract"]
            if node_id not in node2adj_2hop_nodes.keys():
                continue
            node_paper_adj = node2adj_2hop_triples[node_id]
            adj_node_dict = dict()
            # print(node_paper_adj)
            for (head_node_id, tail_node_id) in node_paper_adj:
                head_paper_id, tail_paper_id = node_id2paper_id[head_node_id], node_id2paper_id[tail_node_id]
                if head_paper_id != paper_id and head_paper_id not in adj_node_dict.keys():
                    adj_node_dict[head_paper_id] = {
                        # "year": node_id2year[head_node_id],
                        # "title": paper_id2titleabs[head_paper_id]["title"],
                        # "abstract": paper_id2titleabs[head_paper_id]["abstract"],
                        "label": label_id2name[node_id2label_id[head_node_id]],
                    }
                if tail_paper_id != paper_id and tail_paper_id not in adj_node_dict.keys():
                    adj_node_dict[tail_paper_id] = {
                        # "year": node_id2year[tail_node_id],
                        # "title": paper_id2titleabs[tail_paper_id]["title"],
                        # "abstract": paper_id2titleabs[tail_paper_id]["abstract"],
                        "label": label_id2name[node_id2label_id[tail_node_id]],
                    }
            examples.append({
                "product_id": paper_id,
                # "product_year": paper_year,
                "product_label": paper_label,
                # "paper_title": paper_title,
                # "paper_abstract": paper_abstract,
                "product_adj_triple": [(node_id2paper_id[node[0]], node_id2paper_id[node[1]]) for node in node_paper_adj],
                "adj_node_dict": adj_node_dict,
            })
        return examples
    
    return process(train_idx), process(valid_idx), process(test_idx), label_id2name
    
    
ogbnproducts_train_data, ogbnproducts_valid_data, ogbnproducts_test_data, ogbnproducts_label_id2name = load_ogbn_products(ogbn_products_data_dir)



2449030it [00:02, 1169610.19it/s]
48it [00:00, 119694.76it/s]
61859140it [00:41, 1474358.13it/s]
100%|██████████| 61859140/61859140 [01:54<00:00, 539237.63it/s] 
100%|██████████| 2120086/2120086 [03:52<00:00, 9107.09it/s]  
2449029it [00:01, 1378574.85it/s]
196615it [00:00, 1829673.41it/s]
39323it [00:00, 1755498.72it/s]
2213091it [00:01, 2015450.39it/s]
100%|██████████| 20000/20000 [00:01<00:00, 13565.09it/s]
100%|██████████| 39323/39323 [00:02<00:00, 16392.58it/s]
100%|██████████| 2000/2000 [00:00<00:00, 24976.28it/s]


In [27]:
ogbnproducts_train_data[0]

{'product_id': 'B00B6E8POM',
 'product_label': 'Toys.&.Games',
 'product_adj_triple': [('B00A8ELH3W', 'B004FLKOY2'),
  ('B00HDAG7DO', 'B00CEQ1CLO'),
  ('B00B6E8POM', 'B000EOASEK'),
  ('0800788222', 'B009PBAEF6'),
  ('0800788222', '081186958X'),
  ('B00A8ELH3W', 'B004S4ZZ0Y'),
  ('0800788222', 'B0009Z3IPK'),
  ('B00HDAG7DO', 'B00CRFQTWY'),
  ('B000ID31Z0', 'B000CIQ4DC'),
  ('B00B6E8POM', 'B00HDAG7DO'),
  ('B00B6E8POM', '0800788222'),
  ('B000EOASEK', 'B0055E7V6W'),
  ('B00HDAG7DO', 'B00D78DBPS'),
  ('B000ID31Z0', 'B0027AAO98'),
  ('B00HDAG7DO', 'B0009OUB2O'),
  ('B00A8ELH3W', 'B0033AH41M'),
  ('B000EOASEK', 'B002VK3O0K'),
  ('B000ID31Z0', 'B00844ME9Y'),
  ('B000EOASEK', 'B000F4G5K0'),
  ('B00B6E8POM', 'B000ID31Z0'),
  ('0800788222', 'B00486Y8Z0'),
  ('B00A8ELH3W', 'B005DEW3J4'),
  ('B00HDAG7DO', 'B00DVFNV7U'),
  ('B000EOASEK', 'B00EDY1XKU'),
  ('B000ID31Z0', 'B000IZ9N78'),
  ('0800788222', 'B004S5AC10'),
  ('B000ID31Z0', 'B0073Y2DHO'),
  ('B00A8ELH3W', 'B0019Z1PTU'),
  ('B000EOASEK', 'B

In [675]:
# with gzip.open(os.path.join(ogbn_arxiv_data_dir, "split", "time", "train.csv.gz"), "rt") as fr:
#     reader = csv.reader(fr, delimiter=',')
#     for ei, row in enumerate(tqdm(reader)):
#         print(row)

**构建instruction数据集**

In [374]:
def nodecls_ogbnproducts_graph_language(
    task_name: str, node_list: list, graph: list, target_node, feature
):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    product_node_list = <node_list>;
    product_node_feature = <feature>;
    product_triple_list = <triple_list>;
    target_product_node = <target_node>;\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")".format(triple[0], triple[1]) for triple in graph]) + "]"
    feature = "[" + ", ".join(["\"{}\".category=\"{}\"".format(node, label["label"]) for node, label in feature.items()]) + "]"
    
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))\
    .replace("<target_node>", "\"{}\"".format(str(target_node))).replace("<feature>", feature)


In [444]:
def nodecls_ogbnproducts_instruction_unfaithful_answer(text: str, entities: list, triples: list, do_print: bool = False, target_node=None, feature=None):
    # 标签太多，进行下采样10个label
    random_label_list = list(ogbnproducts_label_id2name.values())
    shuffle(random_label_list)
    random_label_list = random_label_list[:9] + [text]
    random_label_list = list(set(random_label_list))
    shuffle(random_label_list)
    
    # 配置 instruction prompt
    task_name = "product-graph"
    instruction = "Task definition: given a product and corresponding graph, classify the target product into one of {}.".format(", ".join(random_label_list))
    edge_list = triples
    node_list = entities
    gcl = nodecls_ogbnproducts_graph_language(task_name, node_list, edge_list, target_node, feature)
    query = "Q: Please classify the target product."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
            "node_feature": feature,
        }
    }

def nodecls_ogbnproducts_dataset_unfaithful_answer(data: list):
    task_name = "graph-language-modeling-graph-node-cls-ogbn-products"
    final_data = list()
    
    for ei, example in enumerate(tqdm(data)):
        paper_label = example["product_label"]
        paper_id = example["product_id"]
        paper_adj_triple = example["product_adj_triple"]
        adj_node_dict = example["adj_node_dict"]

        
        node_list = [paper_id] + list(adj_node_dict.keys())
        triple_list = paper_adj_triple
        feature = adj_node_dict
        instruction = nodecls_ogbnproducts_instruction_unfaithful_answer(
            paper_label, node_list, triple_list, target_node=paper_id, feature=feature
        )

        
        answer_positive = paper_label
        answer_negative_candidates = list(ogbnproducts_label_id2name.values())
        try:
            while True:
                answer_negative = answer_negative_candidates[random.randint(0, len(answer_negative_candidates) - 1)]
                if answer_negative != answer_positive:
                    break
        except:
            continue
    
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer_positive": [paper_label],
            "answer_negative": [answer_negative],
            "hallucination_type": "unfaithful_answer",
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "OGBN-Products",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

nodecls_ogbnproducts_instruction_data_unfaithful_answer = {
    "train": nodecls_ogbnproducts_dataset_unfaithful_answer(ogbnproducts_train_data),
    "test": nodecls_ogbnproducts_dataset_unfaithful_answer(ogbnproducts_test_data),
}

100%|██████████| 19984/19984 [00:01<00:00, 16099.88it/s]


total number: 19984


100%|██████████| 1719/1719 [00:00<00:00, 20439.72it/s]

total number: 1719





In [445]:
# 构造为如下格式
nodecls_ogbnproducts_preference_data = {
    "train": nodecls_ogbnproducts_instruction_data_unfaithful_answer["train"],
    "test": nodecls_ogbnproducts_instruction_data_unfaithful_answer["test"],
}
GraphNodeCLS_OGBNProducts_preference_benchmark_dict = {
    "graph-language-modeling-graph-nodel-cls-ogbn-products": nodecls_ogbnproducts_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [446]:
np.save("preference_dataset/graphlanguagemodeling_graphnodecls_ogbnproducts_preference_dataset.npy", GraphNodeCLS_OGBNProducts_preference_benchmark_dict)


## 2.4 Graph Link Prediction

In [None]:
# ### （1）OGBN-Collab

# - 数据集描述：https://ogb.stanford.edu/docs/linkprop/#ogbl-collab
# - 手动下载：https://snap.stanford.edu/ogb/data/linkproppred//collab.zip

# 数据构造：
# - 训练集采样20k，测试集采样2k；
# - 每个样本选择2-hop子图，每跳最多采样5个节点，最多25个三元组；
# - 再instruction中，每个样本类别采样10个；

### （1）Wikidata5M

直接使用Wikidata5M数据集本身（Wikidata5M是一个三元组组成的知识库，本身数据集目的是用于训练知识图谱表征，可以支持link prediction任务）

任务定义：给定两个节点，预测其关系；

预测方法：
- 预测时，分别为头实体和尾实体构建一个1-hop子图，且确保子图中不包含这两个实体的关系边；
- 双重采样：1、每个关系对应的所有尾实体，最多采样5个实体；2、最后所有三元组最多采样30个
- 设计instruciton预测这两个实体的关系

训练集80k，测试集5133

**加载wikidata5m知识图谱**

In [656]:
wikidata5m_path = "Wiki/wikidata5m"
entity_file = os.path.join(wikidata5m_path, "wikidata5m_entity.txt")
relation_file = os.path.join(wikidata5m_path, "wikidata5m_relation.txt")
triple_file = os.path.join(wikidata5m_path, "wikidata5m_all_triplet.txt") 

In [657]:
def load_wikidata5m_entity_file(entity_file):
    entity_qid2names = dict() # 保存实体文件中，每个qid对应的所有可能的实体名称
    entity_name2qid = dict() # 保存实体文件中，每个实体名称对应的qid
    all_entity_name_list = list()
    print("loading entity ...")
    with open(entity_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        line = line.strip()
        entity_qid = line.split("\t")[0]
        entity_name_list = line.split("\t")[1:]
        all_entity_name_list.extend(entity_name_list)
        entity_qid2names[entity_qid] = entity_name_list
        for entity_name in entity_name_list:
            entity_name2qid[entity_name] = entity_qid
    return entity_qid2names, entity_name2qid, all_entity_name_list

wikidata5m_entity_qid2names, wikidata5m_entity_name2qid, wikidata5m_all_entity_name_list = load_wikidata5m_entity_file(entity_file)

loading entity ...


100%|██████████| 4813491/4813491 [00:18<00:00, 266050.00it/s]


In [658]:
def load_wikidata5m_relation_file(relation_file):
    relation_pid2names = dict() # 保存关系文件中，每个pid对应的所有可能的关系名称
    relation_name2pid = dict() # 保存关系文件中，每个关系名称对应的pid
    print("loading relation ...")
    with open(relation_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        line = line.strip()
        relation_pid = line.split("\t")[0]
        relation_name_list = line.split("\t")[1:]
        relation_pid2names[relation_pid] = relation_name_list
        for relation_name in relation_name_list:
            relation_name2pid[relation_name] = relation_pid
    return relation_pid2names, relation_name2pid

wikidata5m_relation_pid2names, wikidata5m_relation_name2pid = load_wikidata5m_relation_file(relation_file)

loading relation ...


100%|██████████| 825/825 [00:00<00:00, 276029.10it/s]


In [659]:
# 加载wikidata5M知识图谱
def load_wikidata5m_triple(triple_file):
    triple_qpqlist = list() # 保存所有三元组（qid，pid，qid）
    entity_qid_adj = dict() # 保存每个实体qid对应的邻接表
    print("loading triple ...")
    with open(triple_file, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in tqdm(lines):
        head_qid, relation_pid, tail_qid = line.strip().split("\t")
        triple_qpqlist.append((head_qid, relation_pid, tail_qid))
        if head_qid not in entity_qid_adj.keys():
            entity_qid_adj[head_qid] = dict()
        if relation_pid not in entity_qid_adj[head_qid].keys():
            entity_qid_adj[head_qid][relation_pid] = list()
        entity_qid_adj[head_qid][relation_pid].append(tail_qid)
    return triple_qpqlist, entity_qid_adj
    
wikidata5m_triple_qpqlist, wikidata5m_entity_qid_adj = load_wikidata5m_triple(triple_file)

loading triple ...


100%|██████████| 21354359/21354359 [01:56<00:00, 182577.87it/s]


**加载训练集、测试集**

In [660]:
def load_wikidata5m_transductive_data(data_dir):
    trans_data_dir = os.path.join(data_dir, "wikidata5m_transductive")

    def load_data(data_dir, data_file):
        # 加载数据集
        examples = list()
        with open(os.path.join(data_dir, data_file), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        for line in tqdm(lines):
            line = line.strip().split("\t")
            examples.append((line[0], line[1], line[2]))
        return examples
    
    def process(examples):
        # 处理数据集
        final_examples = list()
        for example in tqdm(examples):
            head_id, rel_id, tail_id = example
            try:
                head_name, rel_name, tail_name = wikidata5m_entity_qid2names[head_id][0], wikidata5m_relation_pid2names[rel_id][0], wikidata5m_entity_qid2names[tail_id][0]
            except:
                continue
            # 头实体和尾实体分别获得1-hop子图
            # 双重采样：1、每个关系对应的所有尾实体，最多采样5个实体；2、最后所有三元组最多采样30个
            head_adj_triples, tail_adj_triples = list(), list()
            random_relation_pid = list(wikidata5m_relation_pid2names.keys())
            shuffle(random_relation_pid)
            random_relation_pid = set([wikidata5m_relation_pid2names[pid][0] for pid in random_relation_pid[:6]])
            candidate_relation_classes = random_relation_pid
            candidate_relation_classes.add(rel_name)
            all_entity_nodes = set()
            all_entity_nodes.add(head_name)
            all_entity_nodes.add(tail_name)
            status = False
            if head_id in wikidata5m_entity_qid_adj.keys():
                status = True
                for head_rel_id, head_1hop_ents in wikidata5m_entity_qid_adj[head_id].items():
                    if head_rel_id in wikidata5m_relation_pid2names.keys():
                        head_rel_name = wikidata5m_relation_pid2names[head_rel_id][0]
                        candidate_relation_classes.add(head_rel_name)
                        random_head_1hop_ents = head_1hop_ents
                        shuffle(random_head_1hop_ents)
                        random_head_1hop_ents = random_head_1hop_ents[:5]
                        for head_1hop_ent in random_head_1hop_ents:
                            if head_1hop_ent == tail_id or head_1hop_ent not in wikidata5m_entity_qid2names.keys():
                                continue
                            head_1hop_ent_name = wikidata5m_entity_qid2names[head_1hop_ent][0]
                            head_adj_triples.append((head_name, head_rel_name, head_1hop_ent_name))
                            all_entity_nodes.add(head_1hop_ent_name)
            if tail_id in wikidata5m_entity_qid_adj.keys():
                status = True
                for tail_rel_id, tail_1hop_ents in  wikidata5m_entity_qid_adj[tail_id].items():
                    if tail_rel_id in wikidata5m_relation_pid2names.keys():
                        tail_rel_name = wikidata5m_relation_pid2names[tail_rel_id][0]
                        candidate_relation_classes.add(tail_rel_name)
                        random_tail_1hop_ents = tail_1hop_ents
                        shuffle(random_tail_1hop_ents)
                        random_tail_1hop_ents = random_tail_1hop_ents[:5]
                        for tail_1hop_ent in random_tail_1hop_ents:
                            if tail_1hop_ent == head_id or tail_1hop_ent not in wikidata5m_entity_qid2names.keys():
                                continue
                            tail_1hop_ent_name = wikidata5m_entity_qid2names[tail_1hop_ent][0]
                            tail_adj_triples.append((tail_name, tail_rel_name, tail_1hop_ent_name))
                            all_entity_nodes.add(tail_1hop_ent_name)

            if len(head_adj_triples + tail_adj_triples) > 30:
                head_adj_triples = head_adj_triples[:15]
                tail_adj_triples = tail_adj_triples[:15]
            for (head_name, _, tail_name) in head_adj_triples + tail_adj_triples:
                all_entity_nodes.add(head_name)
                all_entity_nodes.add(tail_name)
            all_entity_nodes = list(all_entity_nodes)
            
            if status is True and len(head_adj_triples) >= 2 and len(tail_adj_triples) >= 2:
                final_examples.append({
                    "head_name": head_name,
                    "tail_name": tail_name,
                    "label": rel_name,
                    "candidate_relation_classes": list(candidate_relation_classes),
                    "head_adj_triples": head_adj_triples,
                    "tail_adj_triples": tail_adj_triples,
                    "all_entity_nodes": all_entity_nodes
                })
        return final_examples
    train_data, test_data = load_data(trans_data_dir, "wikidata5m_transductive_train.txt"), load_data(trans_data_dir, "wikidata5m_transductive_test.txt")
    shuffle(train_data)
    train_data = train_data[:80000]
    
    return process(train_data), process(test_data)

wikidata5m_linkprediction_train_data, wikidata5m_linkprediction_test_data = load_wikidata5m_transductive_data(wikidata5m_path)

FileNotFoundError: [Errno 2] No such file or directory: 'Wiki/wikidata5m/wikidata5m_transductive/wikidata5m_transductive_train.txt'

In [265]:
wikidata5m_linkprediction_train_data[3]

{'head_name': 'simon (first name)',
 'tail_name': 'french vocabulary',
 'label': 'given name',
 'candidate_relation_classes': ["Wikimedia portal's main topic",
  'place of birth',
  'medical condition',
  'child astronomical body',
  'vessel class',
  'writing system',
  'country of citizenship',
  'given name',
  'said to be the same as',
  'charted in',
  'instance of',
  'name day',
  'language of work or name',
  'home world',
  'family name identical to this given name'],
 'head_adj_triples': [('simon wulfse', 'instance of', 'Huamn'),
  ('simon wulfse', 'place of birth', 'history of dordrecht'),
  ('simon wulfse', 'country of citizenship', 'Reino Hulandes')],
 'tail_adj_triples': [('simon (first name)',
   'said to be the same as',
   'simeon the pious'),
  ('simon (first name)', 'said to be the same as', 'szymon (given name)'),
  ('simon (first name)', 'said to be the same as', 'Simone (first name)'),
  ('simon (first name)', 'writing system', 'Latin alphabet letters'),
  ('simon

**构造Instruction数据集**

In [231]:
def linkpred_wikidata5m_graph_language(
    task_name: str, node_list: list, graph: list, head_node, tail_node
):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>;
    head_entity = "<head_node>";
    tail_entity = "<tail_node>";\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)) \
            .replace("<head_node>", head_node).replace("<tail_node>", tail_node)

def linkpred_wikidata5m_instruction(head_entity: str, tail_entity: str, relation: str, entities: list, triples: list, candidate_relation_classes: list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "wikidata-knowledge-graph"
    instruction = "Task definition: given one head entity and tail entity and corresponding one-hop knowledge sub-graph, classify the relation between head entity and tail entity into one of {}.".format(", ".join(candidate_relation_classes))
    edge_list = triples
    node_list = entities
    gcl = linkpred_wikidata5m_graph_language(task_name, node_list, edge_list, head_entity, tail_entity)
    query = "Q: Please classify the relation between head entity and tail entity."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [232]:
def linkpred_wikidata5m_dataset(data: list):
    task_name = "graph-language-modeling-graph-link-prediction-wikidata5m"
    final_data = list()
    
    for ei, example in enumerate(tqdm(data)):
        head_name = example["head_name"]
        tail_name = example["tail_name"]
        label = example["label"]
        candidate_relation_classes = ["\"{}\"".format(i) for i in example["candidate_relation_classes"]]
        head_adj_triples = example["head_adj_triples"]
        tail_adj_triples = example["tail_adj_triples"]
        all_entity_nodes = example["all_entity_nodes"]

        node_list = all_entity_nodes
        triple_list = head_adj_triples + tail_adj_triples
        instruction = linkpred_wikidata5m_instruction(head_name, tail_name, label, all_entity_nodes, head_adj_triples + tail_adj_triples, candidate_relation_classes)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [label],
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "Wikidata5M",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [266]:
linkpred_wikidat5m_instruction_data = {
    "train": linkpred_wikidata5m_dataset(wikidata5m_linkprediction_train_data),
    "test": linkpred_wikidata5m_dataset(wikidata5m_linkprediction_test_data),
}

100%|██████████| 68131/68131 [00:02<00:00, 31851.78it/s]


total number: 68131


100%|██████████| 4390/4390 [00:00<00:00, 33525.35it/s]

total number: 4390





In [761]:
print(linkpred_wikidat5m_instruction_data["train"][932]["instruction"])
print(linkpred_wikidat5m_instruction_data["train"][932]["answer"])

NameError: name 'linkpred_wikidat5m_instruction_data' is not defined

In [268]:
# 构造为如下格式
GraphLinkPred_Wikidata5M_instruction_benchmark_dict = {
    "graph-language-modeling-graph-link-prediction-wikidata5m": linkpred_wikidat5m_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [269]:
np.save("instruction_dataset/graphlanguagemodeling_graphlinkprediction_wikidata5m_instruction_dataset.npy", GraphLinkPred_Wikidata5M_instruction_benchmark_dict)


### （2）FB15k-237

数据地址：https://huggingface.co/datasets/KGraph/FB15k-237/tree/main/data

任务定义：给定两个节点，预测其关系；

预测方法：
- 预测时，分别为头实体和尾实体构建一个1-hop子图，且确保子图中不包含这两个实体的关系边；
- 双重采样：1、每个关系对应的所有尾实体，最多采样5个实体；2、最后所有三元组最多采样30个
- 设计instruciton预测这两个实体的关系

训练集50k，测试集3k

In [184]:
fb15k237_data_path = "FB15k-237/"

In [270]:
def load_fb15k237_data(data_path):

    # 加载entity_id2name
    entity_id2name, entity_id2text = dict(), dict()
    with open(os.path.join(data_path, "FB15k_mid2name.txt"), "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        line = line.strip()
        entity_id, entity_name = line.split("\t")
        entity_id2name[entity_id] = entity_name.replace("_", " ")
    # 加载entity_id2description
    with open(os.path.join(data_path, "FB15k_mid2description.txt"), "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        line = line.strip()
        entity_id, entity_text = line.split("\t")
        entity_id2text[entity_id] = entity_text


    def load_data(data_file):
        examples = list()
        with open(os.path.join(data_path, data_file), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        for line in tqdm(lines):
            line = line.strip()
            head_id, rel_name, tail_id = line.split("\t")
            rel_name = rel_name.split("/")[-1] # 只取最后一个名称
            examples.append((head_id, rel_name, tail_id))
        return examples

    # 加载所有triple
    train_data, valid_data, test_data = load_data("train.txt"), load_data("valid.txt"), load_data("test.txt")
    print("train_data_num=", len(train_data))
    print("valid_data_num=", len(valid_data))
    print("test_data_num=", len(test_data))

    shuffle(train_data)
    train_data = train_data[:50000]
    test_data = test_data[:3000]

    # 构建一个邻接子图
    entity_id2adj = dict()
    all_relation_classes = set()
    for (head_id, rel_name, tail_id) in tqdm(train_data + valid_data + test_data):
        all_relation_classes.add(rel_name)
        if head_id not in entity_id2adj.keys():
            entity_id2adj[head_id] = dict()
        if rel_name not in entity_id2adj[head_id].keys():
            entity_id2adj[head_id][rel_name] = list()
        entity_id2adj[head_id][rel_name].append(tail_id)
    all_relation_classes = list(all_relation_classes)
    
    def process(examples):
        # 处理数据集
        final_examples = list()
        for example in tqdm(examples):
            head_id, rel_id, tail_id = example
            try:
                head_name, tail_name = entity_id2name[head_id], entity_id2name[tail_id]
            except:
                continue
            # 头实体和尾实体分别获得1-hop子图
            # 双重采样：1、每个关系对应的所有尾实体，最多采样5个实体；2、最后所有三元组最多采样30个
            head_adj_triples, tail_adj_triples = list(), list()
            random_relation_names = all_relation_classes
            shuffle(random_relation_names)
            random_relation_names = random_relation_names[:6]
            candidate_relation_classes = set(random_relation_names)
            candidate_relation_classes.add(rel_name)
            all_entity_nodes = set()
            all_entity_nodes.add(head_name)
            all_entity_nodes.add(tail_name)
            status = False
            if head_id in entity_id2adj.keys():
                status = True
                for head_rel_name, head_1hop_ents in entity_id2adj[head_id].items():
                    candidate_relation_classes.add(head_rel_name)
                    random_head_1hop_ents = head_1hop_ents
                    shuffle(random_head_1hop_ents)
                    random_head_1hop_ents = random_head_1hop_ents[:5]
                    for head_1hop_ent in random_head_1hop_ents:
                        if head_1hop_ent == tail_id or head_1hop_ent not in entity_id2name.keys():
                            continue
                        head_1hop_ent_name = entity_id2name[head_1hop_ent]
                        head_adj_triples.append((head_name, head_rel_name, head_1hop_ent_name))
                        all_entity_nodes.add(head_1hop_ent_name)
            if tail_id in entity_id2adj.keys():
                status = True
                for tail_rel_name, tail_1hop_ents in  entity_id2adj[tail_id].items():
                    candidate_relation_classes.add(tail_rel_name)
                    random_tail_1hop_ents = tail_1hop_ents
                    shuffle(random_tail_1hop_ents)
                    random_tail_1hop_ents = random_tail_1hop_ents[:5]
                    for tail_1hop_ent in random_tail_1hop_ents:
                        if tail_1hop_ent == head_id or tail_1hop_ent not in entity_id2name.keys():
                            continue
                        tail_1hop_ent_name = entity_id2name[tail_1hop_ent]
                        tail_adj_triples.append((tail_name, tail_rel_name, tail_1hop_ent_name))
                        # all_entity_nodes.add(tail_1hop_ent_name)

            if len(head_adj_triples + tail_adj_triples) > 30:
                head_adj_triples = head_adj_triples[:15]
                tail_adj_triples = tail_adj_triples[:15]
            for (head_name, _, tail_name) in head_adj_triples + tail_adj_triples:
                all_entity_nodes.add(head_name)
                all_entity_nodes.add(tail_name)
            all_entity_nodes = list(all_entity_nodes)
            
            if status is True and len(head_adj_triples) >= 2 and len(tail_adj_triples) >= 2:
                final_examples.append({
                    "head_name": head_name,
                    "tail_name": tail_name,
                    "head_text": entity_id2text[head_id] if head_id in entity_id2text.keys() else "",
                    "tail_text": entity_id2text[tail_id] if tail_id in entity_id2text.keys() else "",
                    "label": rel_name,
                    "candidate_relation_classes": list(candidate_relation_classes),
                    "head_adj_triples": head_adj_triples,
                    "tail_adj_triples": tail_adj_triples,
                    "all_entity_nodes": all_entity_nodes
                })
        return final_examples

    return process(train_data), process(test_data) 

fb15k237_linkprediction_train_data, fb15k237_linkprediction_test_data = load_fb15k237_data(fb15k237_data_path)

100%|██████████| 272115/272115 [00:00<00:00, 1271459.26it/s]
100%|██████████| 17535/17535 [00:00<00:00, 1231140.81it/s]
100%|██████████| 20466/20466 [00:00<00:00, 1250300.42it/s]


train_data_num= 272115
valid_data_num= 17535
test_data_num= 20466


100%|██████████| 70535/70535 [00:00<00:00, 776146.29it/s]
100%|██████████| 50000/50000 [00:03<00:00, 14681.17it/s]
100%|██████████| 3000/3000 [00:00<00:00, 15280.59it/s]


In [271]:
fb15k237_linkprediction_train_data[5]

{'head_name': 'Tulsa',
 'tail_name': 'Tulsa',
 'head_text': '"Blake Edwards was an American film director, screenwriter and producer.\\nEdwards’ career began in the 1940s as an actor, but he soon turned to writing screenplays and radio scripts before turning to producing and directing in film and television. His best known films include Breakfast at Tiffany’s, Days of Wine and Roses, and the hugely successful Pink Panther film series with British comedian Peter Sellers. Often thought of as primarily a director of comedies, he also directed dramas and detective films. Late in his career, he transitioned to writing, producing, and directing for theater.\\nIn 2004, he received an Honorary Academy Award in recognition of his writing, directing and producing an extraordinary body of work for the screen."@en',
 'tail_text': '"Tulsa is the second-largest city in the U.S. state of Oklahoma and 46th-largest city in the United States. With a population of 393,987, it is the principal municipalit

**构造Instruction数据集**

In [203]:
def linkpred_fb15k237_graph_language(
    task_name: str, node_list: list, graph: list, head_node, tail_node, head_text, tail_text
):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>;
    head_entity = "<head_node>";
    "<head_node>".description = "<head_text>";
    tail_entity = "<tail_node>";
    "<tail_node>".description = "<tail_text>";\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)) \
            .replace("<head_node>", head_node).replace("<tail_node>", tail_node).replace("<head_text>", head_text).replace("<tail_text>", tail_text)

def linkpred_fb15k237_instruction(
    head_entity: str, tail_entity: str, head_text: str, tail_text: str, relation: str, entities: list, 
    triples: list, candidate_relation_classes: list, do_print: bool = False
):
    # 配置 instruction prompt
    task_name = "freebase-knowledge-graph"
    instruction = "Task definition: given a head entity and a tail entity, and each entity may has a text description and a knowledge sub-graph, classify the relation between head entity and tail entity into one of {}.".format(", ".join(candidate_relation_classes))
    edge_list = triples
    node_list = entities
    gcl = linkpred_fb15k237_graph_language(task_name, node_list, edge_list, head_entity, tail_entity, head_text, tail_text)
    query = "Q: Please classify the relation between head entity and tail entity."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [204]:
def linkpred_fb15k237_dataset(data: list):
    task_name = "graph-language-modeling-graph-link-prediction-fb15k237"
    final_data = list()
    
    for ei, example in enumerate(tqdm(data)):
        head_name = example["head_name"]
        head_text = example["head_text"]
        tail_name = example["tail_name"]
        tail_text = example["tail_text"]
        label = example["label"]
        candidate_relation_classes = ["\"{}\"".format(i) for i in example["candidate_relation_classes"]]
        head_adj_triples = example["head_adj_triples"]
        tail_adj_triples = example["tail_adj_triples"]
        all_entity_nodes = example["all_entity_nodes"]

        node_list = all_entity_nodes
        triple_list = head_adj_triples + tail_adj_triples
        instruction = linkpred_fb15k237_instruction(head_name, tail_name, head_text, tail_text, label, all_entity_nodes, head_adj_triples + tail_adj_triples, candidate_relation_classes)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [label],
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "FB15k-237",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [272]:
linkpred_fb15k237_instruction_data = {
    "train": linkpred_fb15k237_dataset(fb15k237_linkprediction_train_data),
    "test": linkpred_fb15k237_dataset(fb15k237_linkprediction_test_data),
}

100%|██████████| 34982/34982 [00:01<00:00, 31362.76it/s]


total number: 34982


100%|██████████| 1881/1881 [00:00<00:00, 33150.49it/s]

total number: 1881





In [273]:
print(linkpred_fb15k237_instruction_data["train"][9326]["instruction"])
print(linkpred_fb15k237_instruction_data["train"][9326]["answer"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="freebase-knowledge-graph"] {
    entity_list = ["1980 NCAA Men's Division I Basketball Tournament", "NCAA Men's Division I Basketball Championship", 'Ogden', "Louisville Cardinals men's basketball", 'College basketball', "Duke Blue Devils men's basketball", 'White', "North Carolina Tar Heels men's basketball"];
    triple_list = [("1980 NCAA Men's Division I Basketball Tournament" -> "Ogden")[relation="locations"], ("1980 NCAA Men's Division I Basketball Tournament" -> "Louisville Cardinals men's basketball")[relation="team"], ("1980 NCAA Men's Division I Basketball Tournament" -> "Duke Blue Devils men's basketball")[relat

In [274]:
# 构造为如下格式
GraphLinkPred_FB15k237_instruction_benchmark_dict = {
    "graph-language-modeling-graph-link-prediction-fb15k237": linkpred_fb15k237_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [275]:
np.save("instruction_dataset/graphlanguagemodeling_graphlinkprediction_fb15k237_instruction_dataset.npy", GraphLinkPred_FB15k237_instruction_benchmark_dict)


### （3）ConceptNet


In [276]:
conceptnet_data_path = "ConceptNet/"

In [277]:
def load_conceptnet_data(data_path):

    def load_data(data_file):
        examples = list()
        with open(os.path.join(data_path, data_file), "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        for line in tqdm(lines):
            line = json.loads(line.strip())
            head_name, rel_name, tail_name = line["head"], line["relation"], line["tail"]
            examples.append((head_name, rel_name, tail_name))
        return examples

    # 加载所有triple
    train_data, valid_data, test_data = load_data("train.jsonl"), load_data("valid.jsonl"), load_data("test.jsonl")
    print("train_data_num=", len(train_data))
    print("valid_data_num=", len(valid_data))
    print("test_data_num=", len(test_data))

    shuffle(train_data)
    train_data = train_data[:200000]

    # 构建一个邻接子图
    entity_name2adj = dict()
    all_relation_classes = set()
    for (head_name, rel_name, tail_name) in tqdm(train_data + valid_data + test_data):
        all_relation_classes.add(rel_name)
        if head_name not in entity_name2adj.keys():
            entity_name2adj[head_name] = dict()
        if rel_name not in entity_name2adj[head_name].keys():
            entity_name2adj[head_name][rel_name] = list()
        entity_name2adj[head_name][rel_name].append(tail_name)
    all_relation_classes = list(all_relation_classes)
    
    def process(examples):
        # 处理数据集
        final_examples = list()
        for example in tqdm(examples):
            head_name, rel_name, tail_name = example
            # 头实体和尾实体分别获得2-hop子图
            # 双重采样：1、每个关系对应的所有尾实体，最多采样5个实体；2、最后所有三元组最多采样30个
            head_adj_triples, tail_adj_triples = list(), list()
            random_relation_names = all_relation_classes
            shuffle(random_relation_names)
            random_relation_names = random_relation_names[:6]
            candidate_relation_classes = set(random_relation_names)
            candidate_relation_classes.add(rel_name)
            all_entity_nodes = set()
            all_entity_nodes.add(head_name)
            all_entity_nodes.add(tail_name)
            status = False
            if head_name in entity_name2adj.keys():
                status = True
                for head_rel_name, head_1hop_ents in entity_name2adj[head_name].items():
                    candidate_relation_classes.add(head_rel_name)
                    random_head_1hop_ents = head_1hop_ents
                    shuffle(random_head_1hop_ents)
                    random_head_1hop_ents = random_head_1hop_ents[:5]
                    for head_1hop_ent in random_head_1hop_ents:
                        if head_1hop_ent == tail_name:
                            continue
                        head_1hop_ent_name = head_1hop_ent
                        head_adj_triples.append((head_name, head_rel_name, head_1hop_ent_name))
                        # all_entity_nodes.add(head_1hop_ent_name)
                        # 2-hop
                        if head_1hop_ent in entity_name2adj.keys():
                            for head_2hop_rel_name, head_2hop_ents in entity_name2adj[head_1hop_ent].items():
                                random_head_2hop_ents = head_2hop_ents
                                shuffle(random_head_2hop_ents)
                                random_head_2hop_ents = random_head_2hop_ents[:5]
                                for head_2hop_ent in random_head_2hop_ents:
                                    if head_2hop_ent == tail_name or head_2hop_ent == head_name:
                                        continue
                                    head_2hop_ent_name = head_2hop_ent
                                    head_adj_triples.append((head_1hop_ent, head_2hop_rel_name, head_2hop_ent_name))
                                    # all_entity_nodes.add(head_2hop_ent_name)
                                    
            if tail_name in entity_name2adj.keys():
                status = True
                for tail_rel_name, tail_1hop_ents in  entity_name2adj[tail_name].items():
                    candidate_relation_classes.add(tail_rel_name)
                    random_tail_1hop_ents = tail_1hop_ents
                    shuffle(random_tail_1hop_ents)
                    random_tail_1hop_ents = random_tail_1hop_ents[:5]
                    for tail_1hop_ent in random_tail_1hop_ents:
                        if tail_1hop_ent == head_name:
                            continue
                        tail_1hop_ent_name = tail_1hop_ent
                        tail_adj_triples.append((tail_name, tail_rel_name, tail_1hop_ent_name))
                        # all_entity_nodes.add(tail_1hop_ent_name)
                        # 2-hop
                        if tail_1hop_ent in entity_name2adj.keys():
                            for tail_2hop_rel_name, tail_2hop_ents in entity_name2adj[tail_1hop_ent].items():
                                random_tail_2hop_ents = tail_2hop_ents
                                shuffle(random_tail_2hop_ents)
                                random_tail_2hop_ents = random_tail_2hop_ents[:5]
                                for tail_2hop_ent in random_tail_2hop_ents:
                                    if tail_2hop_ent == tail_name or tail_2hop_ent == head_name:
                                        continue
                                    tail_2hop_ent_name = tail_2hop_ent
                                    tail_adj_triples.append((tail_1hop_ent, tail_2hop_rel_name, tail_2hop_ent_name))
                                    # all_entity_nodes.add(tail_2hop_ent_name)
            head_adj_triples = list(set(head_adj_triples))
            tail_adj_triples = list(set(tail_adj_triples))
            if len(head_adj_triples + tail_adj_triples) > 30:
                head_adj_triples = head_adj_triples[:15]
                tail_adj_triples = tail_adj_triples[:15]
            for (head_name, _, tail_name) in head_adj_triples + tail_adj_triples:
                all_entity_nodes.add(head_name)
                all_entity_nodes.add(tail_name)
            all_entity_nodes = list(all_entity_nodes)
            
            if status is True and len(head_adj_triples) >= 2 and len(tail_adj_triples) >= 2:
                final_examples.append({
                    "head_name": head_name,
                    "tail_name": tail_name,
                    "label": rel_name,
                    "candidate_relation_classes": list(candidate_relation_classes),
                    "head_adj_triples": head_adj_triples,
                    "tail_adj_triples": tail_adj_triples,
                    "all_entity_nodes": all_entity_nodes
                })
        return final_examples

    return process(train_data), process(test_data) 

conceptnet_linkprediction_train_data, conceptnet_linkprediction_test_data = load_conceptnet_data(conceptnet_data_path)

100%|██████████| 583082/583082 [00:01<00:00, 457622.80it/s]
100%|██████████| 1184/1184 [00:00<00:00, 430670.01it/s]
100%|██████████| 1187/1187 [00:00<00:00, 441917.17it/s]


train_data_num= 583082
valid_data_num= 1184
test_data_num= 1187


100%|██████████| 202371/202371 [00:00<00:00, 611991.65it/s]
100%|██████████| 200000/200000 [00:27<00:00, 7260.00it/s]
100%|██████████| 1187/1187 [00:00<00:00, 3219.24it/s]


In [278]:
conceptnet_linkprediction_train_data[1022]

{'head_name': 'school',
 'tail_name': 'one part of society',
 'label': 'AtLocation',
 'candidate_relation_classes': ['UsedFor',
  'CausesDesire',
  'IsA',
  'PartOf',
  'ReceivesAction',
  'InheritsFrom',
  'CapableOf',
  'AtLocation',
  'HasProperty',
  'InstanceOf',
  'MadeOf',
  'Desires'],
 'head_adj_triples': [('sun tan lotion',
   'UsedFor',
   'protect your skin from ultraviolet rays'),
  ('a cabinet', 'AtLocation', 'the corner'),
  ('a cabinet', 'ReceivesAction', 'found in a kitchen'),
  ('sun tan lotion', 'AtLocation', 'a cabinet'),
  ('sun tan lotion', 'UsedFor', 'protect your skin from ultraviolet ray')],
 'tail_adj_triples': [('kitchen', 'PartOf', 'cooking surface'),
  ('steel', 'UsedFor', 'make boat'),
  ('school', 'CapableOf', 'be major pain'),
  ('steel', 'ReceivesAction', 'form into many different shape'),
  ('steel', 'ReceivesAction', 'painted'),
  ('steel', 'IsA', 'alloy'),
  ('kitchen', 'IsA', 'in house'),
  ('place', 'AtLocation', 'dark'),
  ('steel', 'ReceivesActio

**构造Instruction数据集**

In [279]:
def linkpred_conceptnet_graph_language(
    task_name: str, node_list: list, graph: list, head_node, tail_node
):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>;
    head_entity = "<head_node>";
    tail_entity = "<tail_node>";\n}\n```"""
    node_list = str(node_list)
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)) \
            .replace("<head_node>", head_node).replace("<tail_node>", tail_node)

def linkpred_conceptnet_instruction(
    head_entity: str, tail_entity: str, relation: str, entities: list, 
    triples: list, candidate_relation_classes: list, do_print: bool = False
):
    # 配置 instruction prompt
    task_name = "conceptnet-knowledge-graph"
    instruction = "Task definition: given a head entity and a tail entity, and each entity may has a text description and a knowledge sub-graph, classify the relation between head entity and tail entity into one of {}.".format(", ".join(candidate_relation_classes))
    edge_list = triples
    node_list = entities
    gcl = linkpred_conceptnet_graph_language(task_name, node_list, edge_list, head_entity, tail_entity)
    query = "Q: Please classify the relation between head entity and tail entity."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", relation)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [280]:
def linkpred_conceptnet_dataset(data: list):
    task_name = "graph-language-modeling-graph-link-prediction-conceptnet"
    final_data = list()
    
    for ei, example in enumerate(tqdm(data)):
        head_name = example["head_name"]
        tail_name = example["tail_name"]
        label = example["label"]
        candidate_relation_classes = ["\"{}\"".format(i) for i in example["candidate_relation_classes"]]
        head_adj_triples = example["head_adj_triples"]
        tail_adj_triples = example["tail_adj_triples"]
        all_entity_nodes = example["all_entity_nodes"]

        node_list = all_entity_nodes
        triple_list = head_adj_triples + tail_adj_triples
        instruction = linkpred_conceptnet_instruction(head_name, tail_name, label, all_entity_nodes, head_adj_triples + tail_adj_triples, candidate_relation_classes)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [label],
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "ConceptNet",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [281]:
linkpred_conceptnet_instruction_data = {
    "train": linkpred_conceptnet_dataset(conceptnet_linkprediction_train_data),
    "test": linkpred_conceptnet_dataset(conceptnet_linkprediction_test_data),
}

100%|██████████| 29405/29405 [00:01<00:00, 27910.41it/s]


total number: 29405


100%|██████████| 745/745 [00:00<00:00, 27257.84it/s]

total number: 745





In [762]:
print(linkpred_conceptnet_instruction_data["test"][701]["instruction"])
print(linkpred_conceptnet_instruction_data["test"][701]["answer"])

NameError: name 'linkpred_conceptnet_instruction_data' is not defined

In [283]:
# 构造为如下格式
GraphLinkPred_ConceptNet_instruction_benchmark_dict = {
    "graph-language-modeling-graph-link-prediction-conceptnet": linkpred_conceptnet_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [284]:
np.save("instruction_dataset/graphlanguagemodeling_graphlinkprediction_conceptnet_instruction_dataset.npy", GraphLinkPred_ConceptNet_instruction_benchmark_dict)


## 2.5 Graph Property Prediction

### （1）OGBG-Code2

- 数据集描述：https://ogb.stanford.edu/docs/graphprop/#ogbg-code2
- 自行下载地址：https://snap.stanford.edu/ogb/data/graphproppred/code2.zip

In [287]:
ogbgcode2_data_dir = "OGB/OGBG-Code2/"

## 2.6 Graph Relevance Inspection
给定一个graph和一个文本，判断两者是否具有相关性
直接从Wikipedia-Caption、WebNLG、GenWiki中随机采样10k样本作为正样本，再随机采样10k作为负样本
- 正样本：直接将原始样本中的graph和answer配对即可，label则为“相关”；
- 负样本：先随机采样另一组正样本，对于每个样本，将其answer从所有数据中随机挑选，组成错误的answer-graph对，label则为“不相关”


In [764]:
caption_data_file_dict = {
    "wikipedia": "./instruction_dataset/graphlanguagemodeling_graphcaptiongeneration_wikipedia_instruction_dataset.npy",
    "webnlg": "./instruction_dataset/graphlanguagemodeling_graphcaptiongeneration_webnlg_instruction_dataset.npy",
    "genwiki": "./instruction_dataset/graphlanguagemodeling_graphcaptiongeneration_genwiki_instruction_dataset.npy"
}

In [765]:
def load_graph_caption_data(data_file_dict: dict):
    caption_train_examples, caption_test_examples = list(), list()
    for _, data_file in tqdm(data_file_dict.items()):
        data = np.load(data_file, allow_pickle=True)[()]
        for task_name, task_data in data.items():
            task_train_data, task_test_data = task_data["train"], task_data["test"]
            caption_train_examples.extend(task_train_data)
            caption_test_examples.extend(task_test_data)
    print("train num=", len(caption_train_examples))
    print("test num=", len(caption_test_examples))
    return caption_train_examples, caption_test_examples
caption_data_train_data, caption_data_test_data = load_graph_caption_data(caption_data_file_dict)

 33%|███▎      | 1/3 [00:05<00:10,  5.19s/it]


FileNotFoundError: [Errno 2] No such file or directory: './instruction_dataset/graphlanguagemodeling_graphcaptiongeneration_webnlg_instruction_dataset.npy'

In [92]:
def graphrelevance_instruction(text: str, gcl, graph, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "knowledge-graph"
    caption = "Caption: {}".format(text)
    instruction = "Task definition: A binary classification task to inspect whether the given caption is relevant to the graph. The classes are \"Relevant\" and \"Irrelevant\". "
    instruction += "Note that: Irrelevance means that all the information mentioned in the caption cannot be found in the graph."
    query = "Q: Given you a knowledge graph and a caption, please inspect whether the caption is relevant to the graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, caption, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", relation)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": graph
    }


In [109]:
def sample_for_relevance_data(examples, num_per_kind=20000):
    task_name = "graph-language-modeling-graph-relevance-inspection"
    final_data = list()
    shuffle(examples)
    # 随机采样一组样本作为正样本
    positive_data = examples[:num_per_kind]
    for ei, example in enumerate(tqdm(positive_data)):
        gcl = example["graph_language"]
        text = example["answer"][0]
        graph = example["graph"]
        instruction = graphrelevance_instruction(text, gcl, graph)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": ["Relevant"],
            "answer_with_cot": [],
            "difficulty": "simple",
            "from": "GraphCaptionGeneration",
        })

    # 再随机采样一组样本用于构建负样本
    negative_data = examples[num_per_kind: num_per_kind*2]
    ### 统计所有的caption，并随机采样
    all_caption = [i["answer"][0] for i in examples]
    shuffle(all_caption)
    negative_caption = all_caption[-num_per_kind:]
    for example, caption in tqdm(zip(negative_data, all_caption)):
        gcl = example["graph_language"]
        text = example["answer"][0]
        graph = example["graph"]
        if caption == text:
            continue
        instruction = graphrelevance_instruction(caption, gcl, graph)
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": ["irrelevant"],
            "answer_with_cot": [],
            "difficulty": "simple",
            "from": "GraphCaptionGeneration",
        })
    
    
    print("total number: {}".format(len(final_data)))
    return final_data
        
        
        

In [110]:
graphrelevance_instruction_data = {
    "train": sample_for_relevance_data(caption_data_train_data, 20000),
    "test": sample_for_relevance_data(caption_data_test_data, 1000),
}

100%|██████████| 20000/20000 [00:00<00:00, 264176.51it/s]
20000it [00:00, 150222.83it/s]


total number: 40000


100%|██████████| 1000/1000 [00:00<00:00, 266575.82it/s]
1000it [00:00, 344444.77it/s]

total number: 2000





In [763]:
print(graphrelevance_instruction_data["test"][331]["instruction"])
print(graphrelevance_instruction_data["test"][331]["answer"])

NameError: name 'graphrelevance_instruction_data' is not defined

In [113]:
# 构造为如下格式
GraphRelevance_instruction_benchmark_dict = {
    "graph-language-modeling-graph-relevance-inspection": graphrelevance_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [114]:
np.save("instruction_dataset/graphlanguagemodeling_graphrelevanceinspection_instruction_dataset.npy", GraphRelevance_instruction_benchmark_dict)


## 2.7 Graph Collaboration Filtering
以推荐系统为主要任务

### （1）MoiveLens

```
{
    'name': 'movielens',
    'order': 2625,
    'size': 100000,
    'is_directed': True,
    'is_weighted': True
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://paperswithcode.com/dataset/movielens

数据集中只有userid和itemid两种节点，且没有任何其他信息，因此需要事先为每个user获得其协同信息。

任务定义：输入一个user和一个item，预测其是否存在边。

注意：
- 该任务类似link prediciton，但是区别在于无法使用两个节点的特征和其1-hop子图来预测，而应该获得user的1-hop子图中的所有item的协同信息；
- 因此，我们首先对每个用户获得其所有存在关系的item，组成一个item集合，其次两两计算相似度（Jaccard相似度），根据相似度，获得每个user最相似的topk个user；
- 当给定一个user和一个item时，我们直接给定topk个最相似的user的1-hop子图，这些1-hop子图直接融合在一起即可得到当前user的协同图。需要注意的是，这个图中必须包含当前的item，否则无法用于预测，即我们不考虑冷启动问题。


In [346]:
moivelens_data_dir = "MoiveLens"

In [347]:
def load_moivelens(data_dir):
    import pickle
    with open(os.path.join(data_dir, "movielens"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
moivelens_data_dict = load_moivelens(moivelens_data_dir)

In [348]:
print(moivelens_data_dict.keys())
print(moivelens_data_dict["data_profile"])
print(list(moivelens_data_dict["nodes"].keys())[:10])
print(moivelens_data_dict["nodes"]["i302"])
print(len(moivelens_data_dict["links"].keys()))
print(list(moivelens_data_dict["links"].values())[:10])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'movielens', 'order': 2625, 'size': 100000, 'is_directed': True, 'is_weighted': True}
['u196', 'i242', 'u186', 'i302', 'u22', 'i377', 'u244', 'i51', 'u166', 'i346']
{'label': 'item'}
100000
[{'label': '3', 'time': '881250949'}, {'label': '3', 'time': '891717742'}, {'label': '1', 'time': '878887116'}, {'label': '2', 'time': '880606923'}, {'label': '1', 'time': '886397596'}, {'label': '4', 'time': '884182806'}, {'label': '2', 'time': '881171488'}, {'label': '5', 'time': '891628467'}, {'label': '3', 'time': '886324817'}, {'label': '3', 'time': '883603013'}]


In [349]:
# 划分训练集、测试集
# 这里划分出的训练集只是用于构建user协同信息，但是将近90k的训练数据太多了，因此最后构建instruction数据集时还需要再次降采样。
movielens_data = list()
for edge, label in moivelens_data_dict["links"].items():
    movielens_data.append((edge[0], label["label"], edge[1]))
shuffle(movielens_data)
moivelens_test_data, moivelens_train_data_for_graph_construction = movielens_data[:2000], movielens_data[2000:]

In [351]:
def construct_moivelens_collaboration_graph(moivelens_data):
    # 首先获得每个user对应的item列表（等价于user的1-hop邻接子图）
    user2item = dict()
    useritem2label = dict()
    for (user, label, item) in tqdm(moivelens_data):
        if user not in user2item.keys():
            user2item[user] = set()
        user2item[user].add(item)
        if (user, item) not in useritem2label.keys():
            useritem2label[(user, item)] = label
    
    
    # 对于每个user，计算其与其他user的jaccard相似度，并获得topk个最相似的user，k=3
    user_list = list(user2item.keys())
    user2user = dict() # 每个user最相似的topk个user及其得分
    for cur_user, cur_item_set in tqdm(user2item.items()):
        if cur_user not in user2user.keys():
            user2user[cur_user] = list()
        score_list = list()
        for tgt_user, tgt_item_set in user2item.items():
            if cur_user == tgt_user:
                continue
            score_list.append((tgt_user, round(len(cur_item_set.intersection(tgt_item_set)) / len(cur_item_set.union(tgt_item_set)), 4)))
        score_list = sorted(score_list, reverse=True, key=lambda i: i[1])
        user2user[cur_user].extend(score_list[:5])
    return user2item, useritem2label, user2user
                
moivelens_user2item, moivelens_useritem2label, moivelens_user2user = construct_moivelens_collaboration_graph(moivelens_train_data_for_graph_construction)
    

100%|██████████| 98000/98000 [00:00<00:00, 789252.27it/s]
100%|██████████| 943/943 [00:11<00:00, 83.09it/s] 


In [352]:
moivelens_user2user["u244"]

[('u682', 0.3326),
 ('u457', 0.329),
 ('u551', 0.3215),
 ('u222', 0.3168),
 ('u92', 0.3148)]

In [353]:
# 再一次采样获得训练集
shuffle(moivelens_train_data_for_graph_construction)
moivelens_train_data = moivelens_train_data_for_graph_construction[:40000]

In [354]:
def graphcolla_moivelens_graph_language(task_name: str, node_list: list, graph: list, target_user, target_item):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    user_nodes = <node_list>;
    target_user = <target_user>;
    target_film = <target_item>;
    scoring_triples = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[score=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph if not (triple[0] == target_user and triple[2] == target_item)]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_user>", "\"{}\"".format(target_user)).replace("<target_item>", "\"{}\"".format(target_item))

def graphcolla_moivelens_instruction(text: str, entities: list, triples: list, do_print: bool = False, target_user=None, target_item=None):
    # 配置 instruction prompt
    task_name = "film-scoring-graph"
    instruction = "Task definition: given target user and film, predict the score that the user likes this film based on the collaboration graph. Note that, 1) users in the collaboration graph have similar preferences with the target user; 2) the predict score should be one of '1', '2', '3', '4', and '5'."
    edge_list = triples
    node_list = entities
    gcl = graphcolla_moivelens_graph_language(task_name, node_list, edge_list, target_user, target_item)
    query = "Q: Please predict the score that the target user likes the target film."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [355]:
def graphcolla_moivelens_dataset(data: list, user2item, useritem2label, user2user):
    task_name = "graph-language-modeling-graph-collaboration-filtering-moivelens"
    final_data = list()
    
    for ei, (cur_user, label, cur_item) in enumerate(tqdm(data)):
        similar_users = user2user[cur_user]
        node_list = [cur_user] + [u[0] for u in similar_users]
        triple_list = list()
        for u in node_list:
            item_list = list(user2item[u])
            if cur_item in item_list:
                triple_list.append((u, useritem2label[(u, cur_item)], cur_item))
            shuffle(item_list)
            for item in item_list[:8]:
                if item == cur_item:
                    continue
                triple_list.append((u, useritem2label[(u, item)], item))
            
        instruction = graphcolla_moivelens_instruction(label, node_list, triple_list, target_user=cur_user, target_item=cur_item)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [label],
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "MoiveLens",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [356]:
graphcolla_moivelens_instruction_data = {
    "train": graphcolla_moivelens_dataset(moivelens_train_data, moivelens_user2item, moivelens_useritem2label, moivelens_user2user),
    "test": graphcolla_moivelens_dataset(moivelens_test_data, moivelens_user2item, moivelens_useritem2label, moivelens_user2user),
}

100%|██████████| 40000/40000 [00:21<00:00, 1895.03it/s]


total number: 40000


100%|██████████| 2000/2000 [00:01<00:00, 1868.59it/s]

total number: 2000





In [768]:
print(graphcolla_moivelens_instruction_data["test"][162]["instruction"])
print(graphcolla_moivelens_instruction_data["test"][162]["answer"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="film-scoring-graph"] {
    user_nodes = ["u21", "u802", "u255", "u422", "u617", "u367"];
    target_user = "u21";
    target_film = "i219";
    scoring_triples = [("u21" -> "i591")[score="3"], ("u21" -> "i990")[score="2"], ("u21" -> "i569")[score="3"], ("u21" -> "i696")[score="2"], ("u21" -> "i289")[score="3"], ("u21" -> "i635")[score="4"], ("u21" -> "i452")[score="4"], ("u21" -> "i444")[score="3"], ("u802" -> "i219")[score="5"], ("u802" -> "i413")[score="4"], ("u802" -> "i331")[score="4"], ("u802" -> "i567")[score="4"], ("u802" -> "i288")[score="3"], ("u802" -> "i200")[score="4"], ("u802" -> "i687")[score="3"], ("u802" ->

In [358]:
# 构造为如下格式
GraphColla_MovieLens_instruction_benchmark_dict = {
    "graph-language-modeling-graph-collaboration-filtering-moivelens": graphcolla_moivelens_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [359]:
np.save("instruction_dataset/graphlanguagemodeling_graphcollaborationfiltering_moivelens_instruction_dataset.npy", GraphColla_MovieLens_instruction_benchmark_dict)


### （2）Amazon

```
{
    'name': 'amazon',
    'order': 396810,
    'size': 450578,
    'is_directed': True,
    'is_weighted': True
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://paperswithcode.com/dataset/movielens

数据集中只有userid和itemid两种节点，且没有任何其他信息，因此需要事先为每个user获得其协同信息。

任务定义：输入一个user和一个item，预测其是否存在边。

注意：
- 该任务类似link prediciton，但是区别在于无法使用两个节点的特征和其1-hop子图来预测，而应该获得user的1-hop子图中的所有item的协同信息；
- 因此，我们首先对每个用户获得其所有存在关系的item，组成一个item集合，其次两两计算相似度（Jaccard相似度），根据相似度，获得每个user最相似的topk个user；
- 当给定一个user和一个item时，我们直接给定topk个最相似的user的1-hop子图，这些1-hop子图直接融合在一起即可得到当前user的协同图。需要注意的是，这个图中必须包含当前的item，否则无法用于预测，即我们不考虑冷启动问题。


In [770]:
amazon_data_dir = "Amazon/"

In [771]:
def load_amazon(data_dir):
    import pickle
    with open(os.path.join(data_dir, "amazon"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
amazon_data_dict = load_amazon(amazon_data_dir)

In [772]:
print(amazon_data_dict.keys())
print(amazon_data_dict["data_profile"])
print(list(amazon_data_dict["nodes"].keys())[:10])
print(amazon_data_dict["nodes"]["0077613252"])
print(len(amazon_data_dict["links"].keys()))
print(list(amazon_data_dict["links"].values())[:10])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'amazon', 'order': 396810, 'size': 450578, 'is_directed': True, 'is_weighted': True}
['A240ORQ2LF9LUI', '0077613252', 'A1YCCU0YRLS0FE', 'A1BJHRQDYVAY2J', 'APRDVZ6QBIQXT', 'A2JZTTBSLS1QXV', '0077775473', 'AGN3T5ERYJN5A', '0132147556', 'AHPK2GD0SQC59']
{'label': 'item'}
450578
[{'label': 4.0, 'time': 1394496000, 'review': "The materials arrived early and were in excellent condition.  However for the money spent they really should've come with a binder and not just loose leaf.", 'review_summary': 'Material Great'}, {'label': 4.0, 'time': 1393113600, 'review': 'I am really enjoying this book with the worksheets that make you review your goals, what to do when you do not make it, it reminds me  of my human sexuality classwork.', 'review_summary': 'Health'}, {'label': 1.0, 'time': 1392595200, 'review': 'IF YOU ARE TAKING THIS CLASS DON"T WASTE YOUR MONEY ON THIS SO CALLED BOOK! $140.00 FOR A "BOOK" THAT ISIN\'T EVEN BOUND LOOSE LEAFS, TH

In [773]:
# 划分训练集、测试集
# 这里划分出的训练集只是用于构建user协同信息，但是将近90k的训练数据太多了，因此最后构建instruction数据集时还需要再次降采样。
amazon_data = list()
for edge, label in amazon_data_dict["links"].items():
    amazon_data.append((edge[0], label["label"], edge[1], label["review"]))
shuffle(amazon_data)
amazon_test_data, amazon_train_data_for_graph_construction = amazon_data[:10000], amazon_data[10000:]

In [774]:
def construct_amazon_collaboration_graph(amazon_data):
    # 首先获得每个user对应的item列表（等价于user的1-hop邻接子图）
    user2item = dict()
    useritem2label = dict()
    useritem2review = dict()
    for (user, label, item, review) in tqdm(amazon_data):
        if user not in user2item.keys():
            user2item[user] = set()
        user2item[user].add(item)
        if (user, item) not in useritem2label.keys():
            useritem2label[(user, item)] = label
        if (user, item) not in useritem2review.keys():
            useritem2review[(user, item)] = review

    #如果是训练集，降采样12000个作为训练样本
    user_list = list(user2item.keys())
    # if do_train:
    shuffle(user_list)
    user_list = user_list[:200000]
    
    # 对于每个user，计算其与其他user的jaccard相似度，并获得topk个最相似的user，k=3
    user2user = dict() # 每个user最相似的topk个user及其得分
    # for cur_user, cur_item_set in tqdm(user2item.items()):
    for cur_user in tqdm(user_list):
        cur_item_set = user2item[cur_user]
        if cur_user not in user2user.keys():
            user2user[cur_user] = list()
        score_list = list()
        # amazon的节点数量太多了，因此先降采样
        tgt_user_list = list(user2item.keys())
        # shuffle(tgt_user_list)
        tgt_user_list = tgt_user_list[:5000]
        # for tgt_user, tgt_item_set in user2item.items():
        for tgt_user in tgt_user_list:
            tgt_item_set = user2item[tgt_user]
            if cur_user == tgt_user:
                continue
            score_list.append((tgt_user, round(len(cur_item_set.intersection(tgt_item_set)) / len(cur_item_set.union(tgt_item_set)), 4)))
        score_list = sorted(score_list, reverse=True, key=lambda i: i[1])
        user2user[cur_user].extend(score_list[:5])
    return user2item, useritem2label, user2user
                
amazon_user2item, amazon_useritem2label, amazon_user2user = construct_amazon_collaboration_graph(amazon_train_data_for_graph_construction)
    

100%|██████████| 440578/440578 [00:01<00:00, 423027.37it/s]
  0%|          | 339/200000 [00:09<1:32:08, 36.11it/s]


KeyboardInterrupt: 

In [289]:
amazon_user2item["A240ORQ2LF9LUI"]

{'0077613252'}

In [290]:
amazon_user2user["A240ORQ2LF9LUI"]

KeyError: 'A240ORQ2LF9LUI'

In [305]:
# 再一次采样获得训练集
# shuffle(amazon_train_data_for_graph_construction)
amazon_train_data = amazon_train_data_for_graph_construction[:20000]

In [408]:
def graphcolla_amazon_graph_language(task_name: str, node_list: list, graph: list, target_user, target_item, target_review):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    user_nodes = <node_list>;
    target_user = <target_user>;
    target_product = <target_item>;
    user_review = <target_review>;
    scoring_triples = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[score=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph if not (triple[0] == target_user and triple[2] == target_item)]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_user>", "\"{}\"".format(target_user)).replace("<target_item>", "\"{}\"".format(target_item)).replace("<target_review>", "\"{}\"".format(target_review))

def graphcolla_amazon_instruction(text: str, entities: list, triples: list, do_print: bool = False, target_user=None, target_item=None, target_review=None):
    # 配置 instruction prompt
    task_name = "product-scoring-graph"
    instruction = "Task definition: given a target user, a target product, and the user review for the target product, predict the score that the user likes this product based on the collaboration graph. Note that, 1) users in the collaboration graph have similar preferences with the target user; 2) the predict score should be one of '1', '2', '3', '4', and '5'."
    edge_list = triples
    node_list = entities
    gcl = graphcolla_amazon_graph_language(task_name, node_list, edge_list, target_user, target_item, target_review)
    query = "Q: Please predict the score that the target user likes the target product."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [404]:
def graphcolla_amazon_dataset(data: list, user2item, useritem2label, user2user):
    task_name = "graph-language-modeling-graph-collaboration-filtering-amazon"
    final_data = list()
    
    for ei, (cur_user, label, cur_item, review) in enumerate(tqdm(data)):
        if cur_user not in user2user.keys():
            continue
        similar_users = user2user[cur_user]
        node_list = [cur_user] + [u[0] for u in similar_users]
        triple_list = list()
        for u in node_list:
            item_list = list(user2item[u])
            if cur_item in item_list:
                triple_list.append((u, useritem2label[(u, cur_item)], cur_item))
            shuffle(item_list)
            for item in item_list[:10]:
                if item == cur_item:
                    continue
                triple_list.append((u, useritem2label[(u, item)], item))
            
        instruction = graphcolla_amazon_instruction(label, node_list, triple_list, target_user=cur_user, target_item=cur_item, target_review=review)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [str(int(label))],
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "Amazon",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [409]:
graphcolla_amazon_instruction_data = {
    "train": graphcolla_amazon_dataset(amazon_train_data, amazon_user2item, amazon_useritem2label, amazon_user2user),
    "test": graphcolla_amazon_dataset(amazon_test_data, amazon_user2item, amazon_useritem2label, amazon_user2user),
}

100%|██████████| 20000/20000 [00:00<00:00, 165937.23it/s]


total number: 2781


100%|██████████| 10000/10000 [00:00<00:00, 501177.45it/s]

total number: 305





In [769]:
print(graphcolla_amazon_instruction_data["train"][130]["instruction"])
print(graphcolla_amazon_instruction_data["train"][130]["answer"])

NameError: name 'graphcolla_amazon_instruction_data' is not defined

In [411]:
# 构造为如下格式
GraphColla_Amazon_instruction_benchmark_dict = {
    "graph-language-modeling-graph-collaboration-filtering-amazon": graphcolla_amazon_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [412]:
np.save("instruction_dataset/graphlanguagemodeling_graphcollaborationfiltering_amazon_instruction_dataset.npy", GraphColla_Amazon_instruction_benchmark_dict)


### （3）LastFM

```
{
    'name': 'last-fm',
    'order': 19524,
    'size': 118268,
    'is_directed': True,
    'is_weighted': True
}
```

数据集取自：https://github.com/jwzhanggy/Graph_Toolformer/blob/main/data/README.md
详细的数据集介绍参考：https://paperswithcode.com/dataset/movielens

数据集中只有userid和itemid两种节点，且没有任何其他信息，因此需要事先为每个user获得其协同信息。

任务定义：输入一个user和一个item，预测其是否存在边。

注意：
- 该任务类似link prediciton，但是区别在于无法使用两个节点的特征和其1-hop子图来预测，而应该获得user的1-hop子图中的所有item的协同信息；
- 因此，我们首先对每个用户获得其所有存在关系的item，组成一个item集合，其次两两计算相似度（Jaccard相似度），根据相似度，获得每个user最相似的topk个user；
- 当给定一个user和一个item时，我们直接给定topk个最相似的user的1-hop子图，这些1-hop子图直接融合在一起即可得到当前user的协同图。需要注意的是，这个图中必须包含当前的item，否则无法用于预测，即我们不考虑冷启动问题。


In [664]:
lastfm_data_dir = "LastFM/"

In [665]:
def load_lastfm(data_dir):
    import pickle
    with open(os.path.join(data_dir, "last-fm"), "rb") as fr:
        dataset = pickle.load(fr)
    return dataset
lastfm_data_dict = load_lastfm(lastfm_data_dir)

In [666]:
print(lastfm_data_dict.keys())
print(lastfm_data_dict["data_profile"])
print(list(lastfm_data_dict["nodes"].keys())[:10])
print(lastfm_data_dict["nodes"]["i302"])
print(len(lastfm_data_dict["links"].keys()))
print(list(lastfm_data_dict["links"].values())[:10])

dict_keys(['data_profile', 'nodes', 'links'])
{'name': 'last-fm', 'order': 19524, 'size': 118268, 'is_directed': True, 'is_weighted': True}
['u2', 'i51', 'i52', 'i53', 'i54', 'i55', 'i56', 'i57', 'i58', 'i59']
{'label': 'item'}
118268
[{'label': '13883', 'is_user_item_link': True}, {'label': '11690', 'is_user_item_link': True}, {'label': '11351', 'is_user_item_link': True}, {'label': '10300', 'is_user_item_link': True}, {'label': '8983', 'is_user_item_link': True}, {'label': '6152', 'is_user_item_link': True}, {'label': '5955', 'is_user_item_link': True}, {'label': '4616', 'is_user_item_link': True}, {'label': '4337', 'is_user_item_link': True}, {'label': '4147', 'is_user_item_link': True}]


In [667]:
# 划分训练集、测试集
# 这里划分出的训练集只是用于构建user协同信息，但是将近90k的训练数据太多了，因此最后构建instruction数据集时还需要再次降采样。
lastfm_data = list()
for edge, label in tqdm(lastfm_data_dict["links"].items()):
    try:
        lastfm_data.append((edge[0], label["label"], edge[1]))
    except:
        continue
shuffle(lastfm_data)
lastfm_test_data, lastfm_train_data_for_graph_construction = lastfm_data[:2000], lastfm_data[2000:]

100%|██████████| 118268/118268 [00:00<00:00, 1145177.47it/s]


In [668]:
def construct_lastfm_collaboration_graph(lastfm_data):
    # 首先获得每个user对应的item列表（等价于user的1-hop邻接子图）
    user2item = dict()
    useritem2label = dict()
    for (user, label, item) in tqdm(lastfm_data):
        if user not in user2item.keys():
            user2item[user] = set()
        user2item[user].add(item)
        if (user, item) not in useritem2label.keys():
            useritem2label[(user, item)] = label
    
    
    # 对于每个user，计算其与其他user的jaccard相似度，并获得topk个最相似的user，k=3
    user_list = list(user2item.keys())
    user2user = dict() # 每个user最相似的topk个user及其得分
    for cur_user, cur_item_set in tqdm(user2item.items()):
        if cur_user not in user2user.keys():
            user2user[cur_user] = list()
        score_list = list()
        for tgt_user, tgt_item_set in user2item.items():
            if cur_user == tgt_user:
                continue
            score_list.append((tgt_user, round(len(cur_item_set.intersection(tgt_item_set)) / len(cur_item_set.union(tgt_item_set)), 4)))
        score_list = sorted(score_list, reverse=True, key=lambda i: i[1])
        user2user[cur_user].extend(score_list[:5])
    return user2item, useritem2label, user2user
                
lastfm_user2item, lastfm_useritem2label, lastfm_user2user = construct_lastfm_collaboration_graph(lastfm_train_data_for_graph_construction)
    

100%|██████████| 90834/90834 [00:00<00:00, 738179.30it/s]
100%|██████████| 1892/1892 [00:19<00:00, 95.00it/s] 


In [669]:
list(lastfm_useritem2label.values())[:10]

['11', '525', '1159', '580', '94', '125', '115', '63', '452', '152']

In [670]:
lastfm_user2user["u244"]

[('u289', 0.1975),
 ('u1509', 0.1728),
 ('u569', 0.1707),
 ('u329', 0.1687),
 ('u1718', 0.1687)]

In [671]:
# 再一次采样获得训练集
shuffle(lastfm_train_data_for_graph_construction)
lastfm_train_data = lastfm_train_data_for_graph_construction[:40000]

In [672]:
def graphcolla_lastfm_graph_language(task_name: str, node_list: list, graph: list, target_user, target_item):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    user_nodes = <node_list>;
    target_user = <target_user>;
    target_music = <target_item>;
    scoring_triples = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[playtime=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph if not (triple[0] == target_user and triple[2] == target_item)]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list)).replace("<target_user>", "\"{}\"".format(target_user)).replace("<target_item>", "\"{}\"".format(target_item))

def graphcolla_lastfm_instruction(text: str, entities: list, triples: list, do_print: bool = False, target_user=None, target_item=None):
    # 配置 instruction prompt
    task_name = "music-playtime-graph"
    instruction = "Task definition: given target user and film, predict the playtime the user listens to the target music based on the collaboration graph. Note that, 1) users in the collaboration graph have similar preferences with the target user; 2) the predicted playtime can be any positive integer."
    edge_list = triples
    node_list = entities
    gcl = graphcolla_lastfm_graph_language(task_name, node_list, edge_list, target_user, target_item)
    query = "Q: Please predict the playtime that the target user listens to the target music."
    final_instruction = "{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction, note_instruciton, gcl, instruction, query)
    if do_print:
        print(final_instruction)
        print("answer=", text)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }


In [673]:
def graphcolla_lastfm_dataset(data: list, user2item, useritem2label, user2user):
    task_name = "graph-language-modeling-graph-collaboration-filtering-lastfm"
    final_data = list()
    
    for ei, (cur_user, label, cur_item) in enumerate(tqdm(data)):
        similar_users = user2user[cur_user]
        node_list = [cur_user] + [u[0] for u in similar_users]
        triple_list = list()
        for u in node_list:
            item_list = list(user2item[u])
            if cur_item in item_list:
                triple_list.append((u, useritem2label[(u, cur_item)], cur_item))
            shuffle(item_list)
            for item in item_list[:8]:
                if item == cur_item:
                    continue
                triple_list.append((u, useritem2label[(u, item)], item))
            
        instruction = graphcolla_lastfm_instruction(label, node_list, triple_list, target_user=cur_user, target_item=cur_item)
    
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": instruction["graph"],
            "answer": [label],
            "answer_with_cot": [],
            "difficulty": "easy",
            "from": "LastFM",
        })
    print("total number: {}".format(len(final_data)))
    return final_data


In [674]:
graphcolla_lastfm_instruction_data = {
    "train": graphcolla_lastfm_dataset(lastfm_train_data, lastfm_user2item, lastfm_useritem2label, lastfm_user2user),
    "test": graphcolla_lastfm_dataset(lastfm_test_data, lastfm_user2item, lastfm_useritem2label, lastfm_user2user),
}

100%|██████████| 40000/40000 [00:06<00:00, 5801.08it/s]


total number: 40000


100%|██████████| 2000/2000 [00:00<00:00, 5979.20it/s]

total number: 2000





In [775]:
print(graphcolla_lastfm_instruction_data["test"][1822]["instruction"])
print(graphcolla_lastfm_instruction_data["test"][1822]["answer"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="music-playtime-graph"] {
    user_nodes = ["u92", "u2021", "u390", "u1753", "u984", "u1071"];
    target_user = "u92";
    target_music = "i289";
    scoring_triples = [("u92" -> "i2497")[playtime="306"], ("u92" -> "i1686")[playtime="10"], ("u92" -> "i2515")[playtime="5"], ("u92" -> "i310")[playtime="9"], ("u92" -> "i1689")[playtime="5"], ("u92" -> "i2498")[playtime="72"], ("u92" -> "i301")[playtime="5"], ("u92" -> "i2514")[playtime="5"], ("u2021" -> "i289")[playtime="2543"], ("u2021" -> "i701")[playtime="1589"], ("u2021" -> "i89")[playtime="2842"], ("u2021" -> "i344")[playtime="1483"], ("u2021" -> "i644")[playtime="888"],

In [378]:
# 构造为如下格式
GraphColla_LastFM_instruction_benchmark_dict = {
    "graph-language-modeling-graph-collaboration-filtering-lastfm": graphcolla_lastfm_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [379]:
np.save("instruction_dataset/graphlanguagemodeling_graphcollaborationfiltering_lastfm_instruction_dataset.npy", GraphColla_LastFM_instruction_benchmark_dict)


# 三、Graph Construction Modeling

## 3.1 Knowledge Graph Generation

### （1）Wikipedia+Wikidata5M
直接获取Graph Caption Generation的数据集反向即可

任务定义：给定一个passage和一个实体列表，要求模型生成一个graph（即抽取所有三元组）

In [380]:
graphcaption_wikipedia_data_file = "instruction_dataset/graphlanguagemodeling_graphcaptiongeneration_wikipedia_instruction_dataset.npy"

In [409]:
graphcaption_wikipedia_data = np.load(os.path.join(graphcaption_wikipedia_data_file), allow_pickle=True)[()]
graphcaption_wikipedia_train_data, graphcaption_wikipedia_test_data = list(), list()
for task_name, task_data in graphcaption_wikipedia_data.items():
    task_train_data, task_test_data = task_data["train"], task_data["test"]
    graphcaption_wikipedia_train_data.extend(task_train_data)
    graphcaption_wikipedia_test_data.extend(task_test_data)

In [410]:
# 获得所有可能的实体和三元组
all_wikidata_entity_set = set()
all_wikidata_relation_set = set()
all_wikidata_edge_list = list()
for example in tqdm(graphcaption_wikipedia_train_data):
    graph = example["graph"]
    node_list = graph["node_list"]
    edge_list = graph["edge_list"]
    all_wikidata_entity_set.update(set(node_list))
    all_wikidata_edge_list.extend(edge_list)
    for edge in edge_list:
        all_wikidata_relation_set.add(edge[1])
all_wikidata_entity_list = list(all_wikidata_entity_set)
all_wikidata_relation_list = list(all_wikidata_relation_set)

100%|██████████| 521781/521781 [00:02<00:00, 254183.07it/s]


In [411]:
shuffle(graphcaption_wikipedia_train_data)
graphcaption_wikipedia_train_data = graphcaption_wikipedia_train_data[:80000]

In [383]:
graphcaption_wikipedia_train_data[230]

{'task_name': 'graph-language-modeling-graph-caption-generation-wikipedia',
 'idx': 464528,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="wikipedia-knowledge-graph"] {\n    entity_list = [\'the founder\', \'hannah\', \'samuel henry\', \'january\', \'tobacco\', \'helena\', \'samuel gluckstein\', \'factory\', \'married\', \'four sons\', \'samuel\', \'daughters\', \'lehmann\', \'the son\', \'the east end\', \'germany\', \'london\', \'children\', \'merchants\', \'rheinberg\', \'netherlands\', \'salmon & gluckstein\', \'23 january\', \'prussia\', \'joseph\'];\n    triple_list = [("january" -> "23 january")[relation="has part"], ("the east end" -> "londo

In [430]:
def graphconstruction_wikipedia_graph_language(node_list: list, graph: list):
    task_name = "knowledge-graph"
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    entity_list = <node_list>;
    triple_list = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(i) for i in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\")[relation=\"{}\"]".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))


In [431]:
def graphconstruction_wikipedia_instruction_unfaithful_unfactual_conflict_missing_graph(text: str, gcl, graph, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "knowledge-graph"
    passage = "Passage: \"{}\".".format(text)
    entity_list = "Entites: {}.".format(", ".join(["\"{}\"".format(i) for i in graph["node_list"]]))
    # instruction = "Task definition: extract some entities and corresponding structure factual triples from the passage, and generate a graph language to describe the graph. "
    instruction = "Task definition: given a passage and an entities list, extract some corresponding structure factual triples from the passage to form a knowledge graph, and generate a graph language to describe the graph. "
    instruction += "Note that: 1) the graph is a directed graph and the name is \"{}\". ".format(task_name)
    instruction += "2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:"
    instruction += """\n```\nGraph[name="knowledge-graph"] {
    entity_list = [\"xxx\", ...];
    triple_list = [(\"xxx\" -> \"xxx\")[relation=\"xxx\"], ...];\n}\n```"""
    query = "Q: Given you a passage and all entities, please generate a corresponding knowledge graph."
    final_instruction = "{}\n{}\n{}\n{}\n{}\n{}\nA:".format(system_instruction2, note_instruciton, instruction, passage, entity_list, query)
    if do_print:
        print(final_instruction)
        print("answer=", relation)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": graph
    }

def graphconstruction_wikipedia_dataset_unfaithful_unfactual_conflict_missing_graph(examples):
    task_name = "graph-construction-modeling-knowledge-graph-generation-wikipedia"
    final_data = list()
    
    for ei, example in enumerate(tqdm(examples)):
        gcl = example["graph_language"].replace("'", "\"")
        answer = example["answer"][0]
        graph = example["graph"]

        # 对graph进行负采样
        node_list = graph["node_list"]
        edge_list = graph["edge_list"]
        wrong_triple_num = random.randint(0, 5)


        for _ in range(wrong_triple_num):
            if len(edge_list) == 0:
                continue
            hallucination_type = random.random()
            random_entity = all_wikidata_entity_list[random.randint(0, len(all_wikidata_entity_list) - 1)]
            random_relation = all_wikidata_relation_list[random.randint(0, len(all_wikidata_relation_list) - 1)]
            random_triple = all_wikidata_edge_list[random.randint(0, len(all_wikidata_edge_list) - 1)]
            
            if hallucination_type < 0.2:
                # unfaithful entity
                # 随机选择一个entity，将其替换为错误的实体，使得node与edge存在不一致
                random_select_entity = node_list[random.randint(0, len(node_list) - 1)]
                node_list.remove(random_select_entity)
                node_list.append(random_entity)
                
            elif hallucination_type < 0.4:
                # 随机选择一个三元组，并将实体或者关系进行随机替换，表示unfactual graph
                random_select_triple = edge_list[random.randint(0, len(edge_list) - 1)]
                edge_list.remove(random_select_triple)
                if random.random() < 0.5:
                    # 随机替换实体
                    edge_list.append([random_select_triple[0], random_select_triple[1], random_entity])
                    if random_entity not in node_list:
                        node_list.append(random_entity)
                else:
                    edge_list.append([random_select_triple[0], random_relation, random_select_triple[2]])

            elif hallucination_type < 0.6:
                # 随机选择一个三元组，复制一份，并随机替换其中关系，形成conflict graph
                random_select_triple = edge_list[random.randint(0, len(edge_list) - 1)]
                if random.random() < 0.5:
                    # 随机替换实体
                    edge_list.append([random_select_triple[0], random_select_triple[1], random_entity])
                    if random_entity not in node_list:
                        node_list.append(random_entity)
                else:
                    edge_list.append([random_select_triple[0], random_relation, random_select_triple[2]])

            elif hallucination_type < 0.8:
                # 随机选择一个三元组，将其删除，表示missing graph
                random_select_triple = edge_list[random.randint(0, len(edge_list) - 1)]
                edge_list.remove(random_select_triple)
                
            else:
                # 随机添加一个三元组，表示redundant graph
                edge_list.append(random_triple)
                if random_triple[0] not in node_list:
                    node_list.append(random_triple[0])
                if random_triple[2] not in node_list:
                    node_list.append(random_triple[2])

        shuffle(node_list)
        shuffle(edge_list)
        gcl_negative = graphconstruction_wikipedia_graph_language(node_list, edge_list)

        instruction = graphconstruction_wikipedia_instruction_unfaithful_unfactual_conflict_missing_graph(answer, gcl, graph)
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": "(refer to 'answer_positive')",
            "graph": instruction["graph"],
            "answer_positive": [gcl],
            "answer_negative": [gcl_negative],
            "hallucination_type": "unfaithful_unfactual_conflict_missing_graph",
            "answer_with_cot": [],
            "difficulty": "medium",
            "from": "GraphCaption-Wikipedia",
        })
    print("total number: {}".format(len(final_data)))
    return final_data

graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph = {
    "train": graphconstruction_wikipedia_dataset_unfaithful_unfactual_conflict_missing_graph(graphcaption_wikipedia_train_data),
    "test": graphconstruction_wikipedia_dataset_unfaithful_unfactual_conflict_missing_graph(graphcaption_wikipedia_test_data),
}

100%|██████████| 80000/80000 [00:03<00:00, 21498.53it/s]


total number: 80000


100%|██████████| 2000/2000 [00:00<00:00, 21599.97it/s]

total number: 2000





In [432]:
print(graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph["test"][1374]["instruction"])
print(graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph["test"][1374]["answer_positive"])
print(graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph["test"][1374]["answer_negative"])

You are a good graph generator. You need to understand the task definition and generate a graph language to answer the question. 
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
Task definition: given a passage and an entities list, extract some corresponding structure factual triples from the passage to form a knowledge graph, and generate a graph language to describe the graph. Note that: 1) the graph is a directed graph and the name is "knowledge-graph". 2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:
```
Graph[name="knowledge-graph"] {
    entity_list = ["xxx", ...];
    triple_list = [("xxx" -> "xxx")[relation="xxx"], ...];
}
```
Passage: "Potulice () is a village in the administrative district of Gmina Wągrowiec, within Wągrowiec County, Greater Poland Voivodeship, in west-central Poland. It lies

In [433]:
# 构造为如下格式
graphconstruction_wikipedia_preference_data = {
    "train": graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph["train"],
    "test": graphconstruction_wikipedia_instruction_data_unfaithful_unfactual_conflict_missing_graph["test"],
}
GraphConstruction_Wikipedia_preference_benchmark_dict = {
    "graph-construction-modeling-knowledge-graph-generation-wikipedia": graphconstruction_wikipedia_preference_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [434]:
np.save("preference_dataset/graphconstructionmodeling_knowledgegraphgeneration_wikipedia_preference_dataset.npy", GraphConstruction_Wikipedia_preference_benchmark_dict)


### （2）IEInstructions（InstructionUIE）

- 论文：https://arxiv.org/pdf/2304.08085.pdf
- 地址：https://github.com/BeyonderXX/InstructUIE

主要使用关系抽取数据，给定一个短文本，模型抽取实体及其关系。

### （3）InstructIE（InstructKGC）

- 地址：https://github.com/zjunlp/DeepKE/tree/main/example/llm/InstructKGC#22datasets

主要使用关系抽取数据，给定一个短文本，模型抽取实体及其关系。

## 3.2 Structure Graph Generation
输入一个graph的属性，和指令，让LLM生成一个Graph。例如：

> 给你A、B、C、D和E五个节点，请生成一个全连接图。可以直接使用NLGraph提供的一些数据

### (1) Undirected Graph Generation (Connectivity)
可直接使用NLGraph Connectivity的数据
- 数据集提供的prompt可以作为描述graph的prompt；
- GCL转换后的graph language可以作为生成内容

### (2) Undirected Graph Generation (Cycle)
可直接使用NLGraph Cycle的数据
- 数据集提供的prompt可以作为描述graph的prompt；
- GCL转换后的graph language可以作为生成内容

### (3) Undirected-weighted Graph Generation (Shortest Path)
可直接使用NLGraph Shortest Path的数据
- 数据集提供的prompt可以作为描述graph的prompt；
- GCL转换后的graph language可以作为生成内容

### (4) Directed-weighted Graph Generation (Flow)
可直接使用NLGraph Maximum Flow的数据
- 数据集提供的prompt可以作为描述graph的prompt；
- GCL转换后的graph language可以作为生成内容

### （5）Graph Structure Editing

输入一个Graph language，输入一个修改的指令，让LLM生成一个修改后的Graph language

# 四、Graph Thought Modeling

NLP推理
- 给定一个数学题；
- 通过ICL的方法调用ChatGPT API获得模型针对该问题推理的evidence triple；
- 将正确的evidence triple与answer作为训练样本

任务类型：
- **Arithmetic**：AQuA、GSM8K、MultiArith、SVAMP
- **Factual（Commonsense）**：ARC-c、CSQA、OpenBookQA、StrategyQA
- **Symbolic**：CoinFlip、LastLetters

  

### （1）NLP Reasoning

In [499]:
graphthought_data_dir = "NLPReasoning/"

In [500]:
nlpreasoning_task_list = ["Arithmetic", "Factual", "Symbolic"]

In [501]:
def process_aqua(file_path):
    data = list()
    label2id = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4, "F": 5}
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
        for line in lines:
            example = json.loads(line.strip())
            answer = example["options"][label2id[example["correct"]]][2:]
            rationale = "\n".join(example["rationale"].split("\n")[:-1]).replace("Explanation :\n", "") + "\nSo the answer is {}.".format(answer)
            data.append({
                "question": example["question"],
                "answer": [answer],
                "answer_with_cot": [rationale],
                "evidence_triples": [],
                "from": "AQuA",
            })
    return data, "AQuA"

def process_gsm8k(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        example = json.loads(line.strip())
        rationale, label = example["answer"].split("\n#### ")
        data.append({
            "question": example["question"],
            "answer": [label],
            "answer_with_cot": [rationale],
            "evidence_triples": [],
            "from": "GSM8K",
        })
    return data, "GSM8K"

def process_multiarith(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        examples = json.load(fr)
    for example in examples:
        data.append({
            "question": example["sQuestion"],
            "answer": [str(example["lSolutions"][0])],
            "answer_with_cot": [example["lEquations"][0] + "=" + str(example["lSolutions"][0])],
            "evidence_triples": [],
            "from": "MultiArith",
        })
    return data, "MultiArith"

def process_svamp(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        examples = json.load(fr)
    for example in examples:
        data.append({
            "question": example["Question"],
            "answer": [str(example["Answer"])],
            "answer_with_cot": [example["Equation"] + " = " + str(example["Answer"])],
            "evidence_triples": [],
            "from": "SVAMP",
        })
    return data, "SVAMP"

def process_arcc(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        example = json.loads(line.strip())
        label2text = {i["label"]: i["text"] for i in example["question"]["choices"]}
        answer = label2text[example["answerKey"]]
        data.append({
            "question": example["question"]["stem"],
            "answer": [answer],
            "answer_with_cot": [],
            "evidence_triples": [],
            "from": "ARC-c",
        })
    return data, "ARC-c"

def process_strategyqa(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        examples = json.load(fr)
    examples = examples["examples"]
    for example in examples:
        for label, tag in example["target_scores"].items():
            if tag == 1:
                answer = label
        data.append({
            "question": example["input"],
            "answer": [answer],
            "answer_with_cot": [example["target"]],
            "evidence_triples": [],
            "from": "StrategyQA"
        })
    return data, "StrategyQA"

def process_csqa(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        example = json.loads(line.strip())
        label2text = {i["label"]: i["text"] for i in example["question"]["choices"]}
        answer = label2text[example["answerKey"]]
        data.append({
            "question": example["question"]["stem"],
            "answer": [answer],
            "answer_with_cot": [],
            "evidence_triples": [],
            "from": "CommonsenseQA",
        })
    return data, "CommonsenseQA"

def process_openbookqa(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for line in lines:
        example = json.loads(line.strip())
        label2text = {i["label"]: i["text"] for i in example["question"]["choices"]}
        answer = label2text[example["answerKey"]]
        data.append({
            "question": example["question"]["stem"],
            "answer": [answer],
            "answer_with_cot": [],
            "evidence_triples": [],
            "from": "OpenBookQA",
        })
    return data, "OpenBookQA"

def process_conflip(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        examples = json.load(fr)
    for example in examples["examples"]:
        data.append({
            "question": example["question"],
            "answer": [example["answer"]],
            "answer_with_cot": [],
            "evidence_triples": [],
            "from": "Coin-Flip",
        })
    return data, "Coin-Flip"

def process_lastletters(file_path):
    data = list()
    with open(file_path, "r", encoding="utf-8") as fr:
        examples = json.load(fr)
    for example in examples["examples"]:
        data.append({
            "question": example["question"],
            "answer": [example["answer"]],
            "answer_with_cot": [],
            "evidence_triples": [],
            "from": "Last-Letters",
        })
    return data, "Last-Letters"


graphthought_task_mapping = {
    "aqua": process_aqua,
    "gsm8k": process_gsm8k,
    "multiarith": process_multiarith,
    "svamp": process_svamp,
    "arc-c": process_arcc,
    "strategyqa": process_strategyqa,
    "csqa": process_csqa,
    "openbookqa": process_openbookqa,
    "coin_flip": process_conflip,
    "last_letters": process_lastletters,
}

In [502]:
def load_graphthought_train_data(data_dir, task_list):

    def str2tuple(triple_str):
        # input a string, e.g., "(Janet, bake, 4 eggs for muffins)"
        # output a tuple, e.g., ('Janet', 'bake', '4 eggs for muffins')
        return tuple([i.replace("(", "").replace(")", "") for i in triple_str.split(", ")])
    
    train_data, test_data = list(), list()
    for task in tqdm(task_list):
        data_file_list = os.listdir(os.path.join(data_dir, task))
        for data_file in data_file_list:
            if data_file == ".DS_Store":
                continue
            # 测试集
            test_example, task_name = graphthought_task_mapping[data_file](os.path.join(data_dir, task, data_file, "test.json"))
            test_data.extend(test_example)
            
            # 训练集
            with open(os.path.join(data_dir, task, data_file, "train.json"), "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            for line in lines:
                example = json.loads(line.strip())
                if example["label"] != example["pred"]:
                    continue
                prompt = example["prompt"]
                question = prompt.split("\n\n")[-1][3:].strip()
                reasoning_chains = example["reasoning_chains"]
                if "Explanation hints" not in reasoning_chains or "Evidence triples" not in reasoning_chains:
                    continue
                if "\nExplanation hints: " in reasoning_chains:
                    evidence_triples = [str2tuple(i.split(". ")[1]) for i in reasoning_chains.split("\nExplanation hints: ")[0].replace("A: Evidence triples:\n", "").split("\n")]
                    cot = reasoning_chains.split("\nExplanation hints: ")[1].split("\n")[0]
                elif " \nEvidence triples:\n" in reasoning_chains:
                    cot = reasoning_chains.split(" \nEvidence triples:\n")[0].replace("A: Explanation hints: ", "")
                    evidence_triples = [str2tuple(i.split(". ")[1]) for i in reasoning_chains.split(" \nEvidence triples:\n")[1].split("\n")[:-1] if i.strip() != ""]
                
                train_data.append({
                    "question": question,
                    "answer": [example["label"]],
                    "answer_with_cot": [cot],
                    "evidence_triples": evidence_triples,
                    "from": task_name,
                })
            
            
    print("train data num:", len(train_data))
    print("test data num:", len(test_data))
    return train_data, test_data
graphthought_train_data, graphthought_test_data = load_graphthought_train_data(graphthought_data_dir, nlpreasoning_task_list)

100%|██████████| 3/3 [00:00<00:00, 35.98it/s]

train data num: 72
test data num: 9356





In [503]:
shuffle(graphthought_test_data)
graphthought_test_data = graphthought_test_data[:2000]

In [504]:
graphthought_train_data[44]

{'question': 'A coin is heads up. Lucky does not flip the coin. Mireya flips the coin. Jj flips the coin. Kc flips the coin. Is the coin still heads up? Note that "flip" here means "reverse".',
 'answer': ['no'],
 'answer_with_cot': ['The coin was flipped by Mireya, Jj, and Kc. So the coin was flipped 3 times, which is an odd number. The coin started heads up, so after an odd number of flips, it will be tails up. '],
 'evidence_triples': [('coin', 'start with', 'head up'),
  ('coin', 'flips', 'flipped'),
  ('coin', 'not flips', 'flipped')],
 'from': 'Coin-Flip'}

**构建instruciton数据集**

In [505]:
def graphthought_graph_language(task_name: str, node_list: list, graph: list):
    # 配置Graph Language
    gcl = """```\nGraph[name="<task_name>"] {
    node_list = <node_list>;
    edge_list = <triple_list>\n}\n```"""
    node_list = "[" + ", ".join(["\"{}\"".format(node) for node in node_list]) + "]"
    triple_list = "[" + ", ".join(["(\"{}\" -> \"{}\"[relation=\"{}\"])".format(triple[0], triple[2], triple[1]) for triple in graph]) + "];"
    return gcl.replace("<task_name>", task_name).replace("<node_list>", node_list).replace("<triple_list>", str(triple_list))

    

In [506]:
def graphthought_instruction(question: str, gcl, node_list, edge_list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "reasoning-thought-graph"
    # instruction = "Task definition: extract some entities and corresponding structure factual triples from the passage, and generate a graph language to describe the graph. "
    instruction = "Task definition: given a reasoning problem, please think step by step: 1) generate a thought graph that expresses the reasoning evidence, 2) then generate a thinking explanation to describe this reasoning, and 3) finally output the final answer. "
    instruction += "Note that: 1) the graph is a directed-weighted graph and the name is \"{}\". ".format(task_name)
    instruction += "2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:"
    instruction += """\n```\nGraph[name="reasoning-thought-graph"] {
    entity_list = [\"xxx\", ...];
    triple_list = [(\"xxx\" -> \"xxx\")[relation="xxx"], ...];\n}\n```"""
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\nA:".format(system_instruction2, note_instruciton, instruction, query)
    if do_print:
        print(final_instruction)
        # print("answer=", relation)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

In [510]:
def graphthought_dataset(data: dict):
    task_name = "graph-thought-modeling-natural-language-reasoning-nlpreasoning"
    # print(data)
    final_data = list()
    for ei, example in enumerate(tqdm(data)):
        question = example["question"]
        edge_list = example["evidence_triples"]
        answer_with_cot = example["answer_with_cot"]
        answer = example["answer"][0]
        new_edge_list = list()
        node_list = set()
        for edge in edge_list:
            if len(edge) == 3:
                node_list.add(edge[0])
                node_list.add(edge[2])
                new_edge_list.append(edge)
            elif len(edge) == 2:
                node_list.add(edge[0])
                node_list.add(edge[1])
                new_edge_list.append((edge[0], "is", edge[1]))
            elif len(edge) > 3:
                node_list.add(edge[0])
                node_list.add(edge[-1])
                new_edge_list.append((edge[0], ", ".join(list(edge[1:-1])), edge[-1]))
        node_list = list(node_list)
        edge_list = new_edge_list
        if len(edge_list) > 0:
            gcl = graphthought_graph_language("reasoning-thought-graph", node_list, edge_list)
        else:
            gcl = ""
        instruction = graphthought_instruction(question, gcl, node_list, edge_list)
        if gcl != "":
            final_answer = "The thought graph that expresses the reasoning evidence is {}. \n".format(gcl)
            final_answer += "{} \n".format(answer_with_cot[0].strip())
            final_answer += "So the answer is {}. ".format(answer)
        else:
            if len(answer_with_cot) > 0:
                final_answer = "{} \n".format(answer_with_cot[0].strip())
                final_answer += "So the answer is {} ".format(answer)
            else:
                final_answer = "So the answer is {}. ".format(answer)
            
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": "(refer to 'answer')",
            "answer": [final_answer],
            "answer_with_cot": example["answer_with_cot"],
            "difficulty": "medium",
            "from": "NLPReasonong-{}".format(example["from"]),
        })
    print("total number: {}".format(len(final_data)))
    return final_data
        

In [511]:
graphthought_instruction_data = {
    "train": graphthought_dataset(graphthought_train_data),
    "test": graphthought_dataset(graphthought_test_data),
}

100%|██████████| 72/72 [00:00<00:00, 38411.33it/s]


total number: 72


100%|██████████| 2000/2000 [00:00<00:00, 115343.79it/s]

total number: 2000





In [781]:
print(graphthought_instruction_data["test"][1992]["instruction"])
print(graphthought_instruction_data["test"][1992]["answer"])

You are a good graph generator. You need to understand the task definition and generate a graph language to answer the question. 
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
Task definition: given a reasoning problem, please think step by step: 1) generate a thought graph that expresses the reasoning evidence, 2) then generate a thinking explanation to describe this reasoning, and 3) finally output the final answer. Note that: 1) the graph is a directed-weighted graph and the name is "reasoning-thought-graph". 2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:
```
Graph[name="reasoning-thought-graph"] {
    entity_list = ["xxx", ...];
    triple_list = [("xxx" -> "xxx")[relation="xxx"], ...];
}
```
Q: A starts a business with Rs.40,000. After 2 months, B joined him with Rs.60,000. C joined them after 

In [513]:
# 构造为如下格式
GraphThought_instruction_benchmark_dict = {
    "graph-thought-modeling-natural-language-reasoning-nlpreasoning": graphthought_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [514]:
np.save("instruction_dataset/graphthoughtmodeling_instruction_dataset.npy", GraphThought_instruction_benchmark_dict)


### （2）Knowledge Probing
类似于KBQA，只是将KB去掉，只让模型根据question来回答问题，回答问题的时候首先先生成一个reasoning path

In [481]:
kbqa_processed_data_files = [
    "./instruction_dataset/graphlanguagemodeling_graphquestionanswering_grailqa_instruction_dataset.npy",
    "./instruction_dataset/graphlanguagemodeling_graphquestionanswering_pathquestion_instruction_dataset.npy",
    "./instruction_dataset/graphlanguagemodeling_graphquestionanswering_webquestions_instruction_dataset.npy",
]

In [482]:
def load_kbqa_data(data_files):
    train_data, test_data = list(), list()
    for file in data_files:
        data_dict = np.load(file, allow_pickle=True)[()]
        for _, data in data_dict.items():
            train_data.extend(data["train"])
            test_data.extend(data["test"])
    return train_data, test_data
probing_train_data, probing_test_data = load_kbqa_data(kbqa_processed_data_files)

In [483]:
def graphthought_probing_instruction(question: str, gcl, node_list, edge_list, do_print: bool = False):
    # 配置 instruction prompt
    task_name = "factual-knowledge-graph"
    # instruction = "Task definition: extract some entities and corresponding structure factual triples from the passage, and generate a graph language to describe the graph. "
    instruction = "Task definition: given a factual knowledge question, please think step by step: 1) find the topic entity and generate a corresponding knowledge subgraph to express the world knowledge information, 2) then generate a thinking explanation to describe this reasoning, and 3) finally output the final answer. "
    instruction += "Note that: 1) the graph is a directed-weighted graph and the name is \"{}\". ".format(task_name)
    instruction += "2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:"
    instruction += """\n```\nGraph[name="factual-knowledge-graph"] {
    entity_list = [\"xxx\", ...];
    triple_list = [(\"xxx\" -> \"xxx\")[relation="xxx"], ...];\n}\n```"""
    query = "Q: {}".format(question)
    final_instruction = "{}\n{}\n{}\n{}\nA:".format(system_instruction2, note_instruciton, instruction, query)
    if do_print:
        print(final_instruction)
        # print("answer=", relation)
    return {
        "instruction": final_instruction,
        "graph_language": gcl,
        "graph": {
            "node_list": node_list,
            "edge_list": edge_list,
        }
    }

In [476]:
probing_train_data[169]

{'task_name': 'graph-language-modeling-graph-question-answering-grailqa',
 'idx': 505,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="freebase-knowledge-base"] {\n    entity_list = [\'Marcus Fenix\', \'Rezin Schneider\', \'Soldier\', \'marcus\', \'Male\', \'Female\', \'Character Occupation\'];\n    triple_list = [("Rezin Schneider" -> "Character Occupation")[relation="Occupation"], ("Marcus Fenix" -> "Male")[relation="gender"], ("Rezin Schneider" -> "Female")[relation="gender"], ("Rezin Schneider" -> "Soldier")[relation="occupation"], ("marcus" -> "Character Occupation")[relation="Occupation"], ("Marcus Fenix" -> "Soldier")[relation="occupation"]];\

In [494]:
def graphthought_probing_dataset(data: dict):
    task_name = "graph-thought-modeling-factual-knowledge-probing-kbqa"
    # print(data)
    final_data = list()
    for ei, example in enumerate(tqdm(data)):
        gcl = example["graph_language"]
        graph = example["graph"]
        question = example["instruction"].split("\n")[-2][3:]
        answer = example["answer"][0]
        reasoning_triples = answer.split(", so the answer entity is ")[0].replace("Based on the graph, we can find a reasoning path (", "")
        topic_entity = reasoning_triples.split(", ")[0]
        
        if not "Based on the graph, we can find a reasoning path" in answer:
            continue
        final_answer = "To answer this question, we first find the topic entity is \"{}\".\n".format(topic_entity.replace("'", ""))
        final_answer += "Then, we construct a knowledge subgraph of the topic entity, the graph language is:\n{}\n".format(gcl)
        final_answer += answer + "."
        node_list = graph["node_list"]
        edge_list = graph["edge_list"]
        instruction = graphthought_probing_instruction(question, gcl, node_list, edge_list)
        
        final_data.append({
            "task_name": task_name,
            "idx": ei,
            "instruction": instruction["instruction"],
            "graph_language": instruction["graph_language"],
            "graph": "(refer to 'answer')",
            "answer": [final_answer],
            "answer_with_cot": example["answer_with_cot"],
            "difficulty": "medium",
            "from": "Probing-{}".format(example["from"]),
        })
    print("total number: {}".format(len(final_data)))
    return final_data
        

In [495]:
graphthought_probing_instruction_data = {
    "train": graphthought_probing_dataset(probing_train_data),
    "test": graphthought_probing_dataset(probing_test_data),
}

100%|██████████| 63211/63211 [00:00<00:00, 193229.51it/s]


total number: 13025


100%|██████████| 4463/4463 [00:00<00:00, 159523.95it/s]

total number: 1389





In [784]:
print(graphthought_probing_instruction_data["test"][1170]["instruction"])
print(graphthought_probing_instruction_data["test"][1170]["answer"][0])

You are a good graph generator. You need to understand the task definition and generate a graph language to answer the question. 
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
Task definition: given a factual knowledge question, please think step by step: 1) find the topic entity and generate a corresponding knowledge subgraph to express the world knowledge information, 2) then generate a thinking explanation to describe this reasoning, and 3) finally output the final answer. Note that: 1) the graph is a directed-weighted graph and the name is "factual-knowledge-graph". 2) The generated graph language should be a code-like structure, and the skeleton format can be expressed as the following:
```
Graph[name="factual-knowledge-graph"] {
    entity_list = ["xxx", ...];
    triple_list = [("xxx" -> "xxx")[relation="xxx"], ...];
}
```
Q: what is the capital city of the location w

In [497]:
# 构造为如下格式
GraphThought_probing_instruction_benchmark_dict = {
    "graph-thought-modeling-factual-knowledge-probing-kbqa": graphthought_probing_instruction_data,
}

**存储数据集**

结构：
```json
{
    "<task_name>": {
        "train": [xxx, xxx],
        "test" : [xxx, xxx],
    },
    ...
}
```

In [498]:
np.save("instruction_dataset/graphthoughtmodeling_factualknowledgeprobing_kbqa_instruction_dataset.npy", GraphThought_probing_instruction_benchmark_dict)
