# Graph Instruction数据集重处理

对各个 graph task-specific 任务分别处理后得到的 instruction 数据集进行重处理，主要包括以下几个环节：
- 数据质量抽样调查；
- 数据分布统计；
- 分词（tokenizer）检测与长度检测；
- 测试集采样

In [1]:
import os
import json
import numpy as np
from tqdm import tqdm
from random import shuffle
import random

In [2]:
# Directory containing the instruction dataset files to be loaded.
instruction_data_dir = "./instruction_dataset/"

In [4]:
# Aggregate every per-task file in `instruction_data_dir` into four flat lists:
# the full train/test sets and a "small" variant holding a random 10% of each
# task's train/test examples.
instruction_train_data = list() # regular-scale
instruction_test_data = list()
instruction_train_data_small = list() # small-scale
instruction_test_data_small = list() # small-scale
test_data_num = 0
for data_file in tqdm(os.listdir(instruction_data_dir)):
    # Skip entries that are explicitly excluded from this aggregation.
    if data_file in [".DS_Store", "released", "instruction_train_data_forcot.npy", "instruction_train_data_forcot_2.npy"]:
        continue
    # `[()]` unwraps the loaded array into the object it stores; that object is
    # used as a mapping (`.items()`) below.
    # NOTE(review): allow_pickle=True unpickles arbitrary objects -- only load trusted files.
    data = np.load(os.path.join(instruction_data_dir, data_file), allow_pickle=True)[()]
    for task_name, task_data in data.items():
        task_train_data, task_test_data = task_data["train"], task_data["test"]
        # The full-scale lists receive the examples in their stored order,
        # because `extend` runs before the in-place shuffle below.
        instruction_train_data.extend(task_train_data)
        instruction_test_data.extend(task_test_data)
        test_data_num += len(task_test_data)
        
        # Shuffle in place, then keep the first 10% as the small-scale subset.
        # NOTE(review): no random seed is set in this cell, so the subset may
        # differ between runs. The small lists reference the same example
        # objects as the full lists (no copies are made).
        shuffle(task_train_data)
        shuffle(task_test_data)
        task_train_data = task_train_data[:int(0.1 * len(task_train_data))]
        task_test_data = task_test_data[:int(0.1 * len(task_test_data))]
        instruction_train_data_small.extend(task_train_data)
        instruction_test_data_small.extend(task_test_data)

100%|███████████████████████████████████████████| 37/37 [01:58<00:00,  3.19s/it]


In [5]:
# Report how many examples each of the four aggregated lists contains.
for _label, _count in (
    ("instruction train data num: ", len(instruction_train_data)),
    ("instruction test data num: ", test_data_num),
    ("instruction train data small num: ", len(instruction_train_data_small)),
    ("instruction test data small num: ", len(instruction_test_data_small)),
):
    print(_label, _count)

instruction train data num:  1603436
instruction test data num:  50169
instruction train data small num:  160335
instruction test data small num:  5007


In [5]:
# Display the training example at index 19 for manual inspection.
instruction_train_data[19]

{'task_name': 'graph-language-modeling-graph-question-answering-webquestions',
 'idx': 19,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="freebase-knowledge-base"] {\n    entity_list = [\'Dr. Lawrence Angelo\', \'The Elder Brother\', \'Fabian\', \'Archy Stallings\', \'Old Man Willow\', \'Edgar Derby\', \'Male\', \'Simon Phoenix\', \'Brian Chavez\', \'Matter-Eater Lad\', \'Buddy Walling\', \'Peter Hines\', \'Zeus\', \'Trevor West\', \'Ogden Morrow\', \'Aegeus\', \'Ted\', \'Local Man\', \'Cura\', \'Tomohisa Kaname\', \'Priest A\', \'Alexander Kaseph\', \'Alfred Bellows\', \'Tarble\', \'Jewish Official\', \'Spirit\', \'Policeman Fire\'];\n    triple_li

In [62]:
# Check that the first endpoint of the first edge of example 19 can be cast to
# int. The displayed type is always `int` whenever the cast succeeds, so this
# only proves castability, not the stored type.
type(int(instruction_train_data[19]["graph"]["edge_list"][0][0]))

int

## 一、数据质量抽样
随机抽样一部分样本，进行人工查阅

In [6]:
def process(examples: list):
    """Normalize instruction examples in place so their fields use plain Python types.

    For every answerable example this:
      * casts each item of ``answer_with_cot`` to ``str``;
      * converts set values stored directly in ``example["graph"]`` (and in its
        ``node_feature`` mapping) to lists;
      * converts NumPy integer keys of ``node_feature`` (any width, not only
        ``np.int32``) to ``str``;
      * for tasks whose name contains "graph-structure-modeling" or
        "structure-graph-generation", casts the members of ``node_list`` and
        ``edge_list`` to ``str``.

    Examples whose answer is empty (``""``, an empty sequence, or a first item
    equal to ``""``) are counted as unanswerable and left untouched; they are
    NOT removed from the list.

    Args:
        examples: list of example dicts; the dicts are mutated in place.

    Returns:
        The same ``examples`` list that was passed in.
    """
    unanswerable_num = 0
    for example in tqdm(examples):
        task_name = example["task_name"]
        answer = example["answer"]
        if answer == "" or len(answer) == 0 or answer[0] == "":
            unanswerable_num += 1
            continue

        example["answer_with_cot"] = [str(i) for i in example["answer_with_cot"]]

        graph = example["graph"]
        if not isinstance(graph, dict):
            continue

        # Only existing keys are reassigned, so mutating while iterating is safe.
        for key, values in graph.items():
            if isinstance(values, (set, frozenset)):
                graph[key] = list(values)

        if "node_feature" in graph:
            new_feature = {}
            for k, v in graph["node_feature"].items():
                # np.integer covers int8/16/32/64 and the unsigned variants.
                if isinstance(k, np.integer):
                    k = str(k)
                if isinstance(v, (set, frozenset)):
                    v = list(v)
                new_feature[k] = v
            graph["node_feature"] = new_feature

        if "graph-structure-modeling" in task_name or "structure-graph-generation" in task_name:
            try:
                if "node_list" in graph:
                    graph["node_list"] = [str(i) for i in graph["node_list"]]
                if "edge_list" in graph:
                    graph["edge_list"] = [[str(i) for i in pair] for pair in graph["edge_list"]]
            except (TypeError, ValueError):
                # Best effort: a malformed node/edge list (e.g. a non-iterable
                # entry) is left as it is instead of aborting the whole pass.
                continue
    # The label's spelling is kept as-is so the output matches earlier runs.
    print("unanserable_num=", unanswerable_num)
    return examples


instruction_train_data, instruction_test_data = process(instruction_train_data), process(instruction_test_data)
instruction_train_data_small, instruction_test_data_small = process(instruction_train_data_small), process(instruction_test_data_small)


100%|█████████████████████████████| 1603436/1603436 [00:06<00:00, 246807.31it/s]


unanserable_num= 19


100%|█████████████████████████████████| 50169/50169 [00:00<00:00, 123078.46it/s]


unanserable_num= 27


100%|████████████████████████████████| 160335/160335 [00:02<00:00, 73984.17it/s]


unanserable_num= 2


100%|████████████████████████████████████| 5007/5007 [00:00<00:00, 43870.92it/s]

unanserable_num= 1





In [7]:
# Drop examples whose instruction + answer exceeds a word-count budget.
def clip_max_length(data, max_length=390):
    """Return the examples whose instruction-plus-answer length is below ``max_length``.

    Length is the number of pieces obtained by splitting the concatenation of
    the instruction and the first answer on single spaces. The two strings are
    concatenated without a separator, exactly as before, so the count is
    unchanged for inputs that worked previously.

    Args:
        data: iterable of example dicts with "instruction" and "answer" entries.
        max_length: examples with a length greater than or equal to this value
            are dropped. Defaults to 390, the previously hard-coded threshold.

    Returns:
        A new list containing the retained examples (the same dict objects).
    """
    examples = []
    for example in tqdm(data):
        answer = example["answer"]
        # An empty answer previously raised IndexError here; such examples are
        # now measured by their instruction alone.
        first_answer = answer[0] if answer else ""
        instruction_with_answer_len = len((example["instruction"] + first_answer).split(" "))
        if instruction_with_answer_len < max_length:
            examples.append(example)
    return examples


instruction_train_data, instruction_test_data = clip_max_length(instruction_train_data), clip_max_length(instruction_test_data)
instruction_train_data_small, instruction_test_data_small = clip_max_length(instruction_train_data_small), clip_max_length(instruction_test_data_small)



100%|██████████████████████████████| 1603436/1603436 [00:24<00:00, 64416.09it/s]
100%|██████████████████████████████████| 50169/50169 [00:01<00:00, 46354.57it/s]
100%|████████████████████████████████| 160335/160335 [00:04<00:00, 38341.21it/s]
100%|████████████████████████████████████| 5007/5007 [00:00<00:00, 34494.17it/s]


In [8]:
# Display the current number of training examples.
len(instruction_train_data)

1457122

In [9]:
# Display the training example at index 190344 for manual inspection.
instruction_train_data[190344]

{'task_name': 'graph-language-modeling-graph-caption-generation-eventna',
 'idx': 22635,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="factual-graph"] {\n    entity_list = ["speed skating at the 2003 Asian Winter Games – women\'s 500 metres", \'2003\', \'Japan\'];\n    triple_list = [("speed skating at the 2003 Asian Winter Games – women\'s 500 metres" -> "Japan")[relation="country"], ("speed skating at the 2003 Asian Winter Games – women\'s 500 metres" -> "2003")[relation="point in time"]];\n}\n```\nTask definition: given an event title and a factual knowledge graph, generate an event narration.\nQ: Please generate an event narration based on the 

**数据分层**

instructGraph数据包含4个capacity corner：
- graph structure modeling
- graph language modeling
- graph construction modeling
- graph thought modeling

每个capacity corner中包含若干task cluster，每个cluster中则为具体的task，分布情况如下所示

In [10]:
# capacity -> cluster -> set of source dataset names (the "from" field).
instructgraph_task_info = {}
# Every distinct source dataset name seen in the training set.
instructgraph_taskset = set()

# The train and test splits share the same hierarchy, so scanning the training
# set alone is sufficient.
for example in tqdm(instruction_train_data):
    name_parts = example["task_name"].split("-")
    # The first three dash-separated parts name the capacity, the next (up to)
    # three name the cluster.
    capacity = "-".join(name_parts[:3])
    cluster = "-".join(name_parts[3:6])
    task_name = example["from"]

    instructgraph_task_info.setdefault(capacity, dict()).setdefault(cluster, set()).add(task_name)
    instructgraph_taskset.add(task_name)

# Counts are derived from the finished mapping; clusters are counted once per
# capacity they appear under.
task_capacity_num = len(instructgraph_task_info)
task_cluster_num = sum(len(clusters) for clusters in instructgraph_task_info.values())

print("capacity name num: {}".format(task_capacity_num))
print("cluster name num: {}".format(task_cluster_num))
print("task name num: {}".format(len(instructgraph_taskset)))

100%|█████████████████████████████| 1457122/1457122 [00:07<00:00, 197274.43it/s]

capacity name num: 4
cluster name num: 18
task name num: 60





In [11]:
# Display the set of distinct source dataset names.
# NOTE(review): the bare `len(...)` expression below produces no visible output,
# since a notebook cell only echoes its last expression.
len(instructgraph_taskset)
instructgraph_taskset

{'Agenda',
 'Amazon',
 'CiteSeer',
 'CoRA',
 'ConceptNet',
 'EventNarrative',
 'FB15k-237',
 'GenWiki',
 'GrailQA',
 'GraphCaption-Wikipedia',
 'GraphCaptionGeneration',
 'InstructionKGC',
 'InstructionUIE-ADE_corpus',
 'InstructionUIE-ADE_corpus_sample_15000',
 'InstructionUIE-GIDS',
 'InstructionUIE-NYT11',
 'InstructionUIE-NYT11_sample_30000',
 'InstructionUIE-New-York-Times-RE',
 'InstructionUIE-New-York-Times-RE_sample_30000',
 'InstructionUIE-SciERC',
 'InstructionUIE-SciERC_sample_10000',
 'InstructionUIE-conll04',
 'InstructionUIE-conll04_sample_5000',
 'InstructionUIE-fewrel_0',
 'InstructionUIE-fewrel_1',
 'InstructionUIE-fewrel_2',
 'InstructionUIE-fewrel_3',
 'InstructionUIE-fewrel_4',
 'InstructionUIE-kbp37',
 'InstructionUIE-semval-RE',
 'InstructionUIE-wiki_0',
 'InstructionUIE-wiki_1',
 'InstructionUIE-wiki_2',
 'InstructionUIE-wiki_3',
 'InstructionUIE-wiki_4',
 'LastFM',
 'MoiveLens',
 'NLGraph',
 'NLPReasonong-AQuA',
 'NLPReasonong-ARC-c',
 'NLPReasonong-Coin-Flip',


In [12]:
# Display the capacity -> cluster -> dataset-name hierarchy.
instructgraph_task_info

{'graph-language-modeling': {'graph-question-answering': {'GrailQA',
   'PathQuestion',
   'WC2014',
   'WebQuestions',
   'WikiTableQuestions'},
  'graph-link-prediction': {'ConceptNet', 'FB15k-237', 'Wikidata5M'},
  'graph-node-cls': {'CiteSeer',
   'CoRA',
   'OGBN-ArXiv',
   'OGBN-Products',
   'PubMed'},
  'graph-caption-generation': {'Agenda',
   'EventNarrative',
   'GenWiki',
   'WebNLG',
   'Wikipedia-Wikidata5M',
   'XAlign'},
  'graph-collaboration-filtering': {'Amazon', 'LastFM', 'MoiveLens'},
  'graph-relevance-inspection': {'GraphCaptionGeneration'}},
 'graph-structure-modeling': {'connectivity-detection': {'NLGraph'},
  'cycle-detection': {'NLGraph'},
  'maximum-flow': {'NLGraph'},
  'hamilton-path': {'NLGraph'},
  'job-interest': {'NLGraph'},
  'shortest-path': {'NLGraph'},
  'topological-sort': {'NLGraph'},
  'degree-computing': {'NLGraph'}},
 'graph-construction-modeling': {'structure-graph-generation': {'NLGraph'},
  'knowledge-graph-generation': {'GraphCaption-Wikip

**不同task的样本数量统计**

In [13]:
# Per-split example counters, grouped at three granularities
# (capacity, cluster, and individual task). Each split gets its own dicts.
instructgraph_scale_info = {
    split: {
        "capacity_specific": {},
        "cluster_specific": {},
        "task_specific": {},
    }
    for split in ("train", "test")
}


def data_scale_statistics(data: list, data_kind: str = "train"):
    """Count examples per capacity, cluster and task for one split.

    The counts are accumulated into the module-level
    ``instructgraph_scale_info[data_kind]`` dictionaries; nothing is returned.

    Args:
        data: list of example dicts providing "task_name" and "from".
        data_kind: key of the split to update in ``instructgraph_scale_info``.
    """
    for example in tqdm(data):
        name_parts = example["task_name"].split("-")
        # Pair each counter group with the key this example contributes to it.
        keyed_levels = (
            ("capacity_specific", "-".join(name_parts[:3])),
            ("cluster_specific", "-".join(name_parts[3:6])),
            ("task_specific", example["from"]),
        )
        split_counters = instructgraph_scale_info[data_kind]
        for level, key in keyed_levels:
            bucket = split_counters[level]
            bucket[key] = bucket.get(key, 0) + 1


data_scale_statistics(instruction_train_data, "train")
data_scale_statistics(instruction_test_data, "test")


100%|█████████████████████████████| 1457122/1457122 [00:07<00:00, 192542.37it/s]
100%|██████████████████████████████████| 38822/38822 [00:00<00:00, 87329.74it/s]


### 训练集数据分布情况

**capacity manner**

In [14]:
# Display the training-set example count for each capacity.
instructgraph_scale_info["train"]["capacity_specific"]

{'graph-language-modeling': 1048151,
 'graph-structure-modeling': 16013,
 'graph-construction-modeling': 390595,
 'graph-thought-modeling': 2363}

**cluster manner**

In [18]:
# Display the training-set example count for each cluster.
instructgraph_scale_info["train"]["cluster_specific"]

{'graph-question-answering': 64727,
 'connectivity-detection': 3628,
 'cycle-detection': 2877,
 'maximum-flow': 1335,
 'hamilton-path': 1305,
 'job-interest': 1715,
 'shortest-path': 1580,
 'topological-sort': 1155,
 'degree-computing': 2418,
 'structure-graph-generation': 2990,
 'graph-link-prediction': 68937,
 'factual-knowledge-probing': 2292,
 'graph-node-cls': 40401,
 'graph-caption-generation': 752139,
 'knowledge-graph-generation': 387605,
 'graph-collaboration-filtering': 82385,
 'graph-relevance-inspection': 39562,
 'natural-language-reasoning': 71}

**task manner**

In [19]:
# Display the training-set example count for each source dataset.
instructgraph_scale_info["train"]["task_specific"]

{'WebQuestions': 12351,
 'NLGraph': 19003,
 'WC2014': 5482,
 'FB15k-237': 2528,
 'Probing-GrailQA': 1859,
 'Probing-WebQuestions': 433,
 'PubMed': 9984,
 'Agenda': 36501,
 'OGBN-ArXiv': 8933,
 'InstructionKGC': 30206,
 'CoRA': 551,
 'Amazon': 2385,
 'GraphCaptionGeneration': 39562,
 'EventNarrative': 58597,
 'PathQuestion': 30530,
 'GenWiki': 100000,
 'CiteSeer': 942,
 'XAlign': 30000,
 'WebNLG': 12237,
 'NLPReasonong-AQuA': 1,
 'NLPReasonong-GSM8K': 8,
 'NLPReasonong-SVAMP': 7,
 'NLPReasonong-MultiArith': 10,
 'NLPReasonong-ARC-c': 2,
 'NLPReasonong-CommonsenseQA': 13,
 'NLPReasonong-OpenBookQA': 1,
 'NLPReasonong-Coin-Flip': 10,
 'NLPReasonong-Last-Letters': 19,
 'OGBN-Products': 19991,
 'GrailQA': 13647,
 'LastFM': 40000,
 'MoiveLens': 40000,
 'Wikipedia-Wikidata5M': 514804,
 'InstructionUIE-wiki_4': 5543,
 'InstructionUIE-New-York-Times-RE': 56178,
 'InstructionUIE-New-York-Times-RE_sample_30000': 29987,
 'InstructionUIE-fewrel_3': 3454,
 'InstructionUIE-semval-RE': 6507,
 'Instruc

### 测试集数据分布情况

**capacity manner**

In [20]:
# Display the test-set example count for each capacity.
instructgraph_scale_info["test"]["capacity_specific"]

{'graph-language-modeling': 29501,
 'graph-structure-modeling': 972,
 'graph-construction-modeling': 6156,
 'graph-thought-modeling': 2193}

**cluster manner**

In [21]:
# Display the test-set example count for each cluster.
instructgraph_scale_info["test"]["cluster_specific"]

{'graph-question-answering': 5423,
 'connectivity-detection': 227,
 'cycle-detection': 191,
 'maximum-flow': 55,
 'hamilton-path': 54,
 'job-interest': 69,
 'shortest-path': 64,
 'topological-sort': 85,
 'degree-computing': 227,
 'structure-graph-generation': 397,
 'graph-link-prediction': 3697,
 'factual-knowledge-probing': 194,
 'graph-node-cls': 5826,
 'graph-caption-generation': 8318,
 'knowledge-graph-generation': 5759,
 'graph-collaboration-filtering': 4249,
 'graph-relevance-inspection': 1988,
 'natural-language-reasoning': 1999}

**task manner**

In [22]:
# Display the test-set example count for each source dataset.
instructgraph_scale_info["test"]["task_specific"]

{'WebQuestions': 1364,
 'NLGraph': 1369,
 'WC2014': 1000,
 'FB15k-237': 78,
 'Probing-GrailQA': 158,
 'Probing-WebQuestions': 36,
 'PubMed': 1802,
 'Agenda': 935,
 'OGBN-ArXiv': 349,
 'InstructionKGC': 990,
 'CoRA': 965,
 'Amazon': 249,
 'GraphCaptionGeneration': 1988,
 'EventNarrative': 1945,
 'PathQuestion': 1000,
 'GenWiki': 1000,
 'CiteSeer': 995,
 'XAlign': 470,
 'WebNLG': 2000,
 'NLPReasonong-GSM8K': 267,
 'NLPReasonong-ARC-c': 262,
 'NLPReasonong-CommonsenseQA': 260,
 'NLPReasonong-Coin-Flip': 110,
 'NLPReasonong-StrategyQA': 501,
 'NLPReasonong-SVAMP': 193,
 'NLPReasonong-OpenBookQA': 112,
 'NLPReasonong-AQuA': 62,
 'NLPReasonong-MultiArith': 128,
 'NLPReasonong-Last-Letters': 104,
 'OGBN-Products': 1715,
 'GrailQA': 1404,
 'LastFM': 2000,
 'MoiveLens': 2000,
 'Wikipedia-Wikidata5M': 1968,
 'InstructionUIE-semval-RE': 122,
 'InstructionUIE-wiki_4': 267,
 'InstructionUIE-kbp37': 156,
 'InstructionUIE-conll04_zeroshot': 75,
 'InstructionUIE-fewrel_1': 154,
 'InstructionUIE-ADE_co

## 二、分词与长度检验
检验样本分词后的长度

In [29]:
from transformers.models.llama.tokenization_llama import LlamaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.models.t5.tokenization_t5 import T5Tokenizer

In [70]:
# Load the three tokenizers whose token counts are compared in this section.
# NOTE(review): the `token` value below is a placeholder message, not a usable
# credential -- presumably the Llama-2 repository requires an access token;
# confirm, and never commit a real token to the notebook.
llama2_tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b", token="See Your Huggingface Access Tokens in https://huggingface.co/settings/tokens .")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
flant5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Tokenize one complete cycle-detection prompt with the Llama-2 tokenizer and
# display the resulting token pieces, to see how the graph-language syntax
# (node lists, edge tuples, brackets) is split.
a = llama2_tokenizer.tokenize("You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name=\"cycle-detection\"] {\n    node_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];\n    edge_list = [(1 <-> 8), (3 <-> 13), (11 <-> 13), (2 <-> 3), (2 <-> 14), (1 <-> 6), (14 <-> 4), (16 <-> 6), (7 <-> 14), (12 <-> 16), (0 <-> 4), (5 <-> 12), (10 <-> 11), (9 <-> 5), (8 <-> 14), (15 <-> 3)];\n}\n```\nTask definition: determine if there is a cycle in this graph.\nQ: Is there a cycle in this graph?\nA:")
a


['▁You',
 '▁are',
 '▁a',
 '▁good',
 '▁graph',
 '▁reason',
 'er',
 '.',
 '▁Give',
 '▁you',
 '▁a',
 '▁graph',
 '▁language',
 '▁that',
 '▁describes',
 '▁a',
 '▁graph',
 '▁structure',
 '▁and',
 '▁node',
 '▁information',
 '.',
 '▁You',
 '▁need',
 '▁to',
 '▁understand',
 '▁the',
 '▁graph',
 '▁and',
 '▁the',
 '▁task',
 '▁definition',
 ',',
 '▁and',
 '▁answer',
 '▁the',
 '▁question',
 '.',
 '<0x0A>',
 'Note',
 ':',
 '▁(',
 'i',
 '▁<',
 '->',
 '▁j',
 ')',
 '▁means',
 '▁that',
 '▁node',
 '▁i',
 '▁and',
 '▁node',
 '▁j',
 '▁are',
 '▁connected',
 '▁with',
 '▁an',
 '▁und',
 'irect',
 'ed',
 '▁edge',
 '.',
 '▁(',
 'i',
 '▁->',
 '▁j',
 ')',
 '▁means',
 '▁that',
 '▁node',
 '▁i',
 '▁and',
 '▁node',
 '▁j',
 '▁are',
 '▁connected',
 '▁with',
 '▁a',
 '▁directed',
 '▁edge',
 '.',
 '▁',
 '<0x0A>',
 '```',
 '<0x0A>',
 'Graph',
 '[',
 'name',
 '="',
 'cycle',
 '-',
 'd',
 'ete',
 'ction',
 '"]',
 '▁{',
 '<0x0A>',
 '▁▁▁',
 '▁node',
 '_',
 'list',
 '▁=',
 '▁[',
 '0',
 ',',
 '▁',
 '1',
 ',',
 '▁',
 '2',
 ',',
 '▁'

In [74]:
# Encode a short text that starts with a multi-digit number into token ids,
# with special tokens included, to inspect how digits are encoded.
llama2_tokenizer.encode("4343 You are a good graph reasoner", add_special_tokens=True)

[1, 29871, 29946, 29941, 29946, 29941, 887, 526, 263, 1781, 3983, 2769, 261]

由于样本数量过多，为加快统计速度，对各数据集分别进行均匀采样，再基于采样结果做统计估计。

### 统计不同分词器分词前后token长度

In [19]:
def _empty_length_stats():
    """Build a fresh, zero-initialized length-statistics template."""
    fields = (
        "instruction",
        "answer",
        "answer_with_cot",
        "instruction_plus_answer",
        "graph_language",
    )
    return {
        # Statistics computed over all samples.
        "overall": {
            # Average length of each field over all samples.
            "avg": dict.fromkeys(fields, 0.0),
            # Distribution of samples over length intervals.
            "example_distribution": {},
        },
    }


# Length statistics before tokenization and after tokenization with each of
# the three tokenizers; every entry starts from its own empty template.
instructgraph_token_info = {
    "before_tokenization": _empty_length_stats(),
    "after_tokenization": {
        tokenizer_name: _empty_length_stats()
        for tokenizer_name in ("llama2", "gpt2", "flan-t5")
    },
}

In [20]:
def before_tokenization_statistic(example: dict):
    """Collect length statistics for one example before any tokenizer is applied.

    Lengths are the number of pieces produced by splitting a text on single
    spaces. Only the first entry of ``answer`` and of ``answer_with_cot`` is
    measured, and ``graph_language`` is measured only when it contains the
    marker ``Graph[name=``.

    Args:
        example: dict with "task_name", "instruction", "answer",
            "answer_with_cot" and "graph_language" entries.

    Returns:
        A dict holding the task name plus, for each field, a 0/1 ``*_num``
        presence flag and the matching ``*_token_sum`` length.
    """

    def count_words(text):
        return len(text.split(" "))

    instruction_text = example["instruction"]
    answers = example["answer"]
    cot_answers = example["answer_with_cot"]
    graph_text = example["graph_language"]

    instruction_len = count_words(instruction_text)

    has_answer = len(answers) != 0
    answer_len = count_words(answers[0]) if has_answer else 0

    has_cot = len(cot_answers) != 0
    cot_len = count_words(cot_answers[0]) if has_cot else 0

    has_graph = "Graph[name=" in graph_text
    graph_len = count_words(graph_text) if has_graph else 0

    return {
        "task_name": example["task_name"],
        "instruction_num": 1,
        "instruction_token_sum": instruction_len,
        "answer_num": int(has_answer),
        "answer_token_sum": answer_len,
        "answer_with_cot_num": int(has_cot),
        "answer_with_cot_token_sum": cot_len,
        # The combined length is only recorded when an answer exists.
        "instruction_plus_answer_num": int(has_answer),
        "instruction_plus_answer_token_sum": (instruction_len + answer_len) if has_answer else 0,
        "graph_language_num": int(has_graph),
        "graph_language_token_sum": graph_len,
    }
    
    

In [21]:
def after_tokenization_statistic(example: dict, tokenizer):
    """Collect length statistics for one example after tokenization.

    Lengths are the number of tokens returned by ``tokenizer.tokenize``. Only
    the first entry of ``answer`` and of ``answer_with_cot`` is measured, and
    ``graph_language`` is measured only when it contains the marker
    ``Graph[name=``.

    Args:
        example: dict with "task_name", "instruction", "answer",
            "answer_with_cot" and "graph_language" entries.
        tokenizer: object exposing ``tokenize(text)`` that returns a sized
            sequence of tokens.

    Returns:
        A dict holding the task name plus, for each field, a 0/1 ``*_num``
        presence flag and the matching ``*_token_sum`` token count.
    """

    def count_tokens(text):
        return len(tokenizer.tokenize(text))

    instruction_text = example["instruction"]
    answers = example["answer"]
    cot_answers = example["answer_with_cot"]
    graph_text = example["graph_language"]

    instruction_len = count_tokens(instruction_text)

    has_answer = len(answers) != 0
    answer_len = count_tokens(answers[0]) if has_answer else 0

    has_cot = len(cot_answers) != 0
    cot_len = count_tokens(cot_answers[0]) if has_cot else 0

    has_graph = "Graph[name=" in graph_text
    graph_len = count_tokens(graph_text) if has_graph else 0

    return {
        "task_name": example["task_name"],
        "instruction_num": 1,
        "instruction_token_sum": instruction_len,
        "answer_num": int(has_answer),
        "answer_token_sum": answer_len,
        "answer_with_cot_num": int(has_cot),
        "answer_with_cot_token_sum": cot_len,
        # The combined length is only recorded when an answer exists.
        "instruction_plus_answer_num": int(has_answer),
        "instruction_plus_answer_token_sum": (instruction_len + answer_len) if has_answer else 0,
        "graph_language_num": int(has_graph),
        "graph_language_token_sum": graph_len,
    }

In [104]:
def analysis_token_length(data: list, window_size=200):
    """Compute per-example length statistics before and after tokenization.

    Each example is measured four times: once with whitespace splitting and
    once with each of the module-level Llama-2, GPT-2 and Flan-T5 tokenizers.

    Args:
        data: list of example dicts accepted by
            ``before_tokenization_statistic`` / ``after_tokenization_statistic``.
        window_size: currently unused; kept so existing callers keep working.

    Returns:
        A dict with four lists of per-example statistics, aligned with
        ``data``, under the keys "before_tokenization_token_info",
        "llama2_tokenization_token_info", "gpt2_tokenization_token_info" and
        "flant5_tokenization_token_info".
    """
    before_tokenization_token_info = []
    llama2_tokenization_token_info = []
    gpt2_tokenization_token_info = []
    flant5_tokenization_token_info = []

    # Gather the statistics of every example.
    for example in tqdm(data):
        # Lengths before tokenization (split on spaces).
        before_tokenization_token_info.append(before_tokenization_statistic(example))

        # Lengths after Llama-2 tokenization.
        llama2_tokenization_token_info.append(after_tokenization_statistic(example, llama2_tokenizer))

        # Lengths after GPT-2 tokenization.
        gpt2_tokenization_token_info.append(after_tokenization_statistic(example, gpt2_tokenizer))

        # Lengths after Flan-T5 tokenization.
        flant5_tokenization_token_info.append(after_tokenization_statistic(example, flant5_tokenizer))

    return {
        "before_tokenization_token_info": before_tokenization_token_info,
        "llama2_tokenization_token_info": llama2_tokenization_token_info,
        "gpt2_tokenization_token_info": gpt2_tokenization_token_info,
        "flant5_tokenization_token_info": flant5_tokenization_token_info,
    }


meta_train_token_info = analysis_token_length(instruction_train_data)
meta_test_token_info = analysis_token_length(instruction_test_data)

100%|██████████████████████████████| 1259710/1259710 [2:20:59<00:00, 148.91it/s]
100%|████████████████████████████████████| 32668/32668 [02:20<00:00, 232.18it/s]


100%|██████████████████████████████████████| 4376/4376 [00:30<00:00, 141.58it/s]


In [90]:
# Spot-check one pre-tokenization statistics record from each split.
print(meta_train_token_info["before_tokenization_token_info"][23343])
print(meta_test_token_info["before_tokenization_token_info"][243])

{'task_name': 'graph-language-modeling-graph-caption-generation-genwiki', 'instruction_num': 1, 'instruction_token_sum': 112, 'answer_num': 1, 'answer_token_sum': 32, 'answer_with_cot_num': 0, 'answer_with_cot_token_sum': 0, 'instruction_plus_answer_num': 1, 'instruction_plus_answer_token_sum': 144, 'graph_language_num': 1, 'graph_language_token_sum': 29}
{'task_name': 'graph-language-modeling-graph-question-answering-wc2014', 'instruction_num': 1, 'instruction_token_sum': 229, 'answer_num': 1, 'answer_token_sum': 1, 'answer_with_cot_num': 1, 'answer_with_cot_token_sum': 3, 'instruction_plus_answer_num': 1, 'instruction_plus_answer_token_sum': 230, 'graph_language_num': 1, 'graph_language_token_sum': 134}


**统计所有样本的平均长度**

In [105]:
def overall_statistics(meta_token_info, max_length=2000):
    """Aggregate per-sample token statistics into dataset-level summaries.

    Args:
        meta_token_info: dict with the keys ``"before_tokenization_token_info"``,
            ``"llama2_tokenization_token_info"``, ``"gpt2_tokenization_token_info"``
            and ``"flant5_tokenization_token_info"``. Each value is a list of
            per-sample dicts providing ``"<field>_num"`` and
            ``"<field>_token_sum"`` for the fields ``instruction``, ``answer``,
            ``answer_with_cot``, ``instruction_plus_answer`` and
            ``graph_language``.
        max_length: samples whose ``instruction_plus_answer_token_sum`` is
            strictly greater than this value are counted in
            ``over_length_num``. Defaults to 2000.

    Returns:
        dict mapping ``"<kind>_tokenization"`` to a summary dict containing the
        five ``avg_<field>_token_sum`` values (rounded to 4 decimals), the
        largest ``instruction_plus_answer_token_sum`` seen
        (``max_instruction_plus_answer_token_sum``), the index of the first
        sample reaching it (``max_i``) and ``over_length_num``.
        An average is reported as ``0.0`` when its count is zero, e.g. when no
        sample in the split carries a CoT answer, instead of raising
        ``ZeroDivisionError``.
    """
    fields = (
        "instruction",
        "answer",
        "answer_with_cot",
        "instruction_plus_answer",
        "graph_language",
    )
    # Build the per-sample key names once instead of once per sample.
    field_keys = [
        (field, "{}_num".format(field), "{}_token_sum".format(field))
        for field in fields
    ]

    overall_token_info = dict()

    for kind in ["before", "llama2", "gpt2", "flant5"]:
        counts = dict.fromkeys(fields, 0)
        token_sums = dict.fromkeys(fields, 0)

        max_instruction_plus_answer_token_sum = 0
        max_i = 0

        # Number of samples longer than `max_length` tokens.
        over_length_num = 0

        for ei, info in enumerate(tqdm(meta_token_info["{}_tokenization_token_info".format(kind)])):
            for field, num_key, sum_key in field_keys:
                counts[field] += info[num_key]
                token_sums[field] += info[sum_key]

            total_length = info["instruction_plus_answer_token_sum"]

            # Strict comparison keeps the first sample that reaches the maximum.
            if total_length > max_instruction_plus_answer_token_sum:
                max_instruction_plus_answer_token_sum = total_length
                max_i = ei

            if total_length > max_length:
                over_length_num += 1

        summary = dict()
        for field in fields:
            # Guard against an empty split or a field that never occurs.
            if counts[field]:
                average = round(token_sums[field] / counts[field], 4)
            else:
                average = 0.0
            summary["avg_{}_token_sum".format(field)] = average

        summary["max_instruction_plus_answer_token_sum"] = max_instruction_plus_answer_token_sum
        summary["max_i"] = max_i
        summary["over_length_num"] = over_length_num

        overall_token_info["{}_tokenization".format(kind)] = summary
    return overall_token_info

# Aggregate the per-sample statistics into dataset-level summaries for the train and test splits.
train_overall_token_info, test_overall_token_info = overall_statistics(meta_train_token_info), overall_statistics(meta_test_token_info)


100%|█████████████████████████████| 1259710/1259710 [00:12<00:00, 103027.52it/s]
100%|█████████████████████████████| 1259710/1259710 [00:02<00:00, 446228.93it/s]
100%|█████████████████████████████| 1259710/1259710 [00:02<00:00, 462371.03it/s]
100%|█████████████████████████████| 1259710/1259710 [00:02<00:00, 475132.74it/s]
100%|█████████████████████████████████| 32668/32668 [00:00<00:00, 194166.93it/s]
100%|████████████████████████████████| 32668/32668 [00:00<00:00, 1063032.10it/s]
100%|████████████████████████████████| 32668/32668 [00:00<00:00, 1103314.49it/s]
100%|████████████████████████████████| 32668/32668 [00:00<00:00, 1119522.87it/s]


In [106]:
train_overall_token_info

{'before_tokenization': {'avg_instruction_token_sum': 194.9669,
  'avg_answer_token_sum': 36.5948,
  'avg_answer_with_cot_token_sum': 5.8681,
  'avg_instruction_plus_answer_token_sum': 231.5617,
  'avg_graph_language_token_sum': 74.7743,
  'max_instruction_plus_answer_token_sum': 401,
  'max_i': 508,
  'over_length_num': 0},
 'llama2_tokenization': {'avg_instruction_token_sum': 413.3385,
  'avg_answer_token_sum': 72.3072,
  'avg_answer_with_cot_token_sum': 35.2151,
  'avg_instruction_plus_answer_token_sum': 485.6457,
  'avg_graph_language_token_sum': 273.6371,
  'max_instruction_plus_answer_token_sum': 2543,
  'max_i': 27206,
  'over_length_num': 819},
 'gpt2_tokenization': {'avg_instruction_token_sum': 394.5139,
  'avg_answer_token_sum': 65.8861,
  'avg_answer_with_cot_token_sum': 32.9232,
  'avg_instruction_plus_answer_token_sum': 460.3999,
  'avg_graph_language_token_sum': 259.5475,
  'max_instruction_plus_answer_token_sum': 2085,
  'max_i': 104420,
  'over_length_num': 3},
 'flant5

In [107]:
test_overall_token_info

{'before_tokenization': {'avg_instruction_token_sum': 201.0852,
  'avg_answer_token_sum': 21.096,
  'avg_answer_with_cot_token_sum': 11.3186,
  'avg_instruction_plus_answer_token_sum': 222.1812,
  'avg_graph_language_token_sum': 93.7141,
  'max_instruction_plus_answer_token_sum': 401,
  'max_i': 811,
  'over_length_num': 0},
 'llama2_tokenization': {'avg_instruction_token_sum': 521.4099,
  'avg_answer_token_sum': 47.6235,
  'avg_answer_with_cot_token_sum': 43.5548,
  'avg_instruction_plus_answer_token_sum': 569.0333,
  'avg_graph_language_token_sum': 421.2474,
  'max_instruction_plus_answer_token_sum': 2521,
  'max_i': 8184,
  'over_length_num': 79},
 'gpt2_tokenization': {'avg_instruction_token_sum': 479.7712,
  'avg_answer_token_sum': 44.5938,
  'avg_answer_with_cot_token_sum': 38.4516,
  'avg_instruction_plus_answer_token_sum': 524.3651,
  'avg_graph_language_token_sum': 375.2111,
  'max_instruction_plus_answer_token_sum': 1940,
  'max_i': 3712,
  'over_length_num': 0},
 'flant5_tok

In [110]:
instruction_train_data[504830]

{'task_name': 'graph-language-modeling-graph-caption-generation-wikipedia',
 'idx': 131215,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="wikipedia-knowledge-graph"] {\n    entity_list = [\'track cyclist\', \'team sprint\', \'miroslav minchev\', \'time trial\', \'competed\', \'2016 uec european track championships\', \'17 january\', \'international\', \'bulgaria\'];\n    triple_list = [("team sprint" -> "track cyclist")[relation="sport"], ("time trial" -> "track cyclist")[relation="sport"], ("2016 uec european track championships" -> "track cyclist")[relation="sport"]];\n}\n```\nTask definition: given a knowledge graph with all entities and structu

## 三、从训练集中进行采样，得到的样本用于调用ChatGPT获得CoT
采样的任务：
- graph structure modeling
- graph language modeling
条件：只采样数据集本身没有提供cot的
采样样本数量：50,000。

In [15]:
def sample_for_cot_generation(
    data: list,
    N: int = 20000,
    max_instruction_words: int = 350,
    structure_rate: float = 0.7,
    language_rate: float = 0.04,
):
    """Randomly pick training examples to send out for CoT generation.

    The data is shuffled and scanned once. Each eligible example is kept with
    a task-dependent probability until ``N`` examples have been collected.

    NOTE: ``data`` is shuffled in place, so the caller's list is reordered.
    NOTE(review): the notebook text says only examples without a
    dataset-provided CoT should be sampled, but no such filter is applied
    here (the ``answer_with_cot`` lookup is disabled) - confirm this is intended.

    Args:
        data: list of example dicts with at least ``"task_name"`` and
            ``"instruction"`` keys.
        N: maximum number of examples to return.
        max_instruction_words: examples whose instruction has more than this
            many space-separated pieces are skipped.
        structure_rate: keep probability for tasks whose name contains
            ``"graph-structure-modeling"``.
        language_rate: keep probability for tasks whose name contains
            ``"graph-language-modeling"``.

    Returns:
        list with at most ``N`` sampled examples (the same dict objects as in
        ``data``).
    """
    shuffle(data)
    for_cot_data = list()
    # Number of examples before sampling.
    print("before sampling: {}".format(len(data)))
    for example in tqdm(data):
        if len(for_cot_data) >= N:
            break
        task_name = example["task_name"]
        instruction = example["instruction"]
        if len(instruction.split(" ")) > max_instruction_words:
            continue
        if "graph-caption-generation" in task_name:
            # Caption generation needs no reasoning, so no CoT is requested.
            continue
        if "graph-structure-modeling" in task_name:
            keep_rate = structure_rate
        elif "graph-language-modeling" in task_name:
            keep_rate = language_rate
        else:
            # Other task families are never sampled (and draw no random number).
            continue
        if random.random() < keep_rate:
            for_cot_data.append(example)
    print("after sampling: {}".format(len(for_cot_data)))
    return for_cot_data
                
# Draw the CoT-generation sample with the default size (N=20000).
# Note: the function shuffles instruction_train_data in place.
instruction_train_data_forcot = sample_for_cot_generation(instruction_train_data)

before sampling: 1457122


 92%|███████████████████████████▍  | 1335036/1457122 [01:17<00:07, 17163.89it/s]

after sampling: 20000





In [16]:
instruction_train_data_forcot[14804]

{'task_name': 'graph-language-modeling-graph-link-prediction-wikidata5m',
 'idx': 51057,
 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="wikidata-knowledge-graph"] {\n    entity_list = [\'Trisector\', \'concert album\', \'1990s prog rock\', \'Cerniw\', \'audio album\', \'Van der Graaf generator (band)\', \'virgin records\', \'Present (Van der Graaf Generator album)\', \'a grounding in numbers\', \'Real Time (Van der Graaf Generator album)\'];\n    triple_list = [("Real Time (Van der Graaf Generator album)" -> "Present (Van der Graaf Generator album)")[relation="follows"], ("Real Time (Van der Graaf Generator album)" -> "Van der Graaf generator (band)

In [17]:
# Save the examples sampled for CoT generation.
np.save("instruction_dataset/instruction_train_data_forcot.npy", instruction_train_data_forcot)

In [24]:
# # Save the instruction training datasets (disabled)
# np.save("instruction_dataset/released/instruction_train_data.npy", instruction_train_data)
# np.save("instruction_dataset/released/instruction_train_data_small.npy", instruction_train_data_small)


In [19]:

class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Handles NumPy booleans, integers, floating-point scalars and arrays.
    Anything else is passed to the base class, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: a value the standard encoder could not serialize.

        Returns:
            ``bool``, ``int``, ``float`` or (nested) ``list`` for the
            supported NumPy types.

        Raises:
            TypeError: if ``obj`` is not one of the supported NumPy types.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)



In [20]:
# Export the full training set as JSON Lines: one serialized example per line.
with open("instruction_dataset/released/instructgraph_train_data.json", "w", encoding="utf-8") as fw:
    for example in tqdm(instruction_train_data):
        # NpEncoder converts NumPy scalars/arrays that plain json cannot serialize.
        print(json.dumps(example, cls=NpEncoder), file=fw)

100%|███████████████████████████████| 1457122/1457122 [09:48<00:00, 2473.95it/s]


In [20]:
(
    "111"
    "222"
)

'111222'

In [21]:
# Export the full test set as JSON Lines: one serialized example per line.
with open("instruction_dataset/released/instructgraph_test_data.json", "w", encoding="utf-8") as fw:
    for example in tqdm(instruction_test_data):
        # NpEncoder converts NumPy scalars/arrays that plain json cannot serialize.
        print(json.dumps(example, cls=NpEncoder), file=fw)

100%|███████████████████████████████████| 38822/38822 [00:06<00:00, 5902.79it/s]


In [22]:
# Export the small-scale training set as JSON Lines: one serialized example per line.
with open("instruction_dataset/released/instructgraph_train_data_small.json", "w", encoding="utf-8") as fw:
    for example in tqdm(instruction_train_data_small):
        # NpEncoder converts NumPy scalars/arrays that plain json cannot serialize.
        print(json.dumps(example, cls=NpEncoder), file=fw)

100%|█████████████████████████████████| 145671/145671 [01:06<00:00, 2183.27it/s]


In [23]:
# Export the small-scale test set as JSON Lines: one serialized example per line.
with open("instruction_dataset/released/instructgraph_test_data_small.json", "w", encoding="utf-8") as fw:
    for example in tqdm(instruction_test_data_small):
        # NpEncoder converts NumPy scalars/arrays that plain json cannot serialize.
        print(json.dumps(example, cls=NpEncoder), file=fw)

100%|█████████████████████████████████████| 3892/3892 [00:00<00:00, 4915.72it/s]


In [24]:
# Draw up to 2000 random examples for the "mini" training set.
# random.sample returns a new list, so instruction_train_data_small keeps its
# order (aliasing it and calling shuffle() would reorder the small set too).
instruction_train_data_mini = random.sample(
    instruction_train_data_small,
    min(2000, len(instruction_train_data_small)),
)

In [26]:
# Export the mini training set as JSON Lines: one serialized example per line.
with open("instruction_dataset/released/instructgraph_train_data_mini.json", "w", encoding="utf-8") as fw:
    for example in tqdm(instruction_train_data_mini):
        # NpEncoder converts NumPy scalars/arrays that plain json cannot serialize.
        print(json.dumps(example, cls=NpEncoder), file=fw)

100%|█████████████████████████████████████| 2000/2000 [00:00<00:00, 2146.50it/s]


In [32]:
print(instruction_test_data[15700]["instruction"])
print(instruction_test_data[15700]["answer"])

You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.
Note: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. 
```
Graph[name="citeseer-scientific-publications-citation-graph"] {
    publication_node_list = ["paper_3008", "paper_3009"];
    publication_node_feature = ["paper_3009".category="IR"];
    target_publication_node = "paper_3008";
    citation_triple_list = [("paper_3008" -> "paper_3009")];
}
```
Task definition: given a target scientific publication and corresponding citation graph, classify the target scientific publication into one of six categories, such as 'AI', 'Agents', 'DB', 'HCI', 'IR', and 'ML'.
Q: Please classify the target scientific publication.
A:
['IR']


## 四、对测试集进行处理
转换为task dict形式

In [25]:
def test_dict(data):
    """Group examples by task name.

    Args:
        data: iterable of example dicts, each with a ``"task_name"`` key.

    Returns:
        dict mapping each task name to the list of its examples, in the order
        they appear in ``data``.
    """
    grouped = {}
    for sample in tqdm(data):
        # setdefault creates the bucket the first time a task name is seen.
        grouped.setdefault(sample["task_name"], []).append(sample)
    return grouped

# Convert the full and small test sets into {task_name: [examples]} form.
instruction_test_data_dict, instruction_test_data_dict_small = test_dict(instruction_test_data), test_dict(instruction_test_data_small)


100%|██████████████████████████████████| 32645/32645 [00:00<00:00, 32662.27it/s]
100%|██████████████████████████████████| 3276/3276 [00:00<00:00, 1147148.10it/s]


In [26]:
# Save the instruction test datasets (task-name -> examples dicts).
# The values are Python dicts, so loading them back needs allow_pickle=True.
np.save("instruction_dataset/released/instruction_test_data.npy", instruction_test_data_dict)
np.save("instruction_dataset/released/instruction_test_data_small.npy", instruction_test_data_dict_small)


In [144]:
50000/1250000

0.04

In [44]:
## 1 year
# meituan
7 + 3.53*15.5 # 61.715

# ant
20 + 3.36*16 # 73.76

# tencent
6 + 2.8*17 + 4.8 + 30 # 88.40

## 3 years

#meituan:
7 + 3.53*15.5*3 # 171.145w + 70w = 241

#ant 
20 + 3.36*16*3 # 181.28 + 10w = 190

# tencent
# 6 + 2.8*17*3 + 4.8*3 + 90 = 253.2
# (kept as a comment: "<expression> = 253.2" is not valid Python and made the whole cell a SyntaxError)





181.28

In [116]:
6 + 2.8*16 + 4.8 + 30 # 85.6

85.6

In [131]:
2393.16*2*6

28717.92

In [107]:
34398.47 + 1709

36107.47

In [108]:
# Load previously generated CoT data if the file exists; otherwise keep an empty list.
instruction_train_data_with_cot = list()
if os.path.exists("instruction_dataset/released/instruction_train_data_with_cot.npy"):
    # The file stores Python dict objects, hence allow_pickle=True (only load trusted files).
    # After loading, the variable is a NumPy object array rather than a list.
    instruction_train_data_with_cot = np.load("instruction_dataset/released/instruction_train_data_with_cot.npy", allow_pickle=True)


In [109]:
len(instruction_train_data_with_cot)

225

In [110]:
instruction_train_data_with_cot[150:]

array([{'task_name': 'graph-structure-modeling-graph-connectivity-detection-nlgraph', 'idx': 1056, 'instruction': 'You are a good graph reasoner. Give you a graph language that describes a graph structure and node information. You need to understand the graph and the task definition, and answer the question.\nNote: (i <-> j) means that node i and node j are connected with an undirected edge. (i -> j) means that node i and node j are connected with a directed edge. \n```\nGraph[name="connectivity-detection"] {\n    node_list = [1, 3, 5];\n    edge_list = [(1 <-> 5), (3 <-> 5)];\n}\n```\nTask definition: determine if there is a path between two nodes in the graph.\nQ: Is there a path between node 0 and node 1? Let\'s think step by step. \nA:', 'graph_language': '```\nGraph[name="connectivity-detection"] {\n    node_list = [1, 3, 5];\n    edge_list = [(1 <-> 5), (3 <-> 5)];\n}\n```', 'graph': {'node_list': [1, 3, 5], 'edge_list': [('1', '5'), ('3', '5')]}, 'answer': ['The answer is no.'],

In [59]:
3238/2/2/23238/5

1214.25

In [49]:
(1252.20+1753.08)*2

6010.5599999999995

In [39]:
36549*0.07*2

5116.860000000001

In [37]:
36549*0.07 - 2393.16

165.27000000000044

In [68]:
125965/8/2/2/2

1968.203125

In [78]:
1259650/16/8*3*4.1/3600

33.62347005208333

In [80]:
1259650/8/4*4.1/3600

44.831293402777774

In [79]:
554+273+75448+13832+82121+87038

259266

In [28]:
# Experiment: cross-entropy loss when every label is -100.
# -100 is CrossEntropyLoss's default ignore_index, so all three targets are ignored
# and the default mean reduction averages over zero elements, which prints nan.
import torch
from torch.nn import CrossEntropyLoss
shift_logits = torch.randn([3,5])  # 3 positions, 5 classes
shift_labels = torch.tensor([-100,-100,-100],dtype=torch.int64)
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits, shift_labels)
print(loss)

tensor(nan)


In [38]:
# Count how many entries differ from -100 (0 for this all -100 tensor).
a = torch.Tensor([-100, -100, -100])
torch.sum((a!=-100).long()).tolist()

0

In [29]:
a = """2323\n```\nGraph[name="knowledge-graph"] {\n    entity_list = ["1956 NCAA Track and Field Championships", "Berkeley, California", "UCLA", "University of Southern California", "Bobby Morrow", "1956 Summer Olympics", "Rafer Johnson", "1960 Summer Olympics", "Arnie Sowell", "University of Pittsburgh", "June 1956", "June 1956", "1960"];\n    triple_list = [("1960 Summer Olympics" -> "1960")[relation="occurrence time"], ("1956 NCAA Track and Field Championships" -> "Berkeley, California")[relation="scene"], ("1956 Summer Olympics" -> "Arnie Sowell")[relation="participant"], ("1956 NCAA Track and Field Championships" -> "June 1956")[relation="occurrence time"], ("1956 Summer Olympics" -> "June 1956")[relation="occurrence time"]];\n}\n```\newdedwe"""

In [30]:
print(a)

2323
```
Graph[name="knowledge-graph"] {
    entity_list = ["1956 NCAA Track and Field Championships", "Berkeley, California", "UCLA", "University of Southern California", "Bobby Morrow", "1956 Summer Olympics", "Rafer Johnson", "1960 Summer Olympics", "Arnie Sowell", "University of Pittsburgh", "June 1956", "June 1956", "1960"];
    triple_list = [("1960 Summer Olympics" -> "1960")[relation="occurrence time"], ("1956 NCAA Track and Field Championships" -> "Berkeley, California")[relation="scene"], ("1956 Summer Olympics" -> "Arnie Sowell")[relation="participant"], ("1956 NCAA Track and Field Championships" -> "June 1956")[relation="occurrence time"], ("1956 Summer Olympics" -> "June 1956")[relation="occurrence time"]];
}
```
ewdedwe


In [35]:
# Locate the first fenced block in `a`, i.e. the span between the first two
# lines that consist of exactly "```".
lines = a.split("\n")
status = 0  # 0: opening fence not seen yet, 1: inside the block, 2: closing fence found
start, end = -1, -1  # line indices of the opening / closing fence (-1 if not found)
for ei, line in enumerate(lines):
    if line == "```":
        if status == 0:
            status = 1
            start = ei
        elif status == 1:
            status = 2
            end = ei
        # Once status is 2, any further fence lines are ignored.
print(start)
print(end)
gcl = lines[start: end + 1]  # the fenced block, including both fence lines

1
6


In [65]:

# Parse the `triple_list = [...]` line of the fenced graph block into
# (head, relation, tail) tuples.
# NOTE: entity_list is initialised here but never filled in this cell.
entity_list, triple_list = list(), list()
# gcl[3] is the triple_list line: [4:] drops the 4-space indent, the replace
# removes the "triple_list = " prefix and [:-1] drops the trailing ";".
triple_strs = gcl[3][4:].replace("triple_list = ", "")[:-1]
# "], (" separates consecutive triples; ")[relation=" separates the
# "head -> tail" part from the relation part of each triple.
triple_strs = [i.split(")[relation=") for i in triple_strs.split("], (")]
# print(triple_strs)
for triple_str in triple_strs:
    entity_str, relation = triple_str
    head, tail = entity_str.split(" -> ")
    # Strip the leftovers of the outer list syntax on the first / last triple.
    head = head.replace("[(", "")
    relation = relation.replace("]]", "")
    # Remove one surrounding double quote on each side, if present.
    if head[0] == "\"":
        head = head[1:]
    if head[-1] == "\"":
        head = head[:-1]
    if tail[0] == "\"":
        tail = tail[1:]
    if tail[-1] == "\"":
        tail = tail[:-1]
    if relation[0] == "\"":
        relation = relation[1:]
    if relation[-1] == "\"":
        relation = relation[:-1]
    triple_list.append((head, relation, tail))
print(triple_list)

[('1960 Summer Olympics', 'occurrence time', '1960'), ('1956 NCAA Track and Field Championships', 'scene', 'Berkeley, California'), ('1956 Summer Olympics', 'participant', 'Arnie Sowell'), ('1956 NCAA Track and Field Championships', 'occurrence time', 'June 1956'), ('1956 Summer Olympics', 'occurrence time', 'June 1956')]


In [73]:
# Experiment: split an `edge_list = [...]` line into its individual edges.
triple_str = "edge_list = [(\"0\" -> \"1\"),(\"0\" -> \"2\")];"
# Drop the "edge_list = " prefix and the trailing ";".
triple_str = triple_str.replace("edge_list = ", "")[:-1]
print(triple_str)
# Splitting on "),(" leaves the leading "[(" on the first piece and the
# trailing ")]" on the last piece; they still have to be stripped.
triple_str.split("),(")

[("0" -> "1"),("0" -> "2")]


['[("0" -> "1"', '"0" -> "2")]']