# Dependencies and Setup

In [None]:
# Mount Google Drive inside the Colab VM so the dataset files under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on Drive the working directory; files written with
# relative paths later in the notebook (e.g. knowledge_graph.json) land here.
%cd '/content/drive/MyDrive/DATA410'

/content/drive/MyDrive/DATA410


In [None]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
import spacy
import numpy as np
import pandas as pd
import os

In [None]:
import jsonlines


def _read_questions(path):
    """Return the "Question" field of every record in the JSON Lines file at `path`."""
    with jsonlines.open(path) as reader:
        return [obj["Question"] for obj in reader]


# NOTE: the name `eval` shadows the Python builtin for the rest of the notebook.
# It is kept unchanged here because other cells may already refer to it.
train = _read_questions('/content/drive/MyDrive/DATA410/MMLU_train.jsonl')
eval = _read_questions('/content/drive/MyDrive/DATA410/MMLU_eval.jsonl')

# The column is named 'answer' even though it holds the question text;
# create_knowledge_graph reads dataframe['answer'], so the name is preserved.
# ignore_index=True gives the combined frame a single 0..n-1 index instead of
# repeating the labels 0, 1, 2, ... for the eval rows.
df = pd.concat(
    [
        pd.DataFrame(train, columns=['answer']),
        pd.DataFrame(eval, columns=['answer']),
    ],
    ignore_index=True,
)

df.head()

Unnamed: 0,answer
0,Consider a segment of length 10. Points A and ...
1,A teacher believes that giving her students a ...
2,Suppose it takes 1 second to factor a general ...
3,Statement 1| For a continuous random variable ...
4,The disadvantage of Grid search is


# Build a Knowledge Graph

In [None]:
# pandas and spacy were already imported in the first import cell; importing
# them again here is harmless.
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import spacy

# Load the pretrained BERT checkpoint and its matching tokenizer by name.
# NOTE(review): `model` and `tokenizer` are not referenced by any cell visible
# in this part of the notebook - confirm they are needed before keeping the
# download.
model_name = "bert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# spaCy pipeline that create_knowledge_graph calls to obtain entities and
# dependency information for each sentence.
nlp = spacy.load("en_core_web_sm")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def create_knowledge_graph(dataframe, column="answer", nlp_model=None):
    """Extract (entity, dependency relation, dependent token) triples from text.

    Every non-missing entry of ``dataframe[column]`` is parsed with a spaCy
    pipeline. For each named entity found, one record is emitted per token
    whose syntactic head is the entity's root token.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the sentences to analyse.
    column : str, default "answer"
        Name of the text column to read.
    nlp_model : callable pipeline, optional
        Object exposing ``pipe(texts)`` that yields parsed documents. When
        omitted, the module-level ``nlp`` pipeline is used.

    Returns
    -------
    list of dict
        Records of the form
        ``{"entity": <entity text>, "relation": <token.dep_>, "value": <token text>}``
        in document order, then entity order, then token order.

    Notes
    -----
    Tokens are grouped by ``token.head``. If a token is its own head (spaCy
    does this for the root of a sentence) and is also the root of an entity,
    it is reported as a record for that entity as well, exactly as a direct
    ``token.head == ent.root`` scan would do.
    """
    pipeline = nlp if nlp_model is None else nlp_model

    # Missing values cannot be parsed; drop them instead of failing mid-run.
    texts = dataframe[column].dropna().astype(str).tolist()

    knowledge_graph = []
    # pipe() lets the pipeline process the texts as a stream/batches rather
    # than paying the per-call overhead for every sentence.
    for doc in pipeline.pipe(texts):
        if not doc.ents:
            continue

        # Index the tokens by the position of their head once per document so
        # each entity is a dictionary lookup instead of a full token re-scan.
        dependents_by_head = {}
        for token in doc:
            dependents_by_head.setdefault(token.head.i, []).append(token)

        for ent in doc.ents:
            for token in dependents_by_head.get(ent.root.i, ()):
                knowledge_graph.append(
                    {"entity": ent.text, "relation": token.dep_, "value": token.text}
                )

    return knowledge_graph

# Build the relation records for every question in `df` (train + eval); the
# result is reused by the JSON export and the entity-extraction cells below.
knowledge_graph = create_knowledge_graph(df)

In [None]:
import json

# Persist the relation records as indented JSON in the working directory.
# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
with open('knowledge_graph.json', 'w', encoding='utf-8') as out_handle:
    out_handle.write(json.dumps(knowledge_graph, ensure_ascii=False, indent=4))

# Entity Extraction

In [None]:
# Distinct strings appearing in each field of the relation records.
# Each set reads its own key: previously all three read 'entity', so the
# relation labels and dependent-token values never reached `whole_set`.
entity_set = {entry['entity'] for entry in knowledge_graph}
relation_set = {entry['relation'] for entry in knowledge_graph}
value_set = {entry['value'] for entry in knowledge_graph}

# Every distinct string that occurs anywhere in the graph.
whole_set = entity_set | relation_set | value_set

In [None]:
file_path = 'knowledge_graph_values_MMLU.txt'

# Write one string per line. The encoding is set explicitly (matching the JSON
# export) so non-ASCII text does not depend on the platform default, and the
# set is sorted so the file is identical from run to run.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{value}\n" for value in sorted(whole_set))

file_path

'knowledge_graph_values_MMLU.txt'