In [1]:
from neo4j import GraphDatabase
from langchain.graphs import Neo4jGraph
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

load_dotenv(dotenv_path='secrets.env')

url=os.environ["NEO4J_URI"]
username=os.environ["NEO4J_USERNAME"]
password=os.environ["NEO4J_PASSWORD"]


In [2]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [3]:
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [4]:
# Delete the graph
graph.query("MATCH (n) DETACH DELETE n")

[]

In [5]:
driver = GraphDatabase.driver(url, auth=(username, password))

In [6]:
def add_node(tx, name, type):
    tx.run(f"CREATE (n:{type} {{name: $name}})", name=name)

def add_edge(tx, source, target, type):
    tx.run(f"MATCH (s), (t) WHERE s.name = $source AND t.name = $target CREATE (s)-[r:{type}]->(t)", source=source, target=target)


In [53]:
# with driver.session() as session:
#     session.execute_write(add_node, "The Pentagon", "Building")
#     session.execute_write(add_edge, "Alice", "The Pentagon", "LIVES_IN")


In [7]:
node_functions = [
    {
        # could add Properties to the node
        "name": 'extract_node',
        "description": "Extract a node entity from a document",
        "parameters": {
            "type": "object",
            "properties": {
                "name": {
                    "type": "string",
                    "description": "A short name for the node entity."
                },
                "type": {
                    "type": "string",
                    "description": "The type of the node entity."
                }
            },
            "required": ["name", "type"]
        }
    }
]

In [8]:
relationship_functions = [
    {
        "name": "extract_relationship",
        "description": "Extract a relationship between two nodesfrom a document",
        "parameters": {
            "type": "object",
            "properties": {
                "source": {
                    "type": "string",
                    "description": "The name of the source node."
                },
                "target": {
                    "type": "string",
                    "description": "The name of the target node."
                },
                "type": {
                    "type": "string",
                    "description": "The type of the relationship."
                }
            },
            "required": ["source", "target", "type"]
        }
    }
]

In [15]:
test_string = "Alice lives in the Pentagon. Bob lives in the White House. Joe is Alice's brother. Bob enjoys reading French History books with Joe."

In [26]:
''' PARAMETERS '''

loop_parameter = 10

In [73]:
# function to identify nodes in a block of text

def get_nodes_from_text(text, loop_parameter=loop_parameter):
    nodes = []
    for i in range(loop_parameter):
        decide_continue_response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {'role': 'system', 'content': 'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to answer a yes or no question: whether you have found all nodes (entities and concepts) in the provided text.'},
                {'role': 'user', 'content': 
                    'The block of text is:\n\n' 
                    + text 
                    + '\n\nYou have already identified these nodes:\n\n'
                    + ','.join(str(x) for x in nodes) 
                    + '\n\nIf you think you have identified every node, please respond with the word "yes". Else, please respond with the word "no". Do not respond with anything else. You must respond with either yes or no.'}
                ]
        )
        if (decide_continue_response.choices[0].message.content.lower() == 'yes'):
            break
        elif (decide_continue_response.choices[0].message.content.lower() == 'no'):
            node_response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{'role': 'user', 'content': 
                        'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to identify nodes (entities and concepts) in the text. Nodes must have short, unique names and a clear type. You have already identified the following nodes\n' + ','.join(str(x) for x in nodes) + '\n Do not duplicate any of the nodes that you have already identified. There is a heavy penalty if you repeat the same node. If you have identified every node in the text, do not call the function.\n\nThe block of text is as follows: \n\n' + text}],
                functions = node_functions,
                function_call = 'auto'
            )

            if (node_response.choices[0].message.function_call and node_response.choices[0].message.function_call.arguments):
                json_response = json.loads(node_response.choices[0].message.function_call.arguments)
                if (json_response not in nodes and json_response is not None):
                    nodes.append(json_response)
        else:
            print("Error: Unexpected response from the model.\n\n" + decide_continue_response.choices[0].message.content)
            print("Nodes identified so far:")
            return nodes
    
    print("Finished identifying nodes.")
    
    if (len(nodes) == 0):
        print("Error: No nodes were identified in the text.")
        print("Trying again...")
        return get_nodes_from_text(text, loop_parameter)
    else:
        return nodes

In [37]:
nodes = get_nodes_from_text(test_string)
for node in nodes:
    print(node)

Finished identifying nodes.
{'name': 'Alice', 'type': 'Person'}
{'name': 'the Pentagon', 'type': 'Place'}
{'name': 'White House', 'type': 'Place'}
{'name': 'Bob', 'type': 'Person'}
{'name': 'Joe', 'type': 'Person'}
{'name': 'French History books', 'type': 'Concept'}


In [74]:
# function to identify relationships in a block of text

def get_relationships_from_text(text, nodes, loop_parameter=loop_parameter):
    relationships = []
    for i in range(loop_parameter):
        decide_continue_response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {'role': 'system', 'content': 'You are a highly intelligent agent creating a knowledge graph from a given block of text. You have already identified all the nodes. Your job is to answer a yes or no question: whether you have found all relationships between nodes in the provided text.'},
                {'role': 'user', 'content': 
                    'The block of text is:\n\n' 
                    + text 
                    + '\n\nYou have already identified these relationships:\n\n'
                    + ','.join(str(x) for x in relationships) 
                    + '\n\nThis is the list of all nodes in the text:\n\n'
                    + ','.join(str(x) for x in nodes)
                    + '\n\nIf you think you have identified every relationship, please respond with the word "yes". Else, please respond with the word "no". Do not respond with anything else. You must respond with either yes or no.'}
                ]
        )
        if (decide_continue_response.choices[0].message.content.lower() == 'yes'):
            break
        elif (decide_continue_response.choices[0].message.content.lower() == 'no'):
            relationship_response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{'role': 'user', 'content': 
                        'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to identify relationships between nodes (entities and concepts) in the text. Relationships must have short, unique names and a clear type. You have already identified the following relationships\n' + ','.join(str(x) for x in relationships) + '\n Do not duplicate any of the relationships that you have already identified. There is a heavy penalty if you repeat the same relationship.' + '\n\nThis is the list of all nodes in the text:\n\n'
                    + ','.join(str(x) for x in nodes) + '\n\nIf you have identified every relationship in the text, do not call the function.\n\nThe block of text is as follows: \n\n' + text}],
                functions = relationship_functions,
                function_call = 'auto'
            )
        
            if (relationship_response.choices[0].message.function_call and relationship_response.choices[0].message.function_call.arguments):
                json_response = json.loads(relationship_response.choices[0].message.function_call.arguments)
                if (json_response not in relationships and json_response is not None):
                    relationships.append(json_response) 
        else:
            print("Error: Unexpected response from the model.\n\n" + decide_continue_response.choices[0].message.content)
            print("Relationships identified so far:")
            return relationships
    
    print("Finished identifying relationships.")
    
    if (len(relationships) == 0):
        print("Error: No relationships were identified in the text.")
        print("Trying again...")
        return get_relationships_from_text(text, nodes, loop_parameter)
    else:
        return relationships

In [39]:
rels = get_relationships_from_text(test_string, nodes, loop_parameter=loop_parameter)
for rel in rels:
    print(rel)

Finished identifying relationships.
{'source': 'Alice', 'target': 'the Pentagon', 'type': 'residence'}
{'source': 'Bob', 'target': 'White House', 'type': 'residence'}
{'source': 'Joe', 'target': 'Alice', 'type': 'sibling'}
{'source': 'Bob', 'target': 'French History books', 'type': 'interest'}
{'source': 'Bob', 'target': 'Joe', 'type': 'interest'}


## Text splitter

In [60]:
from langchain_text_splitters import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=0)

In [61]:
with open("moon_speech.txt") as f:
    text = f.read()

In [62]:
documents = text_splitter.split_text(text)

In [63]:
print(documents[0])
print(len(documents))

President Pitzer, Mr. Vice President, Governor, Congressman Thomas, Senator Wiley, and Congressman Miller, Mr. Webb, Mr. Bell, scientists, distinguished guests, and ladies and gentlemen:

I appreciate your president having made me an honorary visiting professor, and I will assure you that my first lecture will be very brief.

I am delighted to be here, and I’m particularly delighted to be here on this occasion.

We meet at a college noted for knowledge, in a city noted for progress, in a state noted for strength, and we stand in need of all three, for we meet in an hour of change and challenge, in a decade of hope and fear, in an age of both knowledge and ignorance. The greater our knowledge increases, the greater our ignorance unfolds.

Despite the striking fact that most of the scientists that the world has ever known are alive and working today, despite the fact that this nation’s own scientific manpower is
14


## Knowledge Graph Generation

In [75]:
for i in range(len(documents)):
    nodes = get_nodes_from_text(documents[i])
    relationships = get_relationships_from_text(documents[i], nodes)
    for node in nodes:
        print(node['name']+ '; ' + node['type'])
    for relationship in relationships:
        print(relationship['source'] + '; ' + relationship['target'] + '; ' + relationship['type'])

Finished identifying nodes.
Finished identifying relationships.
President Pitzer; Person
Mr. Vice President; Person
Governor; Person
Congressman Thomas; Person
Senator Wiley; Person
Congressman Miller; Person
Mr. Webb; Person
Mr. Bell; Person
scientists; Group
distinguished guests; Group
College; knowledge; noted_for
President Pitzer; honorary visiting professor; appointed_as
College; progress; noted_for
College; strength; noted_for
scientists; world; exists_in
College; city; located_in
President Pitzer; Mr. Vice President; addressed
Finished identifying nodes.
Finished identifying relationships.
12 years; time period
rate of growth; concept
unknown; concept
human history; concept
50,000 years; time period
three times; comparative measurement
time span of a half-century; time period
10 years ago; time period
12 years; rate of growth; duration
rate of growth; three times; comparison
50,000 years; time span of a half-century; condensed representation
vast stretches of the unknown; collec

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
with driver.session() as session:
    for i in range(len(documents)):
        nodes = get_nodes_from_text(documents[i])
        relationships = get_relationships_from_text(documents[i], nodes)
        for node in nodes:
            session.execute_write(add_node, node)
        session.execute_write()

In [None]:
with driver.session() as session:
    session.execute_write(add_node, "The Pentagon", "Building")
    session.execute_write(add_edge, "Alice", "The Pentagon", "LIVES_IN")