In [1]:
from neo4j import GraphDatabase
from langchain.graphs import Neo4jGraph
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

# Load variables from the local secrets file before reading them from os.environ.
load_dotenv(dotenv_path='secrets.env')

# Neo4j connection details; a missing variable raises KeyError right here.
url, username, password = (
    os.environ[key]
    for key in ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
)


In [2]:
# OpenAI client used by the extraction functions in this notebook; the API key is read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [4]:
# LangChain graph wrapper for the same Neo4j instance; it reuses the connection
# settings read from the environment in the first cell.
graph = Neo4jGraph(url=url, username=username, password=password)

In [5]:
# Delete the graph: DESTRUCTIVE — removes every node (and, via DETACH, its relationships)
# from the connected database.
graph.query("MATCH (n) DETACH DELETE n")

[]

In [6]:
# Neo4j driver; its sessions are used for the node/edge writes later in the notebook.
driver = GraphDatabase.driver(url, auth=(username, password))

In [7]:
def add_node(tx, name, type):
    """Create one node labelled ``type`` whose ``name`` property is ``name``.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        name: Value stored in the node's ``name`` property; sent as a query
            parameter, never interpolated into the query text.
        type: Node label. Labels cannot be passed as query parameters, so the
            label is backtick-quoted (embedded backticks doubled) before it is
            placed in the query. This keeps labels with spaces or punctuation
            valid and prevents a crafted label from injecting Cypher.

    Raises:
        ValueError: If ``type`` is not a non-empty string.
    """
    if not isinstance(type, str) or not type.strip():
        raise ValueError(f"Node type must be a non-empty string, got {type!r}")
    # Some Cypher versions expand the \u0060 escape into a backtick before
    # parsing, so normalise it first and then double every backtick.
    quoted_label = "`" + type.replace("\\u0060", "`").replace("`", "``") + "`"
    tx.run(f"CREATE (n:{quoted_label} {{name: $name}})", name=name)

def add_edge(tx, source, target, type):
    """Create a ``type`` relationship from node ``source`` to node ``target``.

    Nodes are matched on their ``name`` property only (no label), so one
    relationship is created for every matching (source, target) pair, and
    nothing is created when either name matches no node.

    Args:
        tx: Transaction-like object exposing ``run(query, **parameters)``.
        source: ``name`` of the start node (sent as a query parameter).
        target: ``name`` of the end node (sent as a query parameter).
        type: Relationship type. It cannot be passed as a query parameter, so
            it is backtick-quoted (embedded backticks doubled) before it is
            placed in the query. This keeps types with spaces or punctuation
            valid and prevents a crafted type from injecting Cypher.

    Raises:
        ValueError: If ``type`` is not a non-empty string.
    """
    if not isinstance(type, str) or not type.strip():
        raise ValueError(f"Relationship type must be a non-empty string, got {type!r}")
    # Some Cypher versions expand the \u0060 escape into a backtick before
    # parsing, so normalise it first and then double every backtick.
    quoted_type = "`" + type.replace("\\u0060", "`").replace("`", "``") + "`"
    tx.run(
        "MATCH (s), (t) WHERE s.name = $source AND t.name = $target "
        f"CREATE (s)-[r:{quoted_type}]->(t)",
        source=source,
        target=target,
    )


In [7]:
# with driver.session() as session:
#     session.execute_write(add_node, "The Pentagon", "Building")
#     session.execute_write(add_edge, "Alice", "The Pentagon", "LIVES_IN")


In [8]:
# Function-calling schema offered to the chat model when it extracts a single node.
# (Node properties could be added to this schema later.)
node_functions = [
    dict(
        name="extract_node",
        description="Extract a node entity from a document",
        parameters={
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "A short name for the node entity."},
                "type": {"type": "string", "description": "The type of the node entity."},
            },
            "required": ["name", "type"],
        },
    )
]

In [9]:
# Function-calling schema offered to the chat model when it extracts a single relationship.
relationship_functions = [
    {
        "name": "extract_relationship",
        # Typo fixed ("nodesfrom" -> "nodes from"): this text is sent to the model.
        "description": "Extract a relationship between two nodes from a document",
        "parameters": {
            "type": "object",
            "properties": {
                "source": {
                    "type": "string",
                    "description": "The name of the source node."
                },
                "target": {
                    "type": "string",
                    "description": "The name of the target node."
                },
                "type": {
                    "type": "string",
                    "description": "The type of the relationship."
                }
            },
            "required": ["source", "target", "type"]
        }
    }
]

In [10]:
# Small hand-written sample used to try out the extraction functions.
test_string = (
    "Alice lives in the Pentagon. "
    "Bob lives in the White House. "
    "Joe is Alice's brother. "
    "Bob enjoys reading French History books with Joe."
)

In [10]:
# --- PARAMETERS ---

# Upper bound on how many passes each identification loop may make
loop_parameter = 10

In [11]:
# function to identify nodes in a block of text

def get_nodes_from_text(text, loop_parameter=loop_parameter, max_retries=3):
    """Ask the chat model to extract knowledge-graph nodes from ``text``.

    Each pass first asks the model whether every node has been found. On "yes"
    the pass loop ends; on "no" the model is asked (through the
    ``extract_node`` function-call schema) for one more node, which is kept
    only if it is a new JSON object carrying both ``name`` and ``type``.

    Args:
        text: The block of text to analyse.
        loop_parameter: Maximum number of passes per attempt.
        max_retries: Extra attempts made when an attempt ends with no nodes.
            This bounds what used to be an unbounded recursive retry, which
            could keep calling the API until the recursion limit was hit.

    Returns:
        A list of node dicts (``{"name": ..., "type": ...}``). The list is
        empty when the model gives an unusable yes/no answer before any node
        was found, or when every attempt came back empty.
    """
    for attempt in range(max_retries + 1):
        nodes = []
        for _ in range(loop_parameter):
            known_nodes = ','.join(str(x) for x in nodes)
            decide_continue_response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {'role': 'system', 'content': 'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to answer a yes or no question: whether you have found all nodes (entities and concepts) in the provided text.'},
                    {'role': 'user', 'content':
                        'The block of text is:\n\n'
                        + text
                        + '\n\nYou have already identified these nodes:\n\n'
                        + known_nodes
                        + '\n\nIf you think you have identified every node, please respond with the word "yes". Else, please respond with the word "no". Do not respond with anything else. You must respond with either yes or no.'}
                    ]
            )
            raw_answer = decide_continue_response.choices[0].message.content
            # Accept answers such as "Yes.", " no\n" or '"yes"'; content may also be None.
            answer = (raw_answer or '').strip().strip('.!"\'').strip().lower()

            if answer == 'yes':
                break
            if answer != 'no':
                print("Error: Unexpected response from the model.\n\n" + str(raw_answer))
                print("Nodes identified so far:")
                return nodes

            node_response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{'role': 'user', 'content':
                        'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to identify nodes (entities and concepts) in the text. Nodes must have short, unique names and a clear type. You have already identified the following nodes\n' + known_nodes + '\n Do not duplicate any of the nodes that you have already identified. There is a heavy penalty if you repeat the same node. If you have identified every node in the text, do not call the function.\n\nThe block of text is as follows: \n\n' + text}],
                functions = node_functions,
                function_call = 'auto'
            )

            function_call = node_response.choices[0].message.function_call
            if not (function_call and function_call.arguments):
                continue
            try:
                json_response = json.loads(function_call.arguments)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}")
                print(f"Response content: {function_call.arguments}")
                continue
            # Callers index node['name'] / node['type'], so discard malformed payloads here.
            well_formed = (
                isinstance(json_response, dict)
                and 'name' in json_response
                and 'type' in json_response
            )
            if well_formed and json_response not in nodes:
                nodes.append(json_response)

        print("Finished identifying nodes.")

        if nodes:
            return nodes
        print("Error: No nodes were identified in the text.")
        if attempt < max_retries:
            print("Trying again...")

    return []

In [None]:
# Run the node extractor on the sample text and show one node per line.
nodes = get_nodes_from_text(test_string)
for extracted_node in nodes:
    print(extracted_node)

In [12]:
# Stand-alone call to gpt-4o with a placeholder user message.
# NOTE(review): no later cell in this notebook reads the result — it looks like a scratch check; confirm before keeping.
scratch_messages = [
    {'role': 'system', 'content': 'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to answer a yes or no question: whether you have found all nodes (entities and concepts) in the provided text.'},
    {'role': 'user', 'content': 'hello'},
]
decide_continue_response = client.chat.completions.create(
    model="gpt-4o",
    messages=scratch_messages,
)

In [13]:
# function to identify relationships in a block of text

def get_relationships_from_text(text, nodes, loop_parameter=loop_parameter, max_retries=3):
    """Ask the chat model to extract relationships between ``nodes`` in ``text``.

    Each pass first asks the model whether every relationship has been found.
    On "yes" the pass loop ends; on "no" the model is asked (through the
    ``extract_relationship`` function-call schema) for one more relationship.
    A relationship is kept only if it is new, carries ``source``, ``target``
    and ``type``, and both endpoints are names from ``nodes``.

    Args:
        text: The block of text to analyse.
        nodes: Node dicts previously extracted from the same text; their
            ``name`` values are the only permitted endpoints.
        loop_parameter: Maximum number of passes per attempt.
        max_retries: Extra attempts made when an attempt ends with no
            relationships. This bounds what used to be an unbounded recursive
            retry, which never terminated for text without relationships.

    Returns:
        A list of relationship dicts
        (``{"source": ..., "target": ..., "type": ...}``). The list is empty
        when ``nodes`` has no usable names, when the model gives an unusable
        yes/no answer before anything was found, or when every attempt came
        back empty.
    """
    node_names = [node['name'] for node in nodes if isinstance(node, dict) and 'name' in node]
    if not node_names:
        # No relationship could pass the endpoint check, so skip the API calls entirely.
        print("Error: No nodes were provided, so no relationships can be identified.")
        return []

    known_nodes = ','.join(str(x) for x in nodes)

    for attempt in range(max_retries + 1):
        relationships = []
        for _ in range(loop_parameter):
            known_relationships = ','.join(str(x) for x in relationships)
            decide_continue_response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {'role': 'system', 'content': 'You are a highly intelligent agent creating a knowledge graph from a given block of text. You have already identified all the nodes. Your job is to answer a yes or no question: whether you have found all relationships between nodes in the provided text.'},
                    {'role': 'user', 'content':
                        'The block of text is:\n\n'
                        + text
                        + '\n\nYou have already identified these relationships:\n\n'
                        + known_relationships
                        + '\n\nThis is the list of all nodes in the text:\n\n'
                        + known_nodes
                        + '\n\nIf you think you have identified every relationship, please respond with the word "yes". Else, please respond with the word "no". Do not respond with anything else. You must respond with either yes or no.'}
                    ]
            )
            raw_answer = decide_continue_response.choices[0].message.content
            # Accept answers such as "Yes.", " no\n" or '"yes"'; content may also be None.
            answer = (raw_answer or '').strip().strip('.!"\'').strip().lower()

            if answer == 'yes':
                break
            if answer != 'no':
                print("Error: Unexpected response from the model.\n\n" + str(raw_answer))
                print("Relationships identified so far:")
                return relationships

            relationship_response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{'role': 'user', 'content':
                        'You are a highly intelligent agent creating a knowledge graph from a given block of text. Your job is to identify relationships between nodes (entities and concepts) in the text. Relationships must have short, unique names and a clear type. You have already identified the following relationships\n' + known_relationships + '\n Do not duplicate any of the relationships that you have already identified. There is a heavy penalty if you repeat the same relationship.' + '\n\nThis is the provided list of all nodes in the text:\n\n'
                    + known_nodes + '\n\n The source and target nodes in any relationship must be in the provided list of nodes. You must use nodes from the provided list of nodes. Do not create new nodes that are not in the list. If you have identified every relationship in the text, do not call the function.\n\nThe block of text is as follows: \n\n' + text}],
                functions = relationship_functions,
                function_call = 'auto'
            )

            function_call = relationship_response.choices[0].message.function_call
            if not (function_call and function_call.arguments):
                continue
            try:
                json_response = json.loads(function_call.arguments)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError: {e}")
                print(f"Response content: {function_call.arguments}")
                continue
            # A payload missing a key used to raise KeyError; treat it as unusable instead.
            well_formed = (
                isinstance(json_response, dict)
                and 'type' in json_response
                and json_response.get('source') in node_names
                and json_response.get('target') in node_names
            )
            if well_formed and json_response not in relationships:
                relationships.append(json_response)

        print("Finished identifying relationships.")

        if relationships:
            return relationships
        print("Error: No relationships were identified in the text.")
        if attempt < max_retries:
            print("Trying again...")

    return []

In [None]:
# Run the relationship extractor on the sample text, restricted to the nodes found above,
# and show one relationship per line.
rels = get_relationships_from_text(test_string, nodes, loop_parameter=loop_parameter)
for extracted_rel in rels:
    print(extracted_rel)

## Text splitter

In [14]:
from langchain_text_splitters import TokenTextSplitter

# Splitter that cuts long texts into chunks for extraction: chunk_size=300
# (presumably measured in tokens, given the splitter class — confirm), no overlap
# between chunks, surrounding whitespace kept.
text_splitter = TokenTextSplitter(chunk_size=300, chunk_overlap=0, strip_whitespace=False)

In [81]:
# from langchain.document_loaders import WikipediaLoader
# raw_documents = WikipediaLoader(query="History of France").load()

In [15]:
import wikipediaapi, urllib.parse
# English-language Wikipedia client; pages are fetched through it by fetch_content_from_url.
wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='ContentSummary (axchenster@gmail.com)',
)

In [16]:
def extract_article_name(url):
    """Return the article title encoded in the path of a wiki-style URL.

    For paths of the form ``/wiki/<title>`` everything after ``/wiki/`` is the
    title, so titles that contain a slash (e.g. ``AC/DC``) are kept whole.
    For any other path the last non-empty segment is used. A trailing slash is
    ignored in both cases, and percent-encoding is decoded.

    Args:
        url: The article URL.

    Returns:
        The decoded article title, or an empty string if the path has none.
    """
    path = urllib.parse.urlparse(url).path
    wiki_prefix = '/wiki/'
    if path.startswith(wiki_prefix):
        # Keep inner slashes: they are part of the title, not path separators.
        raw_title = path[len(wiki_prefix):].rstrip('/')
    else:
        # Drop a trailing slash first, otherwise the last segment would be empty.
        raw_title = path.rstrip('/').rsplit('/', 1)[-1]
    return urllib.parse.unquote(raw_title)

def fetch_content_from_url(url):
    """Return the ``text`` of the wiki page whose article name is taken from ``url``."""
    article_name = extract_article_name(url)
    page = wiki.page(article_name)
    return page.text

In [21]:
# Alternative source article:
# documents = text_splitter.split_text(fetch_content_from_url('https://en.wikipedia.org/wiki/History_of_France'))
article_text = fetch_content_from_url('https://en.wikipedia.org/wiki/American_Airlines')
documents = text_splitter.split_text(article_text)


In [27]:
# Read the full contents of the moon-speech text file into `text`.
with open("./texts/moon_speech.txt") as speech_file:
    text = speech_file.read()

In [28]:
# Chunk the text read from the file. This reassigns `documents`, replacing the
# chunks built earlier from the Wikipedia article.
documents = text_splitter.split_text(text)

In [None]:
# Preview the first chunk produced by the splitter.
print(documents[0])

## Knowledge Graph Generation

In [23]:
def upload_nodes_rels(document):
    """Extract nodes and relationships from ``document`` and write them to Neo4j.

    Extraction runs first, and the database session is opened only afterwards,
    so no session is held while the model calls are in flight. Every write is
    attempted independently: a failure is printed and the remaining items are
    still uploaded.

    Args:
        document: Text chunk to process.
    """
    nodes = get_nodes_from_text(document)
    relationships = get_relationships_from_text(document, nodes)

    with driver.session() as session:
        for node in nodes:
            try:
                session.execute_write(add_node, node['name'], node['type'])
                print(f"Uploaded node: {node['name']}; {node['type']}")
            except Exception as e:
                # Use .get and f-string formatting here: re-indexing or concatenating
                # the same fields could raise again (missing key, non-string value)
                # and abort the whole upload from inside the error handler.
                print(f"Error uploading node: {node.get('name')};\n {node.get('type')};\n {e}")
        for relationship in relationships:
            try:
                session.execute_write(add_edge, relationship['source'], relationship['target'], relationship['type'])
                print(f"Uploaded relationship: {relationship['source']}; {relationship['target']}; {relationship['type']}")
            except Exception as e:
                print(f"Error uploading relationship: {relationship.get('source')}; {relationship.get('target')}; {relationship.get('type')};\n {e}")

In [None]:
from tqdm import tqdm

# Extract and upload every chunk in order, showing a progress bar.
for document in tqdm(documents, total=len(documents)):
    upload_nodes_rels(document)


## Misc

In [None]:
# Inline variant of the upload loop: extract from every chunk, then write its nodes and relationships.
# NOTE(review): this repeats what upload_nodes_rels does; running both cells writes
# each chunk's extraction to the graph a second time.
with driver.session() as session:
    for document in documents:
        nodes = get_nodes_from_text(document)
        relationships = get_relationships_from_text(document, nodes)
        for node in nodes:
            # add_node takes (tx, name, type); previously the whole dict was passed as `name`
            # and `type` was missing, which raised TypeError.
            session.execute_write(add_node, node['name'], node['type'])
        # The bare session.execute_write() call (no work function) raised TypeError;
        # the extracted relationships are uploaded in its place.
        for relationship in relationships:
            session.execute_write(add_edge, relationship['source'], relationship['target'], relationship['type'])

In [None]:
# Manual write check: create one Building node, then try to link "Alice" to it.
# add_edge matches nodes by name, so no relationship is created unless a node
# named "Alice" already exists in the graph.
with driver.session() as demo_session:
    demo_session.execute_write(add_node, "The Pentagon", "Building")
    demo_session.execute_write(add_edge, "Alice", "The Pentagon", "LIVES_IN")