In [5]:
# import dotenv
# import os

# # Use verify_connectivity() method to ensure that a working connection can be established with a 'Driver' instance
# load_status = dotenv.load_dotenv("Neo4j-e68dbc41-Created-2025-01-27.txt")
# if load_status is False:
#     raise RuntimeError('Environment variables not loaded.')

# URI = os.getenv("NEO4J_URI")
# AUTH = (os.getenv("NEO4J_USERNAME"), os.getenv("NEO4J_PASSWORD"))

import os
from neo4j import GraphDatabase
from utils.config import OPENAI_API_KEY, AURA_URI, AURA_USERNAME, AURA_PASSWORD

URI = AURA_URI
AUTH = (AURA_USERNAME, AURA_PASSWORD)

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")

Connection established.


In [7]:
from langchain_openai import OpenAI

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

llm = OpenAI()

In [9]:
llm.invoke("What do you know about the EU AI Act?")

'\n\nThe EU AI Act, officially known as the Artificial Intelligence Act, is a proposed legislation by the European Commission aimed at regulating the development and use of artificial intelligence (AI) in the European Union. It was introduced in April 2021 and is currently under review by the European Parliament and the Council of the EU.\n\nSome key points about the EU AI Act include:\n\n1. Scope and definitions: The act defines AI systems as software that is designed to interact with the environment and take decisions or actions without human intervention. It covers AI systems that are developed, sold, or used in the EU, regardless of where they were created.\n\n2. Risk-based approach: The act introduces a risk-based approach, where AI systems are categorized into four levels of risk - unacceptable, high, limited, and minimal. The higher the risk, the stricter the requirements for their development and use.\n\n3. Bans and restrictions: The act proposes a ban on certain AI practices t

In [None]:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

llm_transformer = LLMGraphTransformer(llm=llm)
from langchain_core.documents import Document

text = """
1. High-risk AI systems shall be designed and developed in such a way, including with appropriate human-machine interface tools, that they can be effectively overseen by natural persons during the period in which they are in use.

2. Human oversight shall aim to prevent or minimise the risks to health, safety or fundamental rights that may emerge when a high-risk AI system is used in accordance with its intended purpose or under conditions of reasonably foreseeable misuse, in particular where such risks persist despite the application of other requirements set out in this
Section.

3. The oversight measures shall be commensurate with the risks, level of autonomy and context of use of the high-risk AI system, and shall be ensured through either one or both of the following types of measures:

(a) measures identified and built, when technically feasible, into the high-risk AI system by the provider before it is placed on the market or put into service;

(b) measures identified by the provider before placing the high-risk AI system on the market or putting it into service and that are appropriate to be implemented by the deployer.

4. For the purpose of implementing paragraphs 1, 2 and 3, the high-risk AI system shall be provided to the deployer in such a way that natural persons to whom human oversight is assigned are enabled, as appropriate and proportionate:

(a) to properly understand the relevant capacities and limitations of the high-risk AI system and be able to duly monitor its operation, including in view of detecting and addressing anomalies, dysfunctions and unexpected performance;

(b) to remain aware of the possible tendency of automatically relying or over-relying on the output produced by a high-risk AI system (automation bias), in particular for high-risk AI systems used to provide information or recommendations for decisions to be taken by natural persons;

(c) to correctly interpret the high-risk AI system’s output, taking into account, for example, the interpretation tools and methods available;

(d) to decide, in any particular situation, not to use the high-risk AI system or to otherwise disregard, override or reverse the output of the high-risk AI system;

(e) to intervene in the operation of the high-risk AI system or interrupt the system through a ‘stop’ button or a similar procedure that allows the system to come to a halt in a safe state.

5. For high-risk AI systems referred to in point 1(a) of Annex III, the measures referred to in paragraph 3 of this Article shall be such as to ensure that, in addition, no action or decision is taken by the deployer on the basis of the identification resulting from the system unless that identification has been separately verified and confirmed by at least two natural persons with the necessary competence, training and authority. The requirement for a separate verification by at least two natural persons shall not apply to high-risk AI systems used for the purposes of law enforcement, migration, border control or asylum, where Union or national law considers the application of this requirement to be disproportionate.
"""
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents[0].nodes}")
print("\n")
print(f"Relationships:{graph_documents[0].relationships}")

In [147]:
import pandas as pd

ai_dataframe = pd.read_csv("data/processed_and_formatted.csv")

In [171]:
from langchain.graphs.neo4j_graph import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_experimental.graph_transformers.llm import LLMGraphTransformer
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from uuid import uuid4

# Step 1: Set up Neo4jGraph
graph = Neo4jGraph(
    url=AURA_URI,
    username=AURA_USERNAME,
    password=AURA_PASSWORD
)

# Ensure the schema avoids duplicate nodes
with graph._driver.session() as session:
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Entity) REQUIRE n.id IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (r:Relation) REQUIRE r.id IS UNIQUE")

In [173]:
# Convert text column into LangChain Document objects
documents = [Document(page_content=text, metadata={"id": str(uuid4())}) for text in ai_dataframe['Content']]

In [175]:
from langchain.chains import LLMChain

# Define the LLM and PromptTemplate
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
template = "Extract entities and relationships from the following text: {text}"
prompt = PromptTemplate(input_variables=["text"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

In [177]:
# Instead of Runnables, use the pipe operator to chain the prompt and the LLM
llm_pipeline = prompt | llm

In [179]:
# Function to split text into chunks based on a token limit
def split_text_into_chunks(text, max_tokens=4000):
    """Splits the text into smaller chunks based on token count."""
    words = text.split()
    chunks = []
    chunk = []
    token_count = 0

    for word in words:
        # Estimate tokens as words (this is an approximation; for more precision, use a tokenizer like tiktoken)
        token_count += len(word.split())
        if token_count > max_tokens:
            chunks.append(" ".join(chunk))
            chunk = [word]
            token_count = len(word.split())
        else:
            chunk.append(word)

    if chunk:
        chunks.append(" ".join(chunk))

    return chunks

In [181]:
# Split large documents into smaller chunks if necessary
max_tokens = 4000  # Adjust this based on the model's context window
chunked_documents = []

for doc in documents:
    chunks = split_text_into_chunks(doc.page_content, max_tokens)
    chunked_documents.extend([Document(page_content=chunk) for chunk in chunks])

In [185]:
# Process the documents using the LLM pipeline
graph_documents = llm_pipeline.invoke(chunked_documents)

In [189]:
# Iterate over the graph documents and print nodes and relationships for each document
for i, graph_doc in enumerate(graph_documents):
    print(f"Document {i + 1}:")
    print(f"Nodes: {graph_doc.nodes}")
    print(f"Relationships: {graph_doc.relationships}")
    print("\n")

Document 1:


AttributeError: 'tuple' object has no attribute 'nodes'

In [197]:
graph_documents

AIMessage(content='Based on the provided text, here are the extracted entities and relationships:\n\n### Entities:\n1. **European Union (EU)**\n2. **European Commission**\n3. **AI Office**\n4. **Member States**\n5. **Market Surveillance Authorities**\n6. **Providers of AI Systems**\n7. **Deployers of AI Systems**\n8. **Notified Bodies**\n9. **General-Purpose AI Models**\n10. **High-Risk AI Systems**\n11. **Vulnerable Groups**\n12. **Scientific Panel of Independent Experts**\n13. **Advisory Forum**\n14. **Conformity Assessment Bodies**\n15. **Data Protection Authorities**\n16. **Civil Society Organizations**\n17. **Small and Medium-sized Enterprises (SMEs)**\n18. **Public Authorities**\n19. **Law Enforcement Authorities**\n20. **Consumers**\n21. **Affected Persons**\n22. **Technical Documentation**\n23. **Risk Management System**\n24. **Biometric Data**\n25. **Deepfake Technology**\n26. **AI Regulatory Sandboxes**\n27. **Harmonised Standards**\n28. **Codes of Conduct**\n29. **Transparen

In [199]:
# Transform documents into a graph
for doc in documents:
    graph_transformer.convert_to_graph_documents(documents=[doc])

# Validate the graph creation
with graph._driver.session() as session:
    results = session.run("MATCH (n) RETURN n LIMIT 10")
    for record in results:
        print(record)

print("Graph creation complete!")

NameError: name 'graph_transformer' is not defined

In [157]:
from langchain.graphs.neo4j_graph import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from uuid import uuid4
import re

# Step 1: Set up Neo4jGraph
graph = Neo4jGraph(
    url=AURA_URI,
    username=AURA_USERNAME,
    password=AURA_PASSWORD
)

# Ensure the schema avoids duplicate nodes
with graph._driver.session() as session:
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (n:Entity) REQUIRE n.id IS UNIQUE")
    session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (r:Relation) REQUIRE r.id IS UNIQUE")

# Convert text column into LangChain Document objects
documents = [Document(page_content=text, metadata={"id": str(uuid4())}) for text in ai_dataframe['Content']]

# Define the LLM and PromptTemplate
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
template = "Extract entities and relationships from the following text: {text}"
prompt = PromptTemplate(input_variables=["text"], template=template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Instead of Runnables, use the pipe operator to chain the prompt and the LLM
llm_pipeline = prompt | llm

# Function to split text into chunks based on token limit while preserving context
def split_text_into_chunks(text, max_tokens=4000):
    """Splits the text into smaller chunks while preserving sentences and relationships."""
    
    # Split text into sentences for better chunking
    sentences = re.split(r'(?<=\.|\?)\s', text)  # This will split the text by sentences (e.g., ending with period or question mark)
    
    chunks = []
    current_chunk = []
    current_token_count = 0

    for sentence in sentences:
        # Estimate token count based on word count
        tokens_in_sentence = len(sentence.split())
        
        # If adding this sentence exceeds the token limit, create a new chunk
        if current_token_count + tokens_in_sentence > max_tokens:
            if current_chunk:  # Avoid adding empty chunks
                chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_token_count = tokens_in_sentence
        else:
            current_chunk.append(sentence)
            current_token_count += tokens_in_sentence

    # Add any remaining chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

# Split large documents into smaller chunks if necessary
max_tokens = 4000  # Adjust this based on the model's context window
chunked_documents = []

for doc in documents:
    chunks = split_text_into_chunks(doc.page_content, max_tokens)
    chunked_documents.extend([Document(page_content=chunk) for chunk in chunks])

# Process the documents using the LLM pipeline
graph_documents = llm_pipeline.invoke(chunked_documents)

# # Ensure LLMGraphTransformer is applied for knowledge graph transformation
# graph_transformer = LLMGraphTransformer(graph=graph, documents=graph_documents)

TypeError: LLMGraphTransformer.__init__() got an unexpected keyword argument 'graph'

In [165]:
# Print out the nodes and relationships from graph_documents
for doc in graph_documents:
    print(f"Document ID: {doc.metadata['id']}")
    if 'nodes' in doc.metadata:
        print(f"Nodes: {doc.metadata['nodes']}")
    if 'relationships' in doc.metadata:
        print(f"Relationships: {doc.metadata['relationships']}")

AttributeError: 'tuple' object has no attribute 'metadata'

In [169]:
# Assuming each item in graph_documents is a tuple, where the first element is the Document
for doc_tuple in graph_documents:
    print(doc_tuple)
    doc = doc_tuple[0]  # Access the first element (the actual Document object)
    
    # Now we can access metadata
    nodes = doc.metadata.get('nodes', [])
    relationships = doc.metadata.get('relationships', [])
    
    print(f"Document ID: {doc.metadata['id']}")
    
    print("Nodes:")
    for node in nodes:
        print(f"  Node ID: {node['id']}, Name: {node.get('name', 'N/A')}")
    
    print("Relationships:")
    for relationship in relationships:
        print(f"  Relationship Type: {relationship['type']}, From: {relationship['from']}, To: {relationship['to']}")

('content', 'Based on the provided text, here are the extracted entities and relationships:\n\n### Entities:\n1. **AI Systems**: Refers to various artificial intelligence systems, including high-risk AI systems and general-purpose AI models.\n2. **Providers**: Individuals or organizations that develop and place AI systems on the market.\n3. **Deployers**: Individuals or organizations that use AI systems.\n4. **Market Surveillance Authorities**: National authorities responsible for monitoring compliance with AI regulations.\n5. **European Commission**: The executive branch of the European Union responsible for proposing legislation and implementing decisions.\n6. **AI Office**: A body established to oversee the implementation of AI regulations and provide expertise.\n7. **Notified Bodies**: Organizations designated to assess the conformity of AI systems with regulations.\n8. **Affected Persons**: Individuals who are impacted by the use of AI systems.\n9. **Vulnerable Groups**: Specific 

AttributeError: 'str' object has no attribute 'metadata'

In [159]:
# Iterate over the graph documents and print nodes and relationships for each document
for i, graph_doc in enumerate(graph_documents):
    print(f"Document {i + 1}:")
    print(f"Nodes: {graph_doc.nodes}")
    print(f"Relationships: {graph_doc.relationships}")
    print("\n")

Document 1:


AttributeError: 'tuple' object has no attribute 'nodes'