In [7]:
# !pip install langchain neo4j openai wikipedia tiktoken langchain_openai
# !pip install -U python-dotenv

# Imports

In [49]:
import os
from dotenv import load_dotenv

load_dotenv()

db = 'graph_extract_1'
username = os.getenv('NEO_USER')
password = os.getenv('NEO_PASS')
url = os.getenv('NEO_URL')

In [50]:
from langchain.graphs import Neo4jGraph
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from tqdm import tqdm
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument
)

from langchain.chains.openai_functions import (
    create_structured_output_chain
)



# Graph DB Setup

In [51]:
graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)

In [16]:
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [17]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [20]:
def get_system_template(allowed_nodes: Optional[List[str]] = None, allowed_rels: Optional[List[str]] = None):
    system_template = f"""
    # Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
    - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    ## 2. Labeling Nodes
    - **Consistency**: Ensure you use basic or elementary types for node labels.
      - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
    - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
    {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
    {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
    ## 3. Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
    - **Property Format**: Properties must be in a key-value format.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
    ## 4. Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
    always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
    Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
    ## 5. Strict Compliance
    Adhere to the rules strictly. Non-compliance will result in termination.
    """
    return system_template



def get_extraction_chain( llm, allowed_nodes: Optional[List[str]] = None, allowed_rels: Optional[List[str]] = None):
    human_template = """
    Use the given format to extract information from the following input: {input}
    Tip: Make sure to answer in the correct format
    """
    
    system_prompt = SystemMessagePromptTemplate.from_template(get_system_template(allowed_nodes, allowed_rels))
    human_prompt = HumanMessagePromptTemplate.from_template(human_template)
    
    chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
    
    return create_structured_output_chain(output_schema=KnowledgeGraph, llm=llm, prompt=chat_prompt, verbose=False)

In [25]:
# topic = "React (software)"

# # Use wikipedia to get the document
# documents = WikipediaLoader(topic).load()



  lis = BeautifulSoup(html).find_all('li')


In [61]:


# Print structure for one document
def print_nodes(data):
    for node in data.nodes:
        print(f"id={node.id} type={node.type}")
        if node.properties:
            print("  <<Properties>>")
            for prop in node.properties:
                print(f"  ->  key={prop.key} value={prop.value}")


In [29]:
doc = documents[0]

In [30]:
nodes:Optional[List[str]] = None
rels:Optional[List[str]]= None

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0, verbose=True)

# Extract graph data using OpenAI functions
extract_chain = get_extraction_chain(llm, nodes, rels)
data = extract_chain.invoke(doc.page_content)['function']



  warn_deprecated(


In [62]:
print_nodes(data)

id=React type=library
  <<Properties>>
  ->  key=description value=React (also known as React.js or ReactJS) is a free and open-source front-end JavaScript library for building user interfaces based on components.
  ->  key=maintainedBy value=Meta
id=Meta type=organization
id=Next.js type=framework
id=Greeting type=component
  <<Properties>>
  ->  key=description value=The Greeting function is a React component that displays 'Hello, world'.
id=React DOM type=library
id=props type=term
  <<Properties>>
  ->  key=description value=Values passed between components.
id=state type=term
  <<Properties>>
  ->  key=description value=Values internal to a component.
id=function components type=term
  <<Properties>>
  ->  key=description value=Components declared with a function that accepts a single 'props' argument and returns JSX.
id=class components type=term
  <<Properties>>
  ->  key=description value=Components declared with a class.
id=React Hooks type=feature
  <<Properties>>
  ->  key=d

In [52]:
# Load into DB
# Construct a graph document
graph_document = GraphDocument(
    nodes = [map_to_base_node(node) for node in data.nodes],
    relationships = [map_to_base_relationship(rel) for rel in data.rels],
    source = doc
)

print(graph_document)

 # Store information into a graph
graph.add_graph_documents([graph_document])

In [53]:
graph.schema

'Node properties are the following:\n\nRelationship properties are the following:\n\nThe relationships are the following:\n'

In [54]:
graph.get_schema

'Node properties are the following:\n\nRelationship properties are the following:\n\nThe relationships are the following:\n'

In [55]:
nodes:Optional[List[str]] = None
rels:Optional[List[str]]= None

nodes = ['Library', 'Framework', 'Feature', 'Term', ]

# Extract graph data using OpenAI functions
extract_chain2 = get_extraction_chain(llm, nodes, rels)
data2 = extract_chain2.invoke(doc.page_content)['function']

In [63]:
print_nodes(data2)

id=React type=Framework
  <<Properties>>
  ->  key=description value=React (also known as React.js or ReactJS) is a free and open-source front-end JavaScript library for building user interfaces based on components.
  ->  key=maintainedBy value=Meta (formerly Facebook) and a community of individual developers and companies
id=Meta type=Company
id=Next.js type=Framework
id=React DOM type=Library
id=Greeting type=Component
  <<Properties>>
  ->  key=description value=The Greeting function is a React component that displays 'Hello, world'.
id=React Hooks type=Feature
  <<Properties>>
  ->  key=description value=Hooks are functions that let developers 'hook into' React state and lifecycle features from function components.
id=useState type=Hook
id=useContext type=Hook
id=useReducer type=Hook
id=useMemo type=Hook
id=useEffect type=Hook
id=Server components type=Feature
  <<Properties>>
  ->  key=description value=React server components or 'RSC's are function components that run exclusively

In [65]:
# Load into DB
# Construct a graph document
graph_document = GraphDocument(
    nodes = [map_to_base_node(node) for node in data2.nodes],
    relationships = [map_to_base_relationship(rel) for rel in data2.rels],
    source = doc
)

print(graph_document)

 # Store information into a graph
graph.add_graph_documents([graph_document])

nodes=[Node(id='React', type='Framework', properties={'description': 'React (also known as React.js or ReactJS) is a free and open-source front-end JavaScript library for building user interfaces based on components.', 'maintainedby': 'Meta (formerly Facebook) and a community of individual developers and companies', 'name': 'React'}), Node(id='Meta', type='Company', properties={'name': 'Meta'}), Node(id='Next.Js', type='Framework', properties={'name': 'Next.Js'}), Node(id='React Dom', type='Library', properties={'name': 'React Dom'}), Node(id='Greeting', type='Component', properties={'description': "The Greeting function is a React component that displays 'Hello, world'.", 'name': 'Greeting'}), Node(id='React Hooks', type='Feature', properties={'description': "Hooks are functions that let developers 'hook into' React state and lifecycle features from function components.", 'name': 'React Hooks'}), Node(id='Usestate', type='Hook', properties={'name': 'Usestate'}), Node(id='Usecontext', 