## Neo4j credentials

In [1]:
from langchain.graphs import Neo4jGraph
from neo4j import GraphDatabase
import os
from dotenv import load_dotenv
load_dotenv()

url = os.getenv('DB_URL')
username = os.getenv('DB_USER')
password = os.getenv('DB_PASS')

graph = Neo4jGraph(
    url=url,
    username=username,
    password=password
)
driver = GraphDatabase.driver(url, auth=(username, password))

In [2]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.docstore.document import Document

from typing import List, Optional
from langchain.pydantic_v1 import Field, BaseModel

class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [3]:
def format_property_key(s: str) -> str:
    words = s.split()
    if not words:
        return s
    first_word = words[0].lower()
    capitalized_words = [word.capitalize() for word in words[1:]]
    return "".join([first_word] + capitalized_words)

def props_to_dict(props) -> dict:
    """Convert properties to a dictionary."""
    properties = {}
    if not props:
      return properties
    for p in props:
        properties[format_property_key(p.key)] = p.value
    return properties

def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node."""
    properties = props_to_dict(node.properties) if node.properties else {}
    # Add name property for better Cypher statement generation
    properties["name"] = node.id.title()
    return BaseNode(
        id=node.id.title(), type=node.type.capitalize(), properties=properties
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Map the KnowledgeGraph Relationship to the base Relationship."""
    source = map_to_base_node(rel.source)
    target = map_to_base_node(rel.target)
    properties = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=source, target=target, type=rel.type, properties=properties
    )

In [4]:
def get_nodes_and_labels():
    cypher_query = "MATCH (n) RETURN n.id AS node_id, labels(n) AS node_label"
    dictionary = {}

    with driver.session() as session:
        result = session.run(cypher_query)
        for record in result:
            if record['node_id'] not in dictionary.keys():
                dictionary[record['node_id']] = record['node_label']

    return dictionary

In [67]:
from chat.raw_text_parser import create_label

def check_label(node, label):
    dictionary = get_nodes_and_labels()
    if node in dictionary.keys():
        return dictionary[node][0]
    elif label == 'Node':
        return create_label(node)
    else:
        return label

In [6]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

os.environ["OPENAI_API_KEY"] = ""
llm = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information from neuro scientific papers in structured formats to build a knowledge graph.Nodes represent entities and concepts like disease, treatment, etc.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels, do not be specific. Remember that we are creating knowledge graph for neuroscience domin
  - ALWAYS CREATE MEANINGFUL LABEL
- **Node Names**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers.
  Never include acronyms and abbreviations in nodes id and name. Save them as property "abbreviation" of the respective nodes.
- Do not include information about author, papers and publication year.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use space for property keys, e.g., `Birth Date`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.

Remember, the knowledge graph should be coherent and easily understandable for scientists, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [48]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:

    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']

    nodes = []
    for node in data.nodes:
        node.type = str(check_label(node.id, node.type))
        map_to_base_node(node)
        nodes.append(map_to_base_node(node))

    for node in data.nodes:
        for rel in data.rels:
            if node.id == rel.source.id:
                rel.source = node
            elif node.id == rel.target.id:
                rel.target = node

    graph_document = GraphDocument(
      nodes=nodes,
      relationships = [map_to_base_relationship(rel) for rel in data.rels],
      source = document)

    graph.add_graph_documents([graph_document])

In [8]:
from utils.reader import list_files, txt_to_doc
from utils.debugger import logger

input_papers = list_files('./data/txt_parsed_papers', ending='.txt')
print(input_papers)
documents = []
for paper in input_papers:
        try:
            documents.append(txt_to_doc(f'./data/txt_parsed_papers/{paper}'))
        except Exception as e:
            logger.exception(f'not successfully read txt, exception "{e}"')


['parsed_paper_10.1002@rmv.2278.txt', 'parsed_paper_voon2006.txt', 'parsed_paper_innovations.txt', 'parsed_paper_luigjes2011.txt', 'parsed_paper_10.1002@mds.26723.txt', 'parsed_paper_weaver2005.txt', 'parsed_paper_wishart2003.txt', 'parsed_paper_patients.txt', 'parsed_paper_coley2009.txt', 'parsed_paper_karapinarsenturk2020.txt', 'parsed_paper_prospective.txt', 'parsed_paper_A systematic review of psychiatric indications for deep brain stimulation  with focus on major depressive and obsessive-compulsive disorder.txt', 'parsed_paper_10.1016@j.clineuro.2019.105442.txt', 'parsed_paper_robert2018.txt', 'parsed_paper_bibliometric.txt', 'parsed_paper_side.txt']


In [79]:
extract_and_store_graph(documents[15])

2024-03-21 00:10:42,551 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [60]:
get_nodes_and_labels()

{'Deep Brain Stimulation': ['Treatment'],
 "Parkinson'S Disease": ['Disease'],
 'Dopamine Dysregulation Syndrome': ['Condition'],
 'Subthalamic Nucleus': ['Brain region'],
 'Globus Pallidus Internus': ['Brain region'],
 'Cognitive Morbidity': ['Condition'],
 'Neuropsychiatric Symptoms': ['Symptom']}

In [22]:
from tqdm import tqdm

for i, d in tqdm(enumerate(documents), total=(len(documents) - 1)):
    print(d.metadata['title'])
    extract_and_store_graph(d)

  0%|          | 0/15 [00:00<?, ?it/s]

10.1002@rmv.2278


  0%|          | 0/15 [00:04<?, ?it/s]


KeyboardInterrupt: 

# Query the KG

In [12]:
from langchain.chains import GraphCypherQAChain

graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    validate_cypher=True,
    verbose=True
)

In [14]:
cypher_chain.invoke({"query": "What Deep Brain Stimulation treats?"})



[1m> Entering new GraphCypherQAChain chain...[0m


2024-03-18 18:26:50,068 [INFO] HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Generated Cypher:
[32;1m[1;3mMATCH (:Concept)-[:TREATS]->(:Disease)
WHERE (:Concept {name: 'Deep Brain Stimulation'})
RETURN DISTINCT (:Disease).name[0m


ValueError: Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Invalid input ':': expected "NOT", an expression or an identifier (line 2, column 8 (offset: 46))
"WHERE (:Concept {name: 'Deep Brain Stimulation'})"
        ^}