# Setup

### Imports

In [None]:
# setup venv
# !pip install langchain langchain-community neo4j openai wikipedia tiktoken langchain-openai torch pinecone
# %pip install llama-index-embeddings-openai
# %pip install llama-index-vector-stores-pinecone
# %pip install llama-index-llms-openai
# !pip install llama-index
# !pip -q install python-dotenv pinecone-client llama-index pymupdf

In [8]:
import os
import torch
import torch.nn as nn
from dotenv import load_dotenv
from openai import OpenAI
from langchain.graphs import Neo4jGraph
import tqdm
import json

import pinecone
from pinecone import Pinecone, ServerlessSpec

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore

from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel


### Env and tooling

In [5]:
# Load variables from the local `secrets.env` file into the process environment;
# the cells below read their credentials from os.environ.
load_dotenv(dotenv_path='secrets.env')

True

In [6]:
# Create a Pinecone client from environment credentials and wrap the index
# named by `index_name` in a LlamaIndex PineconeVectorStore.
api_key = os.environ['PINECONE_API_KEY']
environment = os.environ['PINECONE_ENVIRONMENT']
# NOTE(review): this rebinds the name `pinecone`, shadowing the `pinecone`
# module imported at the top of the notebook.
pinecone = Pinecone(api_key=api_key, environment=environment)
index_name = "knowledge-graph"
pinecone_index = pinecone.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [7]:
# Re-load secrets.env so this cell can be run on its own, then open the
# Neo4j connection used by every graph read/write below.
load_dotenv(dotenv_path='secrets.env')
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [9]:
# Shared OpenAI client used for embeddings and chat completions in later cells.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Graph Classes and Functions

In [10]:
# A single key/value pair attached to a node or relationship.
# Both the key and the value are declared as strings.
# (Kept as `#` comments: a class docstring would become part of the model schema.)
class Property(BaseModel):
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

# Graph node as produced by the extraction chain: a BaseNode whose `properties`
# field is declared as an optional list of Property pairs.
# map_to_base_node() converts that list into the dict passed to BaseNode.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="node properties")

# Graph relationship as produced by the extraction chain: a BaseRelationship whose
# `properties` field is declared as an optional list of Property pairs.
# map_to_base_relationship() converts that list into a dict.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="relationship properties")

# Structured output schema requested from the LLM: the nodes and relationships
# extracted from one piece of text. Both fields are required.
class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(..., description="nodes in the graph")
    rels: List[Relationship] = Field(..., description="relationships in the graph")

In [11]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase-style key.

    The first word is lower-cased in full; every following word is
    capitalized (first letter upper, remainder lower) and appended with no
    separator. A string containing no words (empty or whitespace only) is
    returned unchanged.
    """
    parts = s.split()
    if len(parts) == 0:
        return s
    pieces = [parts[0].lower()]
    for part in parts[1:]:
        pieces.append(part.capitalize())
    return "".join(pieces)

def props_to_dict(props) -> dict:
    """Convert an iterable of key/value property objects into a plain dict.

    Each item's `key` is normalized with format_property_key and mapped to the
    item's `value`. A falsy `props` (None or empty) yields an empty dict. When
    two items normalize to the same key, the later one wins.
    """
    return {format_property_key(item.key): item.value for item in (props or [])}

def map_to_base_node(node: Node) -> BaseNode:
    """Build a BaseNode from an extracted Node.

    The node's property list is flattened to a dict, the title-cased id is
    stored under the "name" property (replacing any extracted "name"), the id
    itself is title-cased and the type is capitalized.
    """
    flattened = props_to_dict(node.properties) if node.properties else {}
    title = node.id.title()
    return BaseNode(
        id=title,
        type=node.type.capitalize(),
        properties={**flattened, "name": title},
    )

def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Build a BaseRelationship from an extracted Relationship.

    Both endpoints are converted with map_to_base_node, the relationship type
    is passed through unchanged, and the property list is flattened to a dict
    (empty when there are no properties).
    """
    return BaseRelationship(
        source=map_to_base_node(rel.source),
        target=map_to_base_node(rel.target),
        type=rel.type,
        properties=props_to_dict(rel.properties) if rel.properties else {},
    )

In [12]:
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Chat model used by get_extraction_chain(); temperature is set to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
        allowed_nodes: Optional[List[str]] = None,
        allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph from text.

    The prompt has one system message with the extraction rules and two human
    messages; the first human message carries the `{input}` template variable.
    The system message is an f-string evaluated when this function is called:
    when `allowed_nodes` / `allowed_rels` are non-empty, a line listing them
    (comma-separated) is included, otherwise an empty string is inserted.

    Args:
        allowed_nodes: optional node labels to list in the system prompt.
        allowed_rels: optional relationship types to list in the system prompt.

    Returns:
        The result of create_structured_output_chain for the KnowledgeGraph
        schema, the module-level `llm`, and the prompt built here.
    """
    prompt = ChatPromptTemplate.from_messages(
        [(
            "system",
            f"""# Knowledge Graph Instructions for GPT-4
    ## 1. Overview
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
    - **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
    - The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    ## 2. Labeling Nodes
    - **Consistency**: Ensure you use basic or elementary types for node labels.
    - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
    - **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
    {'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
    {'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
    ## 3. Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
    - **Property Format**: Properties must be in a key-value format.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    - **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
    ## 4. Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
    always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
    Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
    ## 5. Strict Compliance
    Adhere to the rules strictly. Non-compliance will result in termination.
            """),
            ("human", "Use the given format to extract info from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    # The chain's output is parsed into a KnowledgeGraph instance.
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [13]:
def extract_and_store_graph(
      document:Document,
      nodes:Optional[List[str]] = None,
      rels:Optional[List[str]]=None) -> GraphDocument:
    """Extract a knowledge graph from one document and write it to the graph store.

    Args:
        document: the document whose `page_content` is sent to the extraction chain.
        nodes: optional allowed node labels forwarded to get_extraction_chain.
        rels: optional allowed relationship types forwarded to get_extraction_chain.

    Returns:
        The GraphDocument that was added to the module-level `graph`. The
        original version declared this return type but returned None.
    """
    # Extract graph data using OpenAI functions
    extract_chain = get_extraction_chain(nodes, rels)
    data = extract_chain.invoke(document.page_content)['function']

    # Construct a graph document, normalizing extracted nodes/relationships
    # into the base classes accepted by the graph store.
    graph_document = GraphDocument(
        nodes=[map_to_base_node(node) for node in data.nodes],
        relationships=[map_to_base_relationship(rel) for rel in data.rels],
        source=document,
    )

    graph.add_graph_documents([graph_document])
    return graph_document


# Loading Text

### Wikipedia Loader

In [14]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter

# Read the Wikipedia article(s) returned for the query
raw_documents = WikipediaLoader(query="History of France").load()

# Define chunking strategy: token-based chunks of size 600 with an overlap of 24
text_splitter = TokenTextSplitter(chunk_size=600, chunk_overlap=24)

# Only keep the first 10 chunks produced by the splitter
documents = text_splitter.split_documents(raw_documents)[:10]

### Adversarial Prompt

In [None]:
# Adversarial input: three "came before" statements that form a cycle (A, B, C, back to A).
# NOTE(review): running this cell replaces the Wikipedia-derived `documents`
# list with this single document.
text = """
Event A came before Event B.
Event B came before Event C.
Event C came before Event A.
"""
documents = [Document(page_content=text)]

# Build Graph

### Delete Graph

In [16]:
# Delete the graph.
# WARNING: destructive - matches every node and detach-deletes it, which also
# removes the relationships attached to those nodes.
graph.query("MATCH (n) DETACH DELETE n")

[]

### Construct Graph

In [17]:
from tqdm import tqdm

# Extract a graph from every document chunk and store it via extract_and_store_graph.
# tqdm takes the length straight from the list, so the unused enumerate() index
# and the explicit total= are not needed.
for d in tqdm(documents):
    extract_and_store_graph(d)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [03:45<00:00, 22.56s/it]


# Get Nodes and Relationships, Upsert to Pinecone

In [None]:
# Fetch every node and keep the `name` property of those that have one.
query = "MATCH (n) RETURN n"
all_nodes = graph.query(query)
all_nodes_list = [
    name
    for name in (row.get('n').get('name') for row in all_nodes)
    if name is not None
]
print(all_nodes_list)

In [None]:
# Fetch the distinct relationship types in the graph, normalize each one to the
# lower-cased text before any ':' and print it as it is collected.
query = "MATCH (n)-[r]-(m) RETURN distinct type(r)"
all_rel_types = graph.query(query)
all_rel_types_filtered = []
for row in all_rel_types:
    normalized_type = row.get('type(r)').split(':')[0].lower()
    print(normalized_type)
    all_rel_types_filtered.append(normalized_type)

### Upserting to Pinecone

In [None]:
# test text embeddings ada 002
def get_embeddings_ada_002(text):
    """Embed `text` with the "text-embedding-ada-002" model.

    Sends `text` as the input of one embeddings request through the
    module-level `client` and returns the `embedding` attribute of the first
    item in the response's `data`.
    """
    request_args = {"model": "text-embedding-ada-002", "input": text}
    first_item = client.embeddings.create(**request_args).data[0]
    return first_item.embedding

In [None]:
# Sanity check: print the length of the embedding returned for a sample sentence.
print(len(get_embeddings_ada_002("I eat apples")))

In [None]:
# Embed every relationship type, then every node name, collecting the vectors
# in `embeddings_to_upsert` (this cell only computes them; no upsert happens here).
# The inputs get their own name so the `documents` list that feeds graph
# construction is not overwritten with plain strings when cells are re-run.
texts_to_embed = [*all_rel_types_filtered, *all_nodes_list]
embeddings_to_upsert = [
    get_embeddings_ada_002(term) for term in texts_to_embed if term is not None
]

# Intelligent Graph Queries

In [None]:
# wrapper function that can edit the original query
def query_runner(query, nodes, relationships):
    """Ask the chat model for nine reworded variants of `query`.

    Args:
        query: the original question, as a string.
        nodes: list of node names available in the graph.
        relationships: list of relationship type names available in the graph.

    NOTE: work in progress. The generated prompts are requested but not used
    yet, the retry loop below only advances its counter, and the function
    returns None.
    """
    answer = ""
    iteration_ctr = 0

    # Lists have no .join(); the separator string joins the list instead.
    # A newline before "The original prompt is:" keeps the last relationship
    # from being glued onto that header.
    get_prompts = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. Use synonyms, paraphrases, and other techniques to alter the original prompts. Freely switch out words in the prompt with other nodes and relationships provided.'},
            {'role': 'user', 'content':
                'Generate 9 prompts in a numbered list with no extra newlines. The nodes in this graph are:\n'
                + ', '.join(nodes)
                + '\nThe relationships in this graph are:\n'
                + ', '.join(relationships)
                + '\nThe original prompt is:\n'
                + query
            }
        ]
    )

    # TODO: run the generated prompts here until an answer is found.
    while answer == "" and iteration_ctr < 10:
        iteration_ctr += 1
    

    

# Vanilla Graph Queries

### Original Single Query

In [None]:
# original query for comparison
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

res = cypher_chain.run("Did France have any relationship with China?")
print(res)

### Multiple Generated Queries + Summarization

#### Generate Prompts

In [None]:
# Ask the chat model for nine reworded variants of `original_query`, giving it
# the relationship types and node names found in the graph.
# A newline now separates the relationship list from the node-list header;
# previously the last relationship type ran straight into "This is a list ...".
decide_continue_response_with_context = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. You will be given a list of relationship types. Use synonyms, paraphrases, and other techniques to alter the original prompts.'},
        {'role': 'user', 'content':
            'Generate 9 prompts in a numbered list with no extra newlines, using the relationship types provided. The original prompt is:\n'
            + original_query + '\n'
            + 'The available relationship types in the knowledge graph are:\n' + '\n'.join(all_rel_types_filtered) + '\n'
            + 'This is a list of all nodes in the knowledge graph:\n' + '\n'.join(all_nodes_list)
        }
    ]
)

response = decide_continue_response_with_context.choices[0].message.content
print(response)

In [None]:

# more loose version


import re

# Looser variant of the prompt generation: the model may also swap nodes in the
# query for nearby ones. A newline now separates the relationship list from the
# node-list header so the two sections no longer run together.
decide_continue_response_with_context = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to alter original prompts to make similar prompts which are worded differently. You will be given a list of relationship types. Use synonyms, paraphrases, and other techniques to alter the original prompts.'},
        {'role': 'user', 'content':
            'Generate 9 prompts in a numbered list with no extra newlines, using the relationship types provided. Feel free to be more creative with the original query. For example, you may replace nodes in the query with nearby nodes. The original prompt is:\n'
            + original_query + '\n'
            + 'The available relationship types in the knowledge graph are:\n' + '\n'.join(all_rel_types_filtered) + '\n'
            + 'This is a list of all nodes in the knowledge graph:\n' + '\n'.join(all_nodes_list)
        }
    ]
)

response = decide_continue_response_with_context.choices[0].message.content
print(response)

# make a list of the generated prompts: skip blank lines and strip the leading
# "N." / "N)" numbering whatever its width, instead of cutting a fixed 3 characters
prompts = [
    re.sub(r'^\d+[.)]\s*', '', line.strip())
    for line in response.splitlines()
    if line.strip()
]

for prompt in prompts:
    print(prompt)

#### Run Queries

In [None]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Make sure Cypher generation sees the current graph schema.
graph.refresh_schema()

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    qa_llm=ChatOpenAI(temperature=0, model="gpt-4o"),
    validate_cypher=True, # Validate relationship directions
    verbose=True
)

# Run every generated prompt and keep only the informative answers.
outputs = []
for prompt in prompts:
    try:
        res = cypher_chain.run(prompt)
    except Exception as e:
        # Best effort: a prompt that fails is reported and skipped.
        print(f"An error occurred: {e}")
        continue
    # Compare by value: `is not` against a string literal tests object identity,
    # so the "don't know" answers were never filtered out.
    if res != "I don't know the answer.":
        outputs.append(res)

# merge outputs into string
context = "\n".join(outputs)

# Ask the model to answer the original question from the collected answers only.
query_response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {'role': 'system', 'content': 'You are a highly intelligent agent whose task is to answer questions only based on the context you are given. Your answers must be concise and very short.'},
        {'role': 'user', 'content':
            'The original question is: ' + original_query + '\n'
            + 'The given context is: ' + context + '\n'
            + 'Your task is to answer the original question using only the context you are given. Your answer must be concise and very short.'
        }
    ]
)

print(query_response.choices[0].message.content)


# embed query
# embed all edges 
# cosine similarity
