# Connection

In [None]:
!pip install langchain
!pip install -U langchain-community
!pip install langchain-openai
!pip install langchain-openai tiktoken

Collecting langchain
  Downloading langchain-0.2.5-py3-none-any.whl.metadata (7.0 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.30-cp312-cp312-macosx_10_9_x86_64.whl.metadata (9.6 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl.metadata (2.2 kB)
Collecting greenlet!=0.4.17 (from SQLAlchemy<3,>=1.4->langchain)
  Downloading greenlet-3.0.3-cp312-cp312-macosx_11_0_universal2.whl.metadata (3.8 kB)
Downloading langchain-0.2.5-py3-none-any.whl (974 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Downloading SQLAlchemy-2.0.30-cp312-cp312-macosx_10_9_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading greenlet-3.0.3-cp312-cp312-m

In [None]:
!pip install neo4j


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from langchain.graphs import Neo4jGraph

import os
from getpass import getpass

# Connection settings come from the environment so that no credential is stored
# in the notebook. The URL and username fall back to the local defaults; the
# password is prompted for once when NEO4J_PASSWORD is not set.
if not os.environ.get("NEO4J_PASSWORD"):
    os.environ["NEO4J_PASSWORD"] = getpass("Neo4j password: ")

graph = Neo4jGraph(
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ["NEO4J_PASSWORD"],
)

In [None]:
!pip install --upgrade --quiet  langchain-openai

In [None]:
from langchain_openai import ChatOpenAI

In [None]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

In [None]:
class Property(BaseModel):
    """A single property consisting of key and value"""

    # Both fields are required strings. The description texts are field
    # metadata and are kept exactly as they were.
    key: str = Field(default=..., description="key")
    value: str = Field(default=..., description="value")

In [None]:
class Node(BaseNode):
    # Optional key/value pairs attached to a node; None when the node has no
    # properties. The description names node properties (it previously said
    # "relationship", copied from the Relationship model).
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties"
    )

In [None]:
class Relationship(BaseRelationship):
    # Optional key/value pairs attached to a relationship; the field defaults
    # to None when no properties are supplied.
    properties: Optional[List[Property]] = Field(
        default=None,
        description="List of relationship properties",
    )

In [None]:
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""

    # Required list of Node entries.
    nodes: List[Node] = Field(
        default=...,
        description="List of nodes in the knowledge graph",
    )
    # Required list of Relationship entries.
    rels: List[Relationship] = Field(
        default=...,
        description="List of relationships in the knowledge graph",
    )

In [None]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase property key.

    The first word is lower-cased in full and every following word is
    capitalized (first letter upper, remainder lower) before the words are
    joined without separators. A string that contains no words (empty or
    whitespace only) is returned unchanged.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    head, *tail = tokens
    return head.lower() + "".join(part.capitalize() for part in tail)

In [None]:
def props_to_dict(props) -> dict:
    """Flatten a sequence of key/value property objects into a plain dict.

    Each item's ``key`` is normalized with ``format_property_key`` and mapped
    to the item's ``value``; when two items normalize to the same key the
    later one wins. ``None`` or an empty sequence yields an empty dict.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

In [None]:
def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted ``Node`` into a ``BaseNode``.

    The node id is title-cased and used both as the ``BaseNode`` id and as a
    ``name`` property, which overrides any ``name`` among the extracted
    properties. The node type is capitalized.
    """
    if node.properties:
        attrs = props_to_dict(node.properties)
    else:
        attrs = {}
    display_name = node.id.title()
    # A name property makes generated Cypher statements easier to work with.
    attrs["name"] = display_name
    return BaseNode(
        id=display_name,
        type=node.type.capitalize(),
        properties=attrs,
    )

In [None]:
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted ``Relationship`` into a ``BaseRelationship``.

    Both endpoints are converted with ``map_to_base_node``. The relationship
    type is passed through unchanged, and the property list is flattened into
    a dict (empty when there are no properties).
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    if rel.properties:
        attrs = props_to_dict(rel.properties)
    else:
        attrs = {}
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=attrs,
    )

In [None]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Never store the API key in the notebook: use OPENAI_API_KEY from the
# environment and prompt for it only when it is missing.
# NOTE(review): a key that was ever saved in a notebook should be treated as
# leaked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the chain that extracts a ``KnowledgeGraph`` from free text.

    The system message is an f-string, so the optional allow-lists are
    rendered into it when this function is called. The first human message
    carries an ``{input}`` placeholder for the text to analyse.

    Args:
        allowed_nodes: Node labels listed in the prompt as allowed. When
            ``None`` or empty, the corresponding prompt line is left blank.
        allowed_rels: Relationship types listed in the prompt as allowed.
            When ``None`` or empty, the corresponding prompt line is left
            blank.

    Returns:
        The result of ``create_structured_output_chain`` called with the
        ``KnowledgeGraph`` schema, the module-level ``llm`` and this prompt.
    """
    # NOTE(review): the prompt title mentions GPT-4 while `llm` is configured
    # with another model name; the text is sent as-is.
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    # The chain is created non-verbose and bound to the KnowledgeGraph schema.
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

Besides the general instructions, I have also added the option to limit which node or relationship types should be extracted from text. You'll see through examples why this might come in handy. We have the Neo4j connection and LLM prompt ready, which means we can define the information extraction pipeline as a single function.

In [None]:
def extract_and_store_graph(
    document: Document,
    nodes: Optional[List[str]] = None,
    rels: Optional[List[str]] = None) -> None:
    """Extract a knowledge graph from one document and write it to ``graph``.

    Args:
        document: Document whose ``page_content`` is sent to the extraction
            chain; it is also recorded as the source of the graph document.
        nodes: Optional allowed node labels, forwarded to
            ``get_extraction_chain``.
        rels: Optional allowed relationship types, forwarded to
            ``get_extraction_chain``.
    """
    chain = get_extraction_chain(nodes, rels)
    # The structured result is returned under the 'function' key.
    response = chain.invoke(document.page_content)
    knowledge_graph = response['function']

    # Convert the extracted models to the base graph types.
    base_nodes = [map_to_base_node(n) for n in knowledge_graph.nodes]
    base_rels = [map_to_base_relationship(r) for r in knowledge_graph.rels]

    # Persist everything through the module-level Neo4j connection.
    graph.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

# Data Preprocessing

In [None]:
from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings

In [None]:
from langchain.text_splitter import CharacterTextSplitter


In [None]:
from langchain.text_splitter import TokenTextSplitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from langchain_community.vectorstores import Neo4jVector

In [None]:
import PyPDF2

In [None]:
def pdf_to_text(pdf_path, output_txt):
    """Extract the text of every page of a PDF and save it as a UTF-8 file.

    Page texts are concatenated in page order with no separator between
    pages.

    Args:
        pdf_path: Path of the PDF file to read.
        output_txt: Path of the text file to write (opened in 'w' mode, so an
            existing file is overwritten).
    """
    with open(pdf_path, 'rb') as source:
        reader = PyPDF2.PdfReader(source)
        # Pull the text out of each page before the PDF handle is closed.
        extracted = ''.join(page.extract_text() for page in reader.pages)

    with open(output_txt, 'w', encoding='utf-8') as sink:
        sink.write(extracted)


In [None]:
# Convert the source PDF to a plain-text file; the recorded output below shows
# that this guarded branch runs when the cell is executed.
if __name__ == "__main__":
    # Input PDF to read.
    pdf_path = "/Users/jaesoon/Desktop/Data/OneDegreeWar.pdf"

    # Destination text file for the extracted text.
    output_txt = '/Users/jaesoon/Desktop/Data/OneDegreeWar.txt'

    pdf_to_text(pdf_path, output_txt)

    print("PDF converted to text successfully!")

PDF converted to text successfully!


In [None]:
#from PyPDF2 import PdfReader
#reader = PdfReader("/Users/jaesoon/Desktop/Data/OneDegreeWar.pdf")
#number_of_pages = len(reader.pages)
#page = reader.pages[0]
#text = page.extract_text()

In [None]:
# pdf_to_text writes this file as UTF-8, so read it back with the same encoding
# instead of relying on the platform default.
loader = TextLoader("/Users/jaesoon/Desktop/Data/OneDegreeWar.txt", encoding="utf-8")

In [None]:
documents = loader.load()
# Split into chunks of at most 90 tokens with no overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=90, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Preview the first chunks from the list already computed instead of splitting
# the documents a second time.
docs[:10]

[Document(page_content='The one degree war plan\nJorgen Randers\nNorwegian School of Management BI, Oslo, Norway, and\nPaul Gilding\nProgramme for Sustainability Leadership, University of Cambridge,\nSydney, Australia\nAbstract\nPurpose – The purpose of this paper is to present the idea of a global crisis plan that will be\ndemanded when global society ﬁnally decides that the climate challenge is a', metadata={'source': '/Users/jaesoon/Desktop/Data/OneDegreeWar.txt'}),
 Document(page_content=' real threat, requiring\nimmediate and strong policy action at the super-national level. The authors believe that this demand\nwill arise before 2020, and the authors hope that this paper will encourage others to improve on the\nplan.\nDesign/methodology/approach – The paper seeks to achieve the purpose by presenting the ﬁrst\ndraft of such a plan – “The one degree war plan” – in rather', metadata={'source': '/Users/jaesoon/Desktop/Data/OneDegreeWar.txt'}),
 Document(page_content=' concrete terms,

In [None]:
from tqdm import tqdm

# The loop iterates over the chunks in `docs`, so the progress total must be
# len(docs) (not len(documents)); the unused enumerate index is dropped.
for chunk in tqdm(docs, total=len(docs)):
    extract_and_store_graph(chunk)

  warn_deprecated(
150it [21:36,  8.65s/it]


In [None]:
# Embedding client. NOTE(review): the vector-store cell further down builds its
# own OpenAIEmbeddings() rather than reusing this variable — confirm whether
# this one is still needed.
embeddings = OpenAIEmbeddings()

In [None]:
import sys
import os

# Appends the absolute path of "openai-chat.py" to the module search path.
# NOTE(review): this adds the path of a file, not of its containing directory,
# and nothing in the visible cells imports from it — confirm the intent.
py_file_location = "openai-chat.py"
sys.path.append(os.path.abspath(py_file_location))

In [None]:
# Never store the API key in the notebook: use OPENAI_API_KEY from the
# environment and prompt for it only when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import os
from getpass import getpass

# Neo4j connection settings come from the environment so that no credential is
# stored in the notebook. The URL and username fall back to the local defaults;
# the password is prompted for once when NEO4J_PASSWORD is not set.
if not os.environ.get("NEO4J_PASSWORD"):
    os.environ["NEO4J_PASSWORD"] = getpass("Neo4j password: ")

# Embed every chunk in `docs` and store the vectors in Neo4j.
db = Neo4jVector.from_documents(
    docs,
    OpenAIEmbeddings(),
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ["NEO4J_PASSWORD"],
)

In [None]:
# Ask the vector store for the single closest chunk (k=1) and print its text.
query = "Tell me about the global climate change"
results = db.similarity_search(query, k=1)
# Indexing [0] assumes the search returned at least one result.
print(results[0].page_content)

lobal clim
