In [1]:
# summarize and extract graph

In [2]:
import os
from dotenv import load_dotenv

# Run before any os.getenv lookups further down; presumably this loads a local
# .env file (verify against the project setup). The `True` echoed under this
# cell is the call's return value.
load_dotenv()

True

In [16]:
from langchain.graphs import Neo4jGraph
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_runnable,
    create_structured_output_chain
)

from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from tqdm import tqdm
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_core.output_parsers import StrOutputParser


In [5]:
# Setup DB
#
# Credentials are read from environment variables so nothing secret is written
# into the notebook itself.
NEO_DB = os.getenv('NEO_DB')  # read here but not passed to Neo4jGraph in this cell
NEO_USER = os.getenv('NEO_USER')
NEO_PASS = os.getenv('NEO_PASS')

# The Neo4j address used to be hard-coded to one machine on a private network.
# NEO_URL lets another environment point somewhere else; the previous address
# is kept as the default so existing setups behave exactly as before.
url = os.getenv('NEO_URL', "neo4j://192.168.68.84:7687/")

graph = Neo4jGraph(
    url=url,
    username=NEO_USER,
    password=NEO_PASS
)

In [7]:
# Get documents

# Load the Wikipedia results for the query "Mistborn" (the progress log printed
# later in this notebook lists several related articles, not a single page).
raw_documents = WikipediaLoader(query="Mistborn").load()


In [10]:
# Define chunking strategy (token-based splitter, chunk_size=2048, chunk_overlap=24)
text_splitter = TokenTextSplitter(chunk_size=2048, chunk_overlap=24)

# Split every loaded document into chunks; nothing is filtered out here
documents = text_splitter.split_documents(raw_documents)

In [11]:
# Number of chunks produced by the splitter
len(documents)

25

In [40]:
def get_system_template():
    """Return the fixed system prompt used by the chains built in this notebook.

    The text carries no template placeholders; it tells the model to produce
    clean, concise output for a knowledge graph, to keep numeric data as node
    properties, to avoid escaped quotes, and to refer to each entity by its
    most complete name.

    Returns:
        str: the prompt text, including its leading newline and indentation.
    """
    return """
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. Your task is to summarize and clean data as cleanly and consisely as possible. 
    The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    
    ## Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    
    ## Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
    always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
    Remember, the knowledge sentences should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
    """

"""
    ## Examples
    "Do not forget about Momofuku Ando! He created instant noodles in Osaka. At that location, Nissin was founded." should be converted to 
    - "Momofuku Ando created instant noodles in Osaka."
    - "Nissin was founded in Osaka."
    """

from langchain.chains import LLMChain, SimpleSequentialChain

def get_extraction_chain( llm):
    """Build the chain that rewrites a raw text chunk as short, one-per-line sentences.

    The prompt is: the shared system message, one worked example (a human
    message and the AI reply to it), then the real request with an ``{input}``
    placeholder.

    Args:
        llm: the chat model handed to ``LLMChain``.

    Returns:
        A ``SimpleSequentialChain`` wrapping the single summarisation step; the
        notebook reads its result through the ``'output'`` key.
    """
    request_template = """
    Summarize and simplify the given input below in triple backtics: 
    ```{input}```

    Format Instructions: Each sentence should be on a new line. Do not add bullets or numbering.
    """

    # Few-shot example: one request and the reply the model should imitate.
    example_request = HumanMessagePromptTemplate.from_template(
        "Do not forget about Momofuku Ando! He created instant noodles in Osaka. At that location, Nissin was founded."
    )
    example_reply = AIMessagePromptTemplate.from_template(
        "Sample response = Momofuku Ando created instant noodles in Osaka. Nissin was founded in Osaka."
    )

    # Message order matters: system, example pair, then the actual request.
    prompt_messages = [
        SystemMessagePromptTemplate.from_template(get_system_template()),
        example_request,
        example_reply,
        HumanMessagePromptTemplate.from_template(request_template),
    ]
    summarizer = LLMChain(
        llm=llm,
        prompt=ChatPromptTemplate.from_messages(prompt_messages),
    )

    return SimpleSequentialChain(chains=[summarizer])


In [41]:
# Chat model shared by the chains built in this notebook; temperature is pinned to 0.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

# Summarisation chain built on that model.
chain = get_extraction_chain(llm)


In [42]:
# Try the summarisation chain on the first chunk only
resp = chain.invoke(documents[0].page_content)

In [43]:
# Raw text of the first chunk, for comparison with the summary printed next
print(documents[0].page_content)

Mistborn is a series of epic fantasy novels written by American author Brandon Sanderson and published by Tor Books. The first trilogy, published between 2006 and 2008, consists of The Final Empire, The Well of Ascension, and The Hero of Ages. A second series was released between 2011 and 2022, and consists of the quartet The Alloy of Law, Shadows of Self, The Bands of Mourning and The Lost Metal. Sanderson also released a novella in 2016, Mistborn: Secret History. Sanderson plans to write a third and fourth series.
The first Mistborn trilogy chronicles the efforts of a secret group of Allomancers who attempt to overthrow a dystopian empire and establish themselves in a world covered by ash. The neologism Allomancer comes from two Greek words ἄλλος (allos) meaning otherly or different, and  μαντεία (manteía) meaning divination or sorcery, it indicates well the peculiar powers of this sect. The first trilogy was a huge success. This success pushed Sanderson to further develop his fictio

In [44]:
# The chain result is a dict; the generated summary is under 'output'
print(resp['output'])

Mistborn is a series of epic fantasy novels written by Brandon Sanderson and published by Tor Books.
The first trilogy consists of The Final Empire, The Well of Ascension, and The Hero of Ages, published between 2006 and 2008.
A second series, The Alloy of Law, Shadows of Self, The Bands of Mourning, and The Lost Metal, was released between 2011 and 2022.
Sanderson also released a novella in 2016, Mistborn: Secret History.
The first Mistborn trilogy follows a secret group of Allomancers trying to overthrow a dystopian empire.
The second series is set 300 years after the first trilogy and follows Waxillium Ladrian investigating kidnappings and robberies.
The third series will be set in the early computer age with 1980s technology.
The fourth series is planned to be a space-opera.
Sanderson's first idea for Mistborn was to set a story in a world where the "dark lord" triumphed and the "prophesied hero" failed.
His second idea was to tell a heist story in a fantasy setting.
The idea for t

In [52]:
# Expand
import json
def load_data(file):
    """Read the whole of ``file`` as UTF-8 text and return it as one string."""
    with open(file, encoding='utf-8') as source:
        return source.read()

def save_data(file, data):
    """Write the string ``data`` to ``file`` as UTF-8, replacing any existing content."""
    with open(file, mode="w", encoding="utf-8") as sink:
        sink.write(data)
        
def load_data_json(file):
    """Parse the UTF-8 JSON document stored in ``file`` and return the decoded value."""
    with open(file, encoding='utf-8') as source:
        return json.load(source)

def save_data_json(file, data):
    """Serialise ``data`` as JSON with a 4-space indent into ``file``, overwriting it."""
    with open(file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=4)



In [66]:
def doc_todict(doc: Document, context='')-> dict:
    """Turn a document into a plain dict with 'page_content' and 'metadata' keys.

    Args:
        doc: object exposing ``page_content`` and a ``metadata`` dict.
        context: optional label; when it is not None and not blank it is
            written into ``doc.metadata['context']`` before the dict is built.

    Returns:
        dict: ``{'page_content': ..., 'metadata': ...}``. The metadata value is
        the document's own dict (not a copy), so the context label is visible
        on ``doc`` as well.
    """
    has_context = not (context is None or context.strip() == '')
    if has_context:
        doc.metadata['context'] = context

    return {
        'page_content': doc.page_content,
        'metadata': doc.metadata,
    }

def dict_todoc(data: dict)-> Document:
    """Rebuild a ``Document`` from a dict such as the ones ``doc_todict`` produces.

    Args:
        data: mapping with a ``'page_content'`` entry and the metadata under
            ``'metadata'`` (the key ``doc_todict`` writes). The older ``'meta'``
            key is still accepted. Top-level ``'summary'`` / ``'cleaned'``
            entries, when present, are folded into the metadata.

    Returns:
        Document: the reconstructed document. Its metadata is a fresh dict, so
        the input mapping is left unmodified.

    Raises:
        KeyError: if ``data`` has no ``'page_content'`` entry.
    """
    # Previously this read data['meta'], which doc_todict never writes, and the
    # built Document was dropped instead of returned.
    if 'metadata' in data:
        source_meta = data['metadata']
    else:
        source_meta = data.get('meta')
    metadata = dict(source_meta or {})

    # Promote top-level derived fields into the metadata.
    for item in ('summary', 'cleaned'):
        if item in data:
            metadata[item] = data[item]

    return Document(page_content=data['page_content'], metadata=metadata)

In [67]:
all_docs = []
for doc in documents:
    # Later cells read metadata['cleaned'], so the summary has to be produced
    # here. The guard makes the cell safe to re-run: a chunk that already
    # carries a summary is not sent to the model again.
    if 'cleaned' not in doc.metadata:
        cleaned = chain.invoke(doc.page_content)
        doc.metadata['cleaned'] = cleaned['output']

    # doc_todict returns the document's own metadata dict, so the summary
    # stored above is part of the record that gets saved.
    d = doc_todict(doc,context='Mistborn')
    all_docs.append(d)
    print(f"Proccessed file: {doc.metadata['source']}")


Proccessed file: https://en.wikipedia.org/wiki/Mistborn
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Final_Empire
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Well_of_Ascension
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Hero_of_Ages
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Alloy_of_Law
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_Secret_History
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Bands_of_Mourning
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_Shadows_of_Self
Proccessed file: https://en.wikipedia.org/wiki/Mistborn:_The_Lost_Metal
Proccessed file: https://en.wikipedia.org/wiki/Mistborn_Adventure_Game
Proccessed file: https://en.wikipedia.org/wiki/Brandon_Sanderson
Proccessed file: https://en.wikipedia.org/wiki/Michael_Kramer_(narrator)
Proccessed file: https://en.wikipedia.org/wiki/Arcanum_Unbounded:_The_Cosmere_Collection
Proccessed file: https://en.wikipedia.org/wiki/Br

In [68]:
# Write to file
file = 'mistborn_process.json'
save_data_json(data=all_docs, file=file)

In [69]:
# First chunk's dict, used as the sample in the cells that follow
d0 = all_docs[0]

In [71]:
# Show the summary stored under metadata['cleaned'] for the sample chunk
print(d0['metadata']['cleaned'])

Mistborn is a series of epic fantasy novels written by Brandon Sanderson and published by Tor Books.
The first trilogy consists of The Final Empire, The Well of Ascension, and The Hero of Ages, published between 2006 and 2008.
A second series, The Alloy of Law, Shadows of Self, The Bands of Mourning, and The Lost Metal, was released between 2011 and 2022.
Sanderson also released a novella in 2016, Mistborn: Secret History.
The first Mistborn trilogy follows a secret group of Allomancers trying to overthrow a dystopian empire.
The second series is set 300 years after the first trilogy and follows Waxillium Ladrian investigating kidnappings and robberies.
The third series will be set in the early computer age with 1980s technology.
The fourth series is planned to be a space-opera.
Sanderson's first idea for Mistborn was to set a story in a world where the "dark lord" triumphed and the "prophesied hero" failed.
His second idea was to tell a heist story in a fantasy setting.
The idea for t

In [73]:
# Triplets

# !pip install textacy

import spacy
import textacy

In [74]:
# Join the summary's lines into a single line of text
text = d0['metadata']['cleaned'].replace("\n", " ")
# Load the `en_core_web_sm` spaCy pipeline by name
nlp = spacy.load('en_core_web_sm')
# Run the pipeline over the text; the result feeds the triple extraction below
sdoc = nlp(text)

[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [77]:
# Materialise textacy's subject-verb-object triples so they can be reused,
# then print one per line for inspection.
triples = list(textacy.extract.subject_verb_object_triples(sdoc))
for t in triples:
    print(f" - {t}")

 - SVOTriple(subject=[Sanderson], verb=[released], object=[novella])
 - SVOTriple(subject=[Mistborn, trilogy], verb=[follows], object=[group])
 - SVOTriple(subject=[series], verb=[is, set], object=[Waxillium, Ladrian])
 - SVOTriple(subject=[series], verb=[follows], object=[Waxillium, Ladrian])
 - SVOTriple(subject=[series], verb=[is, planned], object=[to, be, a, space, -, opera])
 - SVOTriple(subject=[Sanderson], verb=[developed], object=[feruchemy, allomancy])
 - SVOTriple(subject=[Sanderson], verb=[began], object=[work])
 - SVOTriple(subject=[Sanderson], verb=[planned], object=[to, publish, multiple, trilogies, set, on, the, fictional, planet, Scadrial, but, in, different, eras])
 - SVOTriple(subject=[series], verb=[takes], object=[place])


In [96]:
def get_triplet_chain(llm, context="Mistborn book series by Brandon Sanderson", verbose=True):
    """Build the chain that splits summarised sentences into pipe-separated triplets.

    The prompt is: the shared system message, one worked example (a human
    message and the AI reply to it), then the real request with an ``{input}``
    placeholder and the document context.

    Args:
        llm: the chat model handed to ``LLMChain``.
        context: short description of what the input text is about; it is
            written into the prompt to help resolve vague references. The
            default keeps the previously hard-coded Mistborn context.
        verbose: passed to ``LLMChain``; the default (True) keeps the previous
            behaviour of logging the formatted prompt.

    Returns:
        A ``SimpleSequentialChain`` wrapping the single extraction step; the
        notebook reads its result through the ``'output'`` key.
    """
    # The context is spliced into a template string, so literal braces in it
    # must be doubled or they would be parsed as template variables.
    safe_context = context.replace("{", "{{").replace("}", "}}")

    human_template = (
        """
    For the given input and context below in triple backticks, split each sentence into triplets.
    
    
    ```{input}```

    Context:
    **"""
        + safe_context
        + """**

    Format Instructions: 
    Each triplet should be on a new line. Each part of teh triplet should be separated by a pipe `|`.
    The parts of the triplet includes the Subject of the sentence like a name, the verb or action with at most 2 words, and finally the object being targetted.
    For context all lines are from a single document and can be referenced if needed. 
    Perform coreference resolution where possible e.g. With the context of "Lord of The Rings", the entity "The trilogy" can be phrased as "Lord of The Rings trilogy".
    Use full entity names where possible e.g. "Brandon Sanderson" instead of "Sanderson".
    
    """
    )

    # Shot Prompt
    sample_input_1="Mistborn is a series of epic fantasy novels written by Brandon Sanderson and published by Tor Books. The first trilogy consists of The Final Empire, The Well of Ascension, and The Hero of Ages, published between 2006 and 2008."
    # The example answer must itself be well-formed: exactly three
    # pipe-separated parts per line, and every book from the input listed once.
    # The earlier version repeated "The Hero of Ages", left out "The Well of
    # Ascension" and contained a line that was not a triplet.
    sample_output_1 = (
        "Sample triplets = Mistborn Series | written by | Brandon Sanderson \n"
        " Mistborn Series | published by | Tor Books \n"
        " Mistborn Series | has book | The Final Empire \n"
        " Mistborn Series | has book | The Well of Ascension \n"
        " Mistborn Series | has book | The Hero of Ages \n"
        " Mistborn first trilogy | published between | 2006 and 2008"
    )
    sample_human_prompt_1 = HumanMessagePromptTemplate.from_template(sample_input_1)
    sample_ai_output_1 = AIMessagePromptTemplate.from_template(sample_output_1)

    # Message order matters: system, example pair, then the actual request.
    system_prompt = SystemMessagePromptTemplate.from_template(get_system_template())
    human_prompt = HumanMessagePromptTemplate.from_template(human_template)
    
    chat_prompt = ChatPromptTemplate.from_messages([system_prompt, sample_human_prompt_1, sample_ai_output_1, human_prompt])
    chain = LLMChain(llm=llm, prompt=chat_prompt, verbose=verbose)
    
    return SimpleSequentialChain(chains=[chain])

In [97]:
# Triplet-extraction chain on the same chat model as the summariser
tchain = get_triplet_chain(llm)

In [98]:
# Run triplet extraction on the single-line summary text of the sample chunk
ts = tchain.invoke(text)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: 
    You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph. Your task is to summarize and clean data as cleanly and consisely as possible. 
    The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
    
    ## Handling Numerical Data and Dates
    - Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
    - **Quotation Marks**: Never use escaped single or double quotes within property values.
    
    ## Coreference Resolution
    - **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
    If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
    always use the most complete ide

In [99]:
# The chain result is a dict; the generated triplets are under 'output'
print(ts['output'])

Mistborn | is a series of | epic fantasy novels
Mistborn | written by | Brandon Sanderson
Mistborn | published by | Tor Books
The first trilogy | consists of | The Final Empire, The Well of Ascension, and The Hero of Ages
The first trilogy | published between | 2006 and 2008
A second series | released between | 2011 and 2022
A second series | consists of | The Alloy of Law, Shadows of Self, The Bands of Mourning, and The Lost Metal
Brandon Sanderson | released | a novella in 2016
The novella | titled | Mistborn: Secret History
The first Mistborn trilogy | follows | a secret group of Allomancers trying to overthrow a dystopian empire
The second series | set | 300 years after the first trilogy
The second series | follows | Waxillium Ladrian investigating kidnappings and robberies
The third series | will be set | in the early computer age with 1980s technology
The fourth series | is planned to be | a space-opera
Brandon Sanderson | had the idea for | Mistborn as a world where the "dark lo