# Word Document to Graph
- using a Neo4j knowledge graph


### Generate Document Objects
- prepare data for preprocessing

In [1]:
# create function to extract text from a folder of .docx files
import os
from spire.doc import Document

# Function Definition
def _read_docx_text(filepath):
    """Return the raw text Spire.Doc extracts from a single .docx file."""
    # Imported locally under an alias: a later cell rebinds the global name
    # `Document` to langchain's class, which would break this loader whenever
    # it is re-run after that cell.
    from spire.doc import Document as SpireDocument

    document = SpireDocument()
    try:
        document.LoadFromFile(filepath)
        return document.GetText()
    finally:
        # Release the loaded document even if loading or extraction fails.
        document.Close()


def extract_text_from_folder(folder_path):
    """
    Extracts text from all .docx files in a folder and stores them in a list.
    Removes first line of text from each document to remove warning or metadata lines
    from Spire.Doc that is not part of the actual document content.

    Files that do not end in ".docx" are skipped. A file that fails to load is
    reported on stdout and skipped; it does not abort the remaining files.

    Args:
    folder_path (str): The path to the folder containing the .docx files.

    Returns:
    list: A list containing the extracted text from each document.
    """
    documents_text = []
    for filename in os.listdir(folder_path):
        # Guard clause: only .docx files are processed. Without it, other files
        # would re-process whichever path was computed last.
        if not filename.endswith(".docx"):
            continue

        filepath = os.path.join(folder_path, filename)
        try:
            text = _read_docx_text(filepath)
            # Drop the first line; slicing an empty list is safe, so an empty
            # document simply yields "".
            lines = text.splitlines(keepends=True)
            documents_text.append("".join(lines[1:]))
        except Exception as e:
            # Best-effort per file: report and carry on with the next one.
            print(f"Error processing file {filename}: {e}")
    return documents_text

In [2]:
# Call function on "Documents" folder.
# Load .env here as well: this cell runs before the later cell that calls
# load_dotenv(), so a DOC_FILEPATH defined only in .env would otherwise read as
# None. load_dotenv() does not override variables that are already set.
from dotenv import load_dotenv

load_dotenv()
folder_path = os.getenv("DOC_FILEPATH")
if folder_path is None:
    # Fail clearly instead of passing None on to os.listdir / os.path.join.
    raise EnvironmentError("DOC_FILEPATH environment variable is not set")
documents_text = extract_text_from_folder(folder_path)

# Print the extracted text
# print(documents_text[0])

In [3]:
from langchain_core.documents import Document


# Wrap every extracted text in a Document object, collected in document_objects
# in the same order as documents_text.
document_objects = [Document(page_content=raw_text) for raw_text in documents_text]

# print the first document object in the list
# print(document_objects[0].page_content)


### Generate Main Body Text and Parse to Knowledge Graph
- performed once for each document
- invokes the LLM


In [4]:
import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

load_dotenv()

# Fail fast with a readable message when configuration is missing, instead of
# a TypeError from assigning None into os.environ.
required_env_vars = ("OPENAI_API_KEY", "NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
missing_env_vars = [name for name in required_env_vars if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

openai_api_key = os.environ["OPENAI_API_KEY"]
neo4j_uri = os.environ["NEO4J_URI"]
neo4j_username = os.environ["NEO4J_USERNAME"]
neo4j_password = os.environ["NEO4J_PASSWORD"]

# No write-back into os.environ is needed: the values above were just read
# from it, so re-assigning them would change nothing.

graph = Neo4jGraph()

# Alternative: create the LLM with the GPT-3.5 model
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")  ## uncomment this line to use GPT-3.5 model

# Active: create the LLM with the GPT-4o model
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

# Initialise graph transformer
llm_transformer = LLMGraphTransformer(llm=llm)

# Define constants for chunking (passed to the token-based text splitter)
CHUNK_SIZE = 512
CHUNK_OVERLAP = 24

In [5]:
import re

# clean the text, remove \n and \r
def clean_text(text):
    """Return *text* with every newline and carriage return replaced by a space."""
    # Each character is replaced independently, so "\r\n" becomes two spaces.
    without_newlines = text.replace("\n", " ")
    return without_newlines.replace("\r", " ")

def generate_body_text(text, llm):
    """
    Ask the LLM to extract the relevant body text from a raw trip-report dump.

    Args:
    text (str): Raw text extracted from a Word document.
    llm: A chat model object exposing invoke(prompt); the returned object's
        content attribute is used as the answer.

    Returns:
    str: The LLM's answer with newlines and carriage returns replaced by spaces.
    """
    # The backslashes are escaped so the prompt shows the literal sequences
    # \n, \r, \t to the model; unescaped, this f-string would embed real
    # newline, carriage-return and tab characters instead.
    prompt = f"""
        You will be given text extracted from trip reports from Word documents.
        The text will contain the main body content, as well as irrelevant section's random metadata, and whitespace characters like \\n, \\r, \\t. 
        Your task is to extract only the relevant body text from the reports, together with its document title and reporting officer, excluding all other irrelevant information.
        To guide you, the reports are typically:
        - written in paragraph form, with 7 chapters regarding the trip and the city visited
        - includes the document title, city & year visited at the beginning of the report and staffing officer at the end
        - includes names of politicians and initiatives organised by the local government, which are relevant
        - includes innovations and impacts of initiatives, challenges faced, which are relevant
        - main body content is generally written in full sentences and are relevant
        Please output relevant body text bound by each chapter, with the corresponding city, year visited and officer, without any additional formatting or comments.

        Extracted document Text:
        {text}
        """

    # Invoke the LLM model to extract the relevant text
    response = llm.invoke(prompt)

    return clean_text(response.content)

In [16]:
from langchain_text_splitters import TokenTextSplitter

# # Create a function that iterates through the list of document objects and generates the body text for each document, rewriting over the page_content attribute
# # Uncomment if processing multiple documents at once
# def generate_documents(documents, llm):
#     """
#     Iterates through the list of document objects and generates the body text for each document, rewriting over the page_content attribute.

#     Args:
#     documents (list): A list of document objects.
#     llm (ChatOpenAI): A ChatOpenAI object.

#     Returns:
#     list: A list of document objects with the page_content attribute rewritten to contain the generated body text.
#     """
#     for document in documents:
#         document.page_content = generate_body_text(document.page_content, llm)
#     return documents


# Create a process and store document function that combines the above functions and stores the processed documents in the graph
def process_and_store_documents(documents, llm, llm_transformer, graph):
    """
    Run one document through the pipeline and store the result in the graph.

    The body text is generated with the LLM, split into token-based chunks,
    converted to graph documents, and added to the graph.

    Args:
    documents: A single document object (despite the plural name); only its
        page_content attribute is read.
    llm (ChatOpenAI): A ChatOpenAI LLM object.
    llm_transformer (LLMGraphTransformer): An LLMGraphTransformer object.
    graph (Neo4jGraph): A Neo4jGraph object.
    """
    # Step 1: LLM-extracted body text for this document
    cleaned_body = generate_body_text(documents.page_content, llm)

    # Step 2: token-based chunking, each chunk wrapped in its own Document
    splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    chunk_documents = [
        Document(page_content=chunk) for chunk in splitter.split_text(cleaned_body)
    ]

    # Step 3: chunks -> graph documents -> graph
    graph.add_graph_documents(
        llm_transformer.convert_to_graph_documents(chunk_documents),
        baseEntityLabel=True,
        include_source=True,
    )


In [None]:
# Process every entry of document_objects, one document per call.
# Each call generates the body text, splits it into chunks, and stores the
# resulting graph documents in the graph.
for document in document_objects:
    process_and_store_documents(document, llm, llm_transformer, graph)



The knowledge graph looks good and the concept is working well, so the next step is querying.