# Word Document to Graph
- using a Neo4j knowledge graph


### Generate Document Objects
- prepare data for preprocessing

In [1]:
# create function to extract text from a folder of .docx files
import os
from spire.doc import Document

# Function Definition
def extract_text_from_folder(folder_path):
    """
    Extracts text from all .docx files in a folder and stores them in a list.

    The first line of each document's text is removed, because it is a warning
    or metadata line from Spire.Doc that is not part of the actual document
    content. Files that are not .docx are ignored. A file that fails to load is
    reported on stdout and skipped, so one bad file does not abort the run.

    Args:
    folder_path (str): The path to the folder containing the .docx files.

    Returns:
    list: The extracted text of each successfully processed document, in the
        order the files are returned by os.listdir.
    """
    documents_text = []
    for filename in os.listdir(folder_path):
        # Only .docx files are processed. Everything below must stay inside this
        # guard, otherwise other files would be loaded with a stale (or not yet
        # assigned) filepath.
        if not filename.endswith(".docx"):
            continue
        filepath = os.path.join(folder_path, filename)

        try:
            # Bind the Spire class locally: later notebook cells import a
            # different class under the same global name `Document`, which
            # would otherwise break this function when it is re-run.
            from spire.doc import Document as SpireDocument

            document = SpireDocument()
            try:
                document.LoadFromFile(filepath)
                text = document.GetText()
            finally:
                # Release the document's resources even when loading fails.
                document.Close()
        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            continue

        # Drop the first line; slicing an empty list yields an empty string.
        lines = text.splitlines(keepends=True)
        documents_text.append("".join(lines[1:]))
    return documents_text

In [2]:
# Call function on the folder named by the DOC_FILEPATH environment variable
folder_path = os.getenv("DOC_FILEPATH")
if not folder_path:
    # Without this check None would be passed on to os.listdir / os.path.join,
    # which does not produce a helpful error about the missing setting.
    raise RuntimeError(
        "DOC_FILEPATH is not set; point it at the folder containing the .docx files"
    )
documents_text = extract_text_from_folder(folder_path)

In [3]:
# Show the text extracted from the first document in the list
print(documents_text[0])

Chee Kai Ming World City Prize 2020 
Arboria Trip Report – Summary

Background:
Arboria, a name synonymous with urban sustainability, pulsates with life. Nestled amidst a network of sprawling urban forests and cutting-edge eco-buildings, this vibrant city stands as a testament to the power of ecological harmony. From fostering a thriving local food movement to pioneering green building practices, Arboria exemplifies how environmental responsibility can fuel progress and prosperity.
This chapter delves into the heart of Arboria, exploring the key pillars that have transformed the city into a model of sustainable development.
Cultivating a Local Bounty:
Unlike many metropolises reliant on distant farms, Arboria prioritizes local food production. A network of rooftop gardens, vertical farms, and community-supported agriculture (CSA) initiatives ensures residents have access to fresh, healthy, and locally grown produce. This not only reduces transportation emissions but also breathes life 

In [6]:
# NOTE: this import rebinds the notebook-level name `Document`, which the first
# cell imported from spire.doc; from here on `Document` is the langchain class.
from langchain_core.documents import Document


# Wrap every extracted text in a langchain Document, keeping the order of documents_text
document_objects = [Document(page_content=text) for text in documents_text]

# print the first document object in the list
# print(document_objects[0].page_content)


Chee Kai Ming World City Prize 2020 
Ecodora Trip Report – Summary

Background:

Ecodora, a flourishing city dedicated to sustainability, harnesses renewable energy sources, boasts a recycling rate of 75%, and has implemented an efficient public transportation system that has reduced car usage by 40%. Verdant parks and green spaces encompass 25% of the city, providing a tranquil oasis for residents. Ecodora's education system prioritizes environmental stewardship and critical thinking, with 90% of high school graduates pursuing higher education. The city actively engages in global collaborations, sharing innovative ideas and supporting sustainable initiatives worldwide. Residents of Ecodora enjoy an exceptional quality of life, with an impressive 85% satisfaction rating. Ecodora exemplifies how sustainability can create a harmonious and prosperous urban environment.


Key Strategies Adopted:
1. Eco-Transit Revolution:
* Strategy: Ecodora has prioritized public transportation, investing

In [10]:
# Show the list of Document objects built from the extracted texts
print(document_objects)

[Document(page_content='Chee Kai Ming World City Prize 2020 \r\nArboria Trip Report – Summary\r\n\r\nBackground:\r\nArboria, a name synonymous with urban sustainability, pulsates with life. Nestled amidst a network of sprawling urban forests and cutting-edge eco-buildings, this vibrant city stands as a testament to the power of ecological harmony. From fostering a thriving local food movement to pioneering green building practices, Arboria exemplifies how environmental responsibility can fuel progress and prosperity.\r\nThis chapter delves into the heart of Arboria, exploring the key pillars that have transformed the city into a model of sustainable development.\r\nCultivating a Local Bounty:\r\nUnlike many metropolises reliant on distant farms, Arboria prioritizes local food production. A network of rooftop gardens, vertical farms, and community-supported agriculture (CSA) initiatives ensures residents have access to fresh, healthy, and locally grown produce. This not only reduces tra

### Generate Main Body Text
- will be done for each document
- invoking llm


In [7]:
import os
from dotenv import load_dotenv
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# Pull settings from a local .env file (if present) into the process environment
load_dotenv()

# Fail fast with a readable message; without this check a missing variable
# would only surface further down as an opaque error.
_required_env_vars = ("OPENAI_API_KEY", "NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD")
_missing_env_vars = [name for name in _required_env_vars if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# os.getenv reads from os.environ, so the values are already in the process
# environment and do not need to be written back.
openai_api_key = os.getenv("OPENAI_API_KEY")
neo4j_uri = os.getenv("NEO4J_URI")
neo4j_username = os.getenv("NEO4J_USERNAME")
neo4j_password = os.getenv("NEO4J_PASSWORD")

# Called without arguments, so the connection relies on the NEO4J_* environment variables
graph = Neo4jGraph()

# LLM used for graph extraction. To use GPT-3.5 instead, swap in:
# llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")

llm_transformer = LLMGraphTransformer(llm=llm)

In [None]:
import re

# Flatten text onto a single line by blanking out line-break characters
def clean_text(text):
    """
    Replaces every newline and carriage-return character with a space.

    Each character is replaced individually, so a carriage return followed by
    a newline becomes two spaces.

    Args:
    text (str): The text to flatten.

    Returns:
    str: The text with line-break characters replaced by spaces.
    """
    return re.sub(r"[\r\n]", " ", text)

def generate_body_text(text, llm):
    """
    Asks the LLM to extract the relevant body text from a raw trip report.

    Args:
    text (str): The text extracted from one Word document.
    llm: A chat model object exposing invoke(prompt); the returned object's
        content attribute is used as the answer.

    Returns:
    str: The LLM's answer, passed through clean_text.
    """
    # The backslashes are doubled so the prompt shows the literal sequences
    # \n, \r, \t instead of embedding real control characters.
    prompt = f"""
        You will be given text extracted from trip reports from Word documents.
        The text will contain the main body content, as well as irrelevant section's random metadata, and whitespace characters like \\n, \\r, \\t.
        Your task is to extract only the relevant body text from the reports, together with its document title and reporting officer, excluding all other irrelevant information.
        To guide you, the reports are typically:
        - written in paragraph form, with 7 chapters regarding the trip and the city visited
        - includes the document title, city & year visited at the beginning of the report and staffing officer at the end
        - includes names of politicians and initiatives organised by the local government, which are relevant
        - includes impacts of initiatives, challenges faced, which are relevant
        Please output relevant body text bound by each chapter, with the corresponding city, year visited and officer, without any additional formatting or comments.

        Extracted document Text:
        {text}
        """

    # Invoke the LLM model to extract the relevant text
    response = llm.invoke(prompt)

    return clean_text(response.content)

In [None]:
from langchain_text_splitters import TokenTextSplitter

# Run the LLM body-text extraction over every document, updating each one in place
def generate_body_text_for_documents(documents, llm):
    """
    Replaces the page_content of every document with its LLM-generated body text.

    The documents are modified in place; the same list is handed back for convenience.

    Args:
    documents (list): A list of document objects.
    llm (ChatOpenAI): A ChatOpenAI object.

    Returns:
    list: The list that was passed in, with each page_content attribute overwritten by the generated body text.
    """
    for doc in documents:
        body_text = generate_body_text(doc.page_content, llm)
        doc.page_content = body_text

    return documents


# Create a function that splits the page content of a list of document objects using a text splitter
def split_documents(documents, chunk_size=512, chunk_overlap=24):
    """
    Splits the page content of the given documents using TokenTextSplitter.

    Args:
    documents (list): A list of document objects.
    chunk_size (int): Passed to TokenTextSplitter as chunk_size. Defaults to 512.
    chunk_overlap (int): Passed to TokenTextSplitter as chunk_overlap. Defaults to 24.

    Returns:
    list: The chunk documents returned by TokenTextSplitter.split_documents.
    """
    text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)



In [19]:
from langchain_text_splitters import TokenTextSplitter


# Splitter configured with chunk_size=512 and chunk_overlap=24
# (units are presumably tokens, given the TokenTextSplitter class; confirm in the library docs)
text_splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=24)
# The whole list of documents is split in a single call; per-object variant kept for reference:
# test_text = [text_splitter.split_documents(obj) for obj in document_objects]
test_text = text_splitter.split_documents(document_objects)


In [24]:
# Flatten the line breaks of every chunk, keeping the cleaned strings in cleaned_text
cleaned_text = list(map(clean_text, (piece.page_content for piece in test_text)))

# Write each cleaned string back onto its document object
for chunk, flattened in zip(test_text, cleaned_text):
    chunk.page_content = flattened


In [25]:
# Display the chunk documents after their page_content was cleaned
test_text

[Document(page_content="Chee Kai Ming World City Prize 2020   Arboria Trip Report – Summary    Background:  Arboria, a name synonymous with urban sustainability, pulsates with life. Nestled amidst a network of sprawling urban forests and cutting-edge eco-buildings, this vibrant city stands as a testament to the power of ecological harmony. From fostering a thriving local food movement to pioneering green building practices, Arboria exemplifies how environmental responsibility can fuel progress and prosperity.  This chapter delves into the heart of Arboria, exploring the key pillars that have transformed the city into a model of sustainable development.  Cultivating a Local Bounty:  Unlike many metropolises reliant on distant farms, Arboria prioritizes local food production. A network of rooftop gardens, vertical farms, and community-supported agriculture (CSA) initiatives ensures residents have access to fresh, healthy, and locally grown produce. This not only reduces transportation em

In [9]:

# Ad-hoc version of the extraction prompt, run directly against the cleaned chunks.
# The backslashes are doubled so the prompt shows the literal sequences \n, \r, \t
# instead of embedding real control characters.
# NOTE(review): cleaned_text is a list, so it is interpolated as a Python list repr
# (brackets and quotes included) covering the chunks of every document; confirm
# this is the intended input.
prompt = f"""
    You will be given text extracted from trip reports from Word documents.
    The text will contain the main body content, as well as irrelevant section's random metadata, and whitespace characters like \\n, \\r, \\t.
    Your task is to extract only the relevant body text from the reports, together with its document title and reporting officer, excluding all other irrelevant information.
    To guide you, the reports are typically:
    - written in paragraph form, with 7 chapters regarding the trip and the city visited
    - includes the document title, city & year visited at the beginning of the report and staffing officer at the end
    - includes names of politicians and initiatives organised by the local government, which are relevant
    - includes impacts of initiatives, challenges faced, which are relevant
    Please output relevant body text bound by each chapter, with the corresponding city, year visited and officer, without any additional formatting or comments.

    Extracted document Text:
    {cleaned_text}
    """

# Invoke the LLM model to extract the relevant text
response = llm.invoke(prompt)

In [17]:
# Display the raw text returned by the LLM
response.content

'Document Title: Chee Kai Ming World City Prize 2020\nCity Visited: Ecodora\nYear Visited: 2020\n\nBackground:\nEcodora, a flourishing city dedicated to sustainability, harnesses renewable energy sources, boasts a recycling rate of 75%, and has implemented an efficient public transportation system that has reduced car usage by 40%. Verdant parks and green spaces encompass 25% of the city, providing a tranquil oasis for residents. Ecodora\'s education system prioritizes environmental stewardship and critical thinking, with 90% of high school graduates pursuing higher education. The city actively engages in global collaborations, sharing innovative ideas and supporting sustainable initiatives worldwide. Residents of Ecodora enjoy an exceptional quality of life, with an impressive 85% satisfaction rating. Ecodora exemplifies how sustainability can create a harmonious and prosperous urban environment.\n\nKey Strategies Adopted:\n1. Eco-Transit Revolution:\n* Strategy: Ecodora has prioritiz

In [16]:
from langchain_core.documents import Document

# Wrap the LLM output in a single Document so it can be fed to the graph transformer
documents = [Document(page_content=response.content)]
# Extract graph data
graph_documents = llm_transformer.convert_to_graph_documents(documents)
# Store to neo4j
# NOTE(review): the effect of baseEntityLabel / include_source is defined by the
# library's add_graph_documents; presumably a shared node label and a link back
# to the source document. Confirm against the library docs.
graph.add_graph_documents(
  graph_documents, 
  baseEntityLabel=True, 
  include_source=True
)

The test import into the Neo4j graph appears to work; the result is also reflected in the Neo4j workspace.

In [None]:
def generate_body_text(text, llm):
    """
    Asks the LLM to extract the relevant body text from a raw trip report.

    Args:
    text (str): The text extracted from one Word document.
    llm: A chat model object exposing invoke(prompt); the returned object's
        content attribute is used as the answer.

    Returns:
    str: The LLM's answer with every newline and carriage return replaced by a space.
    """
    # The backslashes are doubled so the prompt shows the literal sequences
    # \n, \r, \t instead of embedding real control characters.
    prompt = f"""
        You will be given text extracted from trip reports from Word documents.
        The text will contain the main body content, as well as irrelevant section's random metadata, and whitespace characters like \\n, \\r, \\t.
        Your task is to extract only the relevant body text from the reports, together with its document title and reporting officer, excluding all other irrelevant information.
        To guide you, the reports are typically:
        - written in paragraph form, with 7 chapters regarding the trip and the city visited
        - includes the document title, city & year visited at the beginning of the report and staffing officer at the end
        - includes names of politicians and initiatives organised by the local government, which are relevant
        - includes impacts of initiatives, challenges faced, which are relevant
        Please output relevant body text bound by each chapter, with the corresponding city, year visited and officer, without any additional formatting or comments.

        Extracted document Text:
        {text}
        """

    # Invoke the LLM model to extract the relevant text
    response = llm.invoke(prompt)

    # Replace line breaks with spaces instead of deleting them: deleting fuses
    # the last word of one line with the first word of the next. This matches
    # what the notebook's clean_text helper does.
    return re.sub(r"[\r\n]", " ", response.content)