In [43]:
%pip install --upgrade --quiet langchain langchain-community langchain-openai langchain-experimental neo4j tiktoken python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [49]:
!pip install --upgrade pymupdf



In [50]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Tuple, List
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import Neo4jGraph
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, OnlinePDFLoader, PyPDFium2Loader, PDFMinerLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import ChatOpenAI
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_core.runnables import ConfigurableField, RunnableParallel, RunnablePassthrough

In [122]:
import os
from dotenv import load_dotenv

# Load connection settings and API keys from a local .env file, if one exists.
load_dotenv()

# Non-secret connection defaults. setdefault() keeps any value that was already
# provided by the shell or by .env instead of silently overwriting it.
os.environ.setdefault("NEO4J_URI", "neo4j://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# Secrets must be supplied through the environment (.env or shell), never
# written into the notebook where they end up in version control and exports.
_missing_secrets = [
    name for name in ("NEO4J_PASSWORD", "OPENAI_API_KEY") if not os.environ.get(name)
]
if _missing_secrets:
    # KeyError matches what the previous bare `os.environ[...]` lookup raised.
    raise KeyError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_secrets)
        + ". Define them in your shell or in the .env file."
    )

# Neo4jGraph() is called without arguments, so it relies on the NEO4J_*
# environment variables configured above.
graph = Neo4jGraph()

# Data Ingestion

In [72]:
# Source PDF, resolved relative to the notebook's working directory.
PDF_PATH = "wyatt.pdf"

# NOTE: despite its name, `raw_documents` is the loader object; the pages are
# only read when `.load()` is called on it.
raw_documents = PyMuPDFLoader(PDF_PATH)

In [73]:
# Read the PDF through the loader. The result is indexable (see `data[3]` in the
# next cell) and is what the text splitter consumes later on.
data = raw_documents.load()

In [75]:
# Spot-check a single loaded Document (index 3) to inspect its metadata and text.
data[3]

Document(metadata={'source': 'wyatt.pdf', 'file_path': 'wyatt.pdf', 'page': 3, 'total_pages': 174, 'format': 'PDF 1.3', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'PyFPDF 1.7.2 http://pyfpdf.googlecode.com/', 'creationDate': 'D:20240605184108', 'modDate': '', 'trapped': ''}, page_content='Your information will remain completely confidential. I will not share your personal info, including\nfinancial information, with any third party. This includes all information that you share with me over\ntext. For more info, please see the Wyatt Terms & Conditions: http://getfafsahelp.org/terms\n')

In [107]:
# Chunking strategy: cut the loaded PDF documents into small, slightly
# overlapping pieces before handing them to the graph extractor.
CHUNK_SIZE = 180     # passed as TokenTextSplitter(chunk_size=...)
CHUNK_OVERLAP = 10   # passed as TokenTextSplitter(chunk_overlap=...)

text_splitter = TokenTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
documents = text_splitter.split_documents(data)

In [108]:
# Chat model shared by the graph extractor, the entity extractor and the final
# answering chain. (Original author's note: gpt-4-0125-preview occasionally has issues.)
llm = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)
llm_transformer = LLMGraphTransformer(llm=llm)

# Turn every text chunk into a graph document, then write them all to Neo4j.
graph_documents = llm_transformer.convert_to_graph_documents(documents)
graph.add_graph_documents(
    graph_documents,
    include_source=True,    # presumably keeps the source chunks as nodes -- confirm against the library docs
    baseEntityLabel=True,   # presumably adds the shared entity label the full-text index relies on -- confirm
)

In [111]:
# Vector store built on top of nodes that already exist in the graph:
# it reads the `text` property of `Document` nodes and keeps the vectors in
# their `embedding` property. Search mode is "hybrid".
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding",
    search_type="hybrid",
)

In [112]:
# Retriever

# Full-text index named `entity` over the `id` property of `__Entity__` nodes.
# `IF NOT EXISTS` makes the statement safe to re-run.
graph.query(
    "CREATE FULLTEXT INDEX entity IF NOT EXISTS "
    "FOR (e:__Entity__) ON EACH [e.id]"
)

# Extract entities from text
# Output schema for the entity-extraction step: the model is asked (through
# `llm.with_structured_output(Entities)`) to return an object with one field,
# `names`, holding the entity strings it found.
# NOTE(review): the class docstring and the Field description are presumably
# forwarded to the model as part of the schema -- treat them as prompt text,
# not as ordinary documentation, when editing.
class Entities(BaseModel):
    """Identifying information about entities."""

    # Required field (`...` means no default): list of extracted entity names.
    names: List[str] = Field(
        ...,
        description="All the person, organization, or business entities that "
        "appear in the text",
    )

# Prompt for the entity extractor: a fixed system instruction plus the user's
# text injected through the {question} variable.
# Note: the name `prompt` is reassigned later for the answering prompt;
# `entity_chain` keeps the object bound here.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are extracting organization and person entities from the text."),
        (
            "human",
            "Use the given format to extract information from the following input: {question}",
        ),
    ]
)

# Pipe the prompt into the chat model constrained to return an `Entities` object.
entity_chain = prompt | llm.with_structured_output(Entities)

In [113]:
# Smoke test: show the entity names extracted from a sample question.
entity_chain.invoke({"question": "what is fafsa?"}).names

['fafsa']

In [114]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is stripped of Lucene special characters and split on
    whitespace. Each remaining word gets a fuzzy-match suffix (``~2``, i.e.
    up to two changed characters) and the words are combined with ``AND``.
    Useful for mapping entities from user questions to database values while
    tolerating some misspellings.

    Args:
        input: Free text, typically an entity name extracted from a question.

    Returns:
        The query string, e.g. ``"free~2 AND application~2"``. An empty string
        is returned when the input contains no searchable words (empty,
        whitespace-only, or made only of characters removed by
        ``remove_lucene_chars``); callers should not send that to the index.
    """
    # str.split() with no arguments never yields empty strings, so no filtering is needed.
    words = remove_lucene_chars(input).split()
    # join() handles zero, one or many words uniformly; the previous manual
    # build indexed words[-1] and raised IndexError when there were no words.
    full_text_query = " AND ".join(f"{word}~2" for word in words)
    print(f'generate full text query: {full_text_query}')
    return full_text_query

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collect the graph neighborhood of the entities mentioned in the question.

    Entities are extracted from the question with ``entity_chain``. Each one is
    looked up in the ``entity`` full-text index (top 2 matching nodes), and
    every relationship of those nodes other than ``MENTIONS`` is rendered as
    ``"<source id> - <REL_TYPE> -> <target id>"`` (at most 50 rows per entity).

    Args:
        question: The user's (standalone) question.

    Returns:
        One relationship per line, for all entities, joined by newlines.
        Empty string when no entity or no relationship was found.
    """
    lines = []
    entities = entity_chain.invoke({"question": question})
    for entity in entities.names:
        # An entity with no searchable words (empty, or only Lucene special
        # characters) cannot form a valid full-text query, so skip it.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        # Accumulate rows in a list and join once: appending each entity's
        # joined block to a string fused the last row of one entity with the
        # first row of the next.
        lines.extend(el['output'] for el in response)
        result = "\n".join(lines)
        print(f"structured_retriever result: {result}")
    return "\n".join(lines)

In [115]:
# Show the graph relationships gathered for the entities in a sample question.
print(structured_retriever("What is fafsa?"))

generate full text query: fafsa~2
structured_retriever result: Fafsa - REQUIRES -> Fsa Id
Fafsa - REQUIRES -> Consent
Fafsa - REQUIRES -> Colleges
Fafsa - REQUIRES -> Currency Conversion
Fafsa - REQUIRES -> Section
Fafsa - REQUIRES -> Contributor
Fafsa - REQUIRES -> Parental_Info
Fafsa - REQUIRES -> Homeless_Youth_Determination
Fafsa - REQUIRES -> Bank Account Info
Fafsa - REQUIRES -> Parent Info
Fafsa - REQUIRES -> 2022 Tax Returns
Fafsa - CONTAINS -> Financial_Questions
Fafsa - QUALIFIES_FOR -> Federal Financial Aid
Fafsa - USES -> Assets
Fafsa - USES -> Income
Fafsa - LINKS_TO -> Video
Fafsa - GENERATES -> Fafsa Submission Summary
Fafsa - GENERATES -> Confirmation Email
Fafsa - DETERMINES -> State Aid
Fafsa - ASSESSES -> Federal Aid
Fafsa - HELPS_PAY_FOR -> College
Fafsa - HELPS_PAY_FOR -> University
Fafsa - HELPS_GET -> Financial Aid
Fafsa - DEPENDENT_ON -> State Financial Aid
Fafsa - APPLICATION_PROCESS -> Colleges
Fafsa - SUBMITS -> Deadline
Fafsa - HAS -> Mailing Address
Fafsa -

In [116]:
def retriever(question: str) -> str:
    """
    Build the context string for the answering prompt.

    Combines the graph relationships returned by ``structured_retriever`` with
    the text of the documents returned by ``vector_index.similarity_search``.

    Args:
        question: The (standalone) search query.

    Returns:
        A string with a "Structured data:" section followed by an
        "Unstructured data:" section in which every document is on its own
        line and prefixed with "#Document ".
    """
    print(f"Search query: {question}")
    structured_data = structured_retriever(question)
    unstructured_data = [el.page_content for el in vector_index.similarity_search(question)]
    # Prefix every document (including the first) and separate them with
    # newlines; using "#Document " as a join separator left the first document
    # unmarked and glued the marker to the end of the previous document.
    unstructured_block = "\n".join(f"#Document {text}" for text in unstructured_data)
    return (
        "Structured data:\n"
        f"{structured_data}\n"
        "Unstructured data:\n"
        f"{unstructured_block}\n"
    )

In [117]:
# Prompt used when a chat history is present: it asks the model to fold the
# history and the follow-up input into one standalone question, in the
# original language. Template variables: {chat_history} and {question}.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    buffer = []
    for human, ai in chat_history:
        buffer.append(HumanMessage(content=human))
        buffer.append(AIMessage(content=ai))
    return buffer

# Produces the search query string from the chain input (a dict with a
# "question" key and an optional "chat_history" key).
_search_query = RunnableBranch(
    # Branch 1 -- (condition, runnable): taken when the input carries a non-empty chat_history
    (
        # Condition: truthy chat_history (a missing key or an empty list falls through to the default)
        RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
            run_name="HasChatHistoryCheck"
        ),
        # Runnable: convert the history pairs to messages, then condense history + follow-up
        # into a standalone question string.
        RunnablePassthrough.assign(
            chat_history=lambda x: _format_chat_history(x["chat_history"])
        )
        | CONDENSE_QUESTION_PROMPT
        # NOTE(review): no model is named here, so this step uses the library default
        # rather than the gpt-4o-mini model configured for `llm` -- confirm that is intended.
        | ChatOpenAI(temperature=0)
        | StrOutputParser(),
    ),
    # Default branch: no chat history, so the question is used unchanged
    RunnableLambda(lambda x : x["question"]),
)

In [118]:
# Final answering prompt: the model must answer from the retrieved context only.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# End-to-end chain. Input: a dict with "question" and, optionally, "chat_history".
chain = (
    RunnableParallel(
        {
            # Build the search query (condensing chat history if present), then retrieve context.
            "context": _search_query | retriever,
            # Pass only the question text to the prompt. RunnablePassthrough()
            # forwarded the whole input dict, so the prompt rendered
            # "Question: {'question': ...}" (plus any chat_history) instead of the question.
            "question": RunnableLambda(lambda x: x["question"]),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [119]:
# Single-turn question: no chat_history key, so the question itself is used as the search query.
chain.invoke({"question": "What is fafsa?"})

Search query: What is fafsa?
generate full text query: fafsa~2
structured_retriever result: Fafsa - REQUIRES -> Fsa Id
Fafsa - REQUIRES -> Consent
Fafsa - REQUIRES -> Colleges
Fafsa - REQUIRES -> Currency Conversion
Fafsa - REQUIRES -> Section
Fafsa - REQUIRES -> Contributor
Fafsa - REQUIRES -> Parental_Info
Fafsa - REQUIRES -> Homeless_Youth_Determination
Fafsa - REQUIRES -> Bank Account Info
Fafsa - REQUIRES -> Parent Info
Fafsa - REQUIRES -> 2022 Tax Returns
Fafsa - CONTAINS -> Financial_Questions
Fafsa - QUALIFIES_FOR -> Federal Financial Aid
Fafsa - USES -> Assets
Fafsa - USES -> Income
Fafsa - LINKS_TO -> Video
Fafsa - GENERATES -> Fafsa Submission Summary
Fafsa - GENERATES -> Confirmation Email
Fafsa - DETERMINES -> State Aid
Fafsa - ASSESSES -> Federal Aid
Fafsa - HELPS_PAY_FOR -> College
Fafsa - HELPS_PAY_FOR -> University
Fafsa - HELPS_GET -> Financial Aid
Fafsa - DEPENDENT_ON -> State Financial Aid
Fafsa - APPLICATION_PROCESS -> Colleges
Fafsa - SUBMITS -> Deadline
Fafsa - H

'FAFSA, or the Free Application for Federal Student Aid, is a form used to assess eligibility for federal financial aid, including grants, loans, and work-study programs for college students. It is managed by the Federal Student Aid office in the U.S. Department of Education and can be completed online for free at https://fafsa.gov.'

In [94]:
# Follow-up question with a chat history: exercises the condense branch of
# `_search_query`, which rewrites the follow-up into a standalone search query.
# chat_history is a list of (human, ai) string pairs.
chain.invoke(
    {
        "question": "what are some fafsa links?",
        "chat_history": [("What is fafsa?", "FAFSA (Free Application for Federal Student Aid) is an application that requires an FSA ID for completion. ")],
    }
)

Search query: Can you provide some links for FAFSA?
generate full text query: FAFSA~2
structured_retriever result: Fafsa - REQUIRES -> Fsa Id
Fsa Id - LOG_IN_AT -> Https://Fafsa.Gov
Fsa Id - VERIFICATION -> Ssn
Parent/Contributor - COMPLETION -> Fafsa
Fafsa - REQUIRES -> Fsa Id
Parent/Contributor - CREATION -> Fsa Id


'Some FAFSA-related links are:\n\n1. To log in and manage your FSA ID: [https://fafsa.gov](https://fafsa.gov)\n2. To estimate your financial aid eligibility: [https://studentaid.gov/aid-estimator/](https://studentaid.gov/aid-estimator/)'

In [126]:
# Another single-turn question (no chat_history), so the question is searched as written.
chain.invoke({"question": "do i need my parents income information?"})

Search query: do i need my parents income information?
generate full text query: parents~2
structured_retriever result: Parents - APPLY_FOR -> Direct Unsubsidized Loan
Parents - PAY_FOR -> College
Parents - PROVIDE -> Aid Sources
Parents - DOES_NOT_AFFECT -> Financial Aid Eligibility
Parents - MUST_ENTER_INFORMATION -> Fafsa
Parents - MUST_CREATE -> Fsa Id
Parent'S Account - PROVIDE -> Month And Day Of Birth
Parent'S Account - ANSWER -> Challenge Questions
Parent'S Account - RECEIVE_CODE -> Email
Parent'S Account - RESET -> Password
Parent'S Account - RECEIVE_CODE -> Phone Number
Parent'S Account - ACCESS -> Https://Fafsa.Gov
Guidance - FOLLOW -> Parents
Https://Studentaid.Gov/Help/Unlock-Fsa-Id - INSTRUCTIONS -> Parent'S Account


"Yes, you will need your parents' income information if you are including their details on the FAFSA. However, if your parent consents to the IRS sharing their federal tax information, they won't need to answer questions about their income directly."