In [1]:
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
from langchain_core.runnables import RunnableLambda
from langchain_ollama import ChatOllama
from typing import TypedDict, Annotated, Dict
from langgraph.graph import StateGraph, END



In [2]:

# Document Processing (same as original)
def process_all_files(directory):
    loaders = [
        DirectoryLoader(directory, glob="**/*.html", show_progress=True),
        DirectoryLoader(directory, glob="**/*.pdf", show_progress=True, loader_cls=PyPDFLoader),
        DirectoryLoader(directory, glob="**/*.txt", show_progress=True, loader_cls=TextLoader),
        DirectoryLoader(directory, glob="**/*.docx", show_progress=True)
    ]
    documents = []
    for loader in loaders:
        data = loader.load()
        documents.extend(data)
    if not documents:
        raise ValueError(f"No supported files found in directory: {directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=150)
    split_docs = text_splitter.split_documents(documents)
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(split_docs, embedding_model)
    return vectorstore

directory = "data/"
vectorstore = process_all_files(directory)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


0it [00:00, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s]Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 67 0 (offset 0)
100%|██████████| 2/2 [00:00<00:00,  7.04it/s]
100%|██████████| 1/1 [00:00<00:00, 1683.78it/s]
0it [00:00, ?it/s]
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

In [3]:

# Define the State
class GraphState(TypedDict):
    query: str
    refined_query: Annotated[str, "Refined version of the original query"]
    retrieved_text: Annotated[str, "Text retrieved from documents"]
    generated_response: Annotated[str, "LLM-generated response"]
    verification_result: Annotated[str, "Verification outcome"]
    reasoning: Annotated[Dict[str, str], "Reasoning trace for each step"]


In [4]:

# Initialize LLM (using LangChain's ChatOllama)
llm = ChatOllama(model="llama3.2", base_url="http://localhost:11434")


In [5]:

# Node Functions
def refine_query(state: GraphState) -> GraphState:
    query = state["query"]
    system_prompt = (
        "You are an AI assistant that refines user queries for document retrieval. "
        "Rephrase the query to make it clearer and more specific without altering its intent.\n\n"
        f"Original Query: {query}\n\n"
        "Refined Query:"
    )
    refined_query = llm.invoke([{"role": "system", "content": system_prompt}, {"role": "user", "content": query}]).content.strip()
    reasoning = f"Refined '{query}' to '{refined_query}' to clarify intent and improve retrieval precision."
    return {
        "refined_query": refined_query,
        "reasoning": {**state["reasoning"], "query_refinement": reasoning}
    }


In [6]:

def retrieve_documents(state: GraphState) -> GraphState:
    query = state["refined_query"]
    docs = retriever.invoke(query)
    retrieved_text = "\n".join([doc.page_content for doc in docs]) if docs else ""
    reasoning = (f"Retrieved {len(docs)} document excerpts for query '{query}'. "
                 f"{'No relevant documents found.' if not docs else 'Combined excerpts into a single text block.'}")
    return {
        "retrieved_text": retrieved_text,
        "reasoning": {**state["reasoning"], "document_retrieval": reasoning}
    }


In [7]:

def generate_response(state: GraphState) -> GraphState:
    retrieved_text = state["retrieved_text"]
    if not retrieved_text:
        return {
            "generated_response": "No relevant information found.",
            "reasoning": {**state["reasoning"], "response_generation": "No valid retrieved text provided for analysis."}
        }
    system_prompt = (
        "You are an AI assistant that answers ONLY based on the provided document excerpts. "
        "Do not use external knowledge. If the answer is not found, reply with 'Not found in the document.'\n\n"
        "DOCUMENT EXCERPTS:\n" + retrieved_text
    )
    generated_response = llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": state["query"]}
    ]).content.strip()
    reasoning = f"Analyzed retrieved text of length {len(retrieved_text)} characters. Generated response based solely on excerpts."
    return {
        "generated_response": generated_response,
        "reasoning": {**state["reasoning"], "response_generation": reasoning}
    }


In [8]:

def verify_response(state: GraphState) -> GraphState:
    retrieved_text = state["retrieved_text"]
    generated_response = state["generated_response"]
    system_prompt = (
        "You are an AI assistant that verifies whether a generated response is properly supported "
        "by the given document excerpts.\n\n"
        "DOCUMENT EXCERPTS:\n" + retrieved_text + "\n\n"
        "GENERATED RESPONSE:\n" + generated_response + "\n\n"
        "Verification Output:\n"
        "- If supported, reply with: 'Verified ✅'\n"
        "- If unsupported, highlight unsupported parts."
    )
    verification_result = llm.invoke([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": generated_response}
    ]).content.strip()
    reasoning = (f"Checked if '{generated_response}' aligns with retrieved text of length {len(retrieved_text)}. "
                 f"Result: {verification_result}")
    return {
        "verification_result": verification_result,
        "reasoning": {**state["reasoning"], "response_verification": reasoning}
    }

In [10]:

# Build the Graph
workflow = StateGraph(GraphState)

# Add Nodes
workflow.add_node("refine_query", RunnableLambda(refine_query))
workflow.add_node("retrieve_documents", RunnableLambda(retrieve_documents))
workflow.add_node("generate_response", RunnableLambda(generate_response))
workflow.add_node("verify_response", RunnableLambda(verify_response))

# Define Edges (sequential flow)
workflow.add_edge("refine_query", "retrieve_documents")
workflow.add_edge("retrieve_documents", "generate_response")
workflow.add_edge("generate_response", "verify_response")
workflow.add_edge("verify_response", END)

# Set Entry Point
workflow.set_entry_point("refine_query")

# Compile the Graph
graph = workflow.compile()

# Execute the Graph
initial_state = {
    "query": "what are the Prerequisites to Run the Project",
    "refined_query": "",
    "retrieved_text": "",
    "generated_response": "",
    "verification_result": "",
    "reasoning": {}
}
result = graph.invoke(initial_state)

# Print Results and Reasoning
print("Final State:")
for key, value in result.items():
    if key != "reasoning":
        print(f"{key}: {value}")
print("\nReasoning Trace:")
for step, reasoning in result["reasoning"].items():
    print(f"{step}: {reasoning}")

Final State:
query: what are the Prerequisites to Run the Project
refined_query: To provide a more precise answer, could you please specify which project you are referring to? Additionally, what type of project is it (e.g., software development, research, academic, etc.)?

If you don't have a specific project in mind, I can still attempt to rephrase the query. Here's an alternative:

What specific requirements or conditions need to be met before running a particular project?

Or, if you'd like to provide more context:

* What is the nature of the project (e.g., software development, scientific research, academic project)?
* Are there any specific tools, technologies, or resources required for the project?
* Is this a general query, or are you looking for information on a particular project or industry?
retrieved_text: 2. Install Required Libraries The project requires several Python libraries that can be installed via pip. You can install them by running the following command: pip inst

In [12]:
graph = workflow.compile()

# Visualize the Graph with Graphviz
graph_diagram = graph.get_graph()

In [13]:
from IPython.display import Image
Image(graph_diagram.draw_png())

ImportError: Install pygraphviz to draw graphs: `pip install pygraphviz`.

In [18]:
pip install pygraphviz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pygraphviz
  Using cached pygraphviz-1.14.tar.gz (106 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for pygraphviz [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[60 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_py
  [31m   [0m creating build/lib.macosx-10.9-universal2-cpython-311/pygraphviz
  [31m   [0m copying pygraphviz/scraper.py -> build/lib.macosx-10.9-universal2-cpython-311/pygraphviz
  [31m   [0m copying pygraphviz/graphviz.py -> build/lib.macosx-10.9-universal2-cpython-311/py

In [15]:
graph = workflow.compile()

# Visualize the Graph with Graphviz
graph_diagram = graph.get_graph()
graph_diagram.write_png("graph.png")  # Use write_png instead of draw
print("Graph saved as 'graph.png'")

AttributeError: 'Graph' object has no attribute 'write_png'

In [12]:
! brew install graphviz


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m==>[0m [1mAuto-updating Homebrew...[0m
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
[34m==>[0m [1mAuto-updated Homebrew![0m
Updated 2 taps (homebrew/core and homebrew/cask).
[34m==>[0m [1mNew Formulae[0m
bombardier          gersemi             org-formation       taskflow
bpmnlint            globstar            ov                  tgpt
cf-terraforming     gotz                pivy                tml
cloudfoundry-cli    hishtory            pkl-lsp             todoist
cot                 i686-elf-grub       punktf              trdsql
cspell              immich-go           rattler-index       typioca
dbg-macro           infisical           rhai                unciv
dtsroll             jira-cli            rpds-py             visidata
dyff                kafkactl            ruby-lsp            vscli
exomizer            kapp                rustic              

In [14]:
! python3 -m pip install -U --no-cache-dir  --config-settings="--global-option=build_ext"  --config-settings="--global-option=-I$(brew --prefix graphviz)/include/"  --config-settings="--global-option=-L$(brew --prefix graphviz)/lib/"  pygraphviz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pygraphviz
  Downloading pygraphviz-1.14.tar.gz (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.0/106.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hBuilding wheels for collected packages: pygraphviz
  Building wheel for pygraphviz (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pygraphviz: filename=pygraphviz-1.14-cp310-cp310-macosx_10_9_universal2.whl size=115063 sha256=a748143264d75f100dfeb9ecaec5d9ca5471dd011dbaeddb9d181e35fc13a872
  Stored in directory: /private/var/folders/l3/524d7s611rs3hl7hsd6rk25w0000gn/T/pip-ephem-wheel-cache-zndvicrf/wheels/61/ab/cd/e24a22c32830b8b4948c8887d8714d399f0f806f206a034698
Successfully built pygraphviz
Installing collected packages: pygraphviz
Successfully installed pygraphviz-1.14


In [18]:
from IPython.display import Image
Image(workflow.graph.get_graph().draw_png())

AttributeError: 'StateGraph' object has no attribute 'graph'