In [2]:
from PyPDF2 import PdfReader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ollama_embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma, FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Ollama

# 1. Read the PDF and wrap every page that yields text in a LangChain Document
reader = PdfReader("railways.pdf")
docs = []
# Pages are numbered from 1 in the metadata, hence start=1.
for page_number, page in enumerate(reader.pages, start=1):
    page_text = page.extract_text()
    if not page_text.strip():
        # Only whitespace (or nothing) was extracted: leave this page out.
        continue
    docs.append(Document(page_content=page_text, metadata={"page": page_number}))


In [3]:

# 2. Cut the page documents into overlapping chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = splitter.split_documents(docs)

# 3. Embedding model applied to every chunk
embed_model = OllamaEmbeddings(model_name="nomic-embed-text")

# 4A. Build the Chroma index; it is stored under the given directory/collection
chroma_settings = {
    "persist_directory": "railway_chroma_db",
    "collection_name": "railway_document_embeddings",
}
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embed_model,
    **chroma_settings,
)

# 4B. FAISS alternative (disabled): swap it in for the Chroma call above
# vector_store = FAISS.from_documents(documents=chunks, embedding=embed_model)
# vector_store.save_local("faiss_index")  # optional: write the FAISS index to disk


In [4]:

# 5. Query the index with a local LLM served through Ollama (currently "qwen3")
# NOTE(review): MON_1 is not used anywhere in the visible notebook; this looks
# like an accidental editor auto-import. Confirm nothing relies on it, then delete.
from locale import MON_1


# Similarity search that hands the 4 best-matching chunks to the chain.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
qa = RetrievalQA.from_chain_type(
    llm=Ollama(model="qwen3"),  # name of the model to request from Ollama
    chain_type="stuff",  # "stuff": all retrieved chunks are placed into a single prompt
    retriever=retriever,
    return_source_documents=True,  # retrieved chunks come back under "source_documents"
)

# Call the chain through invoke(); calling the chain object directly
# (qa({...})) is deprecated in LangChain and emits a deprecation warning.
result = qa.invoke({"query": "What are the 4 steps of the TAR?"})
print("Answer:", result["result"])

  llm=Ollama(model="qwen3"),  # Specify your local DeepSeek model
  result = qa({"query": "What are the 4 steps of the TAR?"})


Answer: <think>
Okay, let me try to figure out the answer to the question about the 4 steps of the TAR. The user provided a chunk of text, and I need to extract the information accurately.

First, I'll scan through the context for mentions of the TAR steps. The user's question is about the four steps of the TAR, so I should look for sections that talk about the methodology or steps taken. 

Looking at the text, there's a section labeled "1.4 Methodical Approach" which mentions that the TAR was delivered in four steps. Let me check that part again. 

The text says: "This TAR was delivered in four steps: (a) ORR prepared a Request For Information (RFI)... (b) Meetings with Regional representatives... (c) Interviews with a Head of Maintenance/Infrastructure Director... (d) ORR compiled, reviewed..." 

So the four steps are: 
1. Preparing the RFI and gathering documents.
2. Meetings with regional reps for clarification.
3. Conducting interviews with regional heads.
4. Compiling and reviewi

In [5]:
# Ask several Ollama-served models the same question over the same retriever
# and collect their answers for side-by-side reading.
# NOTE: this loop rebinds `qa` and `result` from the earlier single-question
# cell; after it finishes they refer to the last model in `models`.
models = ["deepseek-r1", "gemma3", "qwen3"]
results = {}  # model name -> answer text
for model_name in models:
    # A new chain per model; only the LLM differs between iterations.
    qa = RetrievalQA.from_chain_type(
        llm=Ollama(model=model_name),
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )
    # invoke() replaces the deprecated direct call qa({...}).
    result = qa.invoke({"query": "Summarize the main points of this document."})
    results[model_name] = result["result"]

for model_name, answer in results.items():
    print(f"\n=== Answer from {model_name} ===\n")
    print(answer)


=== Answer from deepseek-r1 ===

<think>
Okay, let's tackle this query. The user wants me to summarize the main points of the document they provided. I need to carefully read through each part to extract key information without adding anything extra.

First, looking at sections 1.4 and 1.6-1.8. It mentions that ORR took a methodical approach involving five steps for their study on NR's maintenance organization. Wait, the original text lists four steps in 1.6 but actually describes five activities including an RFI, meetings, interviews, compiling observations, and a summary. The user might be confused by this discrepancy.

The TAR considered all five regions (NR) to address several questions: how maintenance is conducted, strengths/weaknesses, effectiveness/efficiency, variations across regions, and cost differences. Section 1.7 says there's a high-level summary in Section 2, which the user probably needs to mention as part of the main points.

The report is a snapshot at a specific ti

# Recreate or reload the Chroma DB as needed:
# build it from the chunks when DB_DIR does not exist yet, otherwise reopen it.

import os
from langchain_community.vectorstores import Chroma

# NOTE(review): these values differ from the store built earlier in this
# notebook ("railway_chroma_db" / "railway_document_embeddings"), so this cell
# manages a separate on-disk store. Confirm that is intended.
DB_DIR = "chroma_db"
COLLECTION_NAME = "railway_embeddings"

# The existence of the directory is the only signal used to decide between
# "create" and "load"; its contents are not inspected.
if not os.path.exists(DB_DIR):
    # First-time creation: embed the chunks and write them to DB_DIR.
    # `chunks` is the list of split Document objects produced by the splitter
    # (the previous `documents` name was never defined and raised NameError).
    vector_store = Chroma.from_documents(
        chunks,
        embedding=embed_model,
        persist_directory=DB_DIR,
        collection_name=COLLECTION_NAME
    )
    # Explicit flush to disk. NOTE(review): newer Chroma versions reportedly
    # persist automatically and deprecate this call; verify against the
    # installed version before removing it.
    vector_store.persist()
else:
    # DB directory already exists: open the stored collection without re-embedding.
    vector_store = Chroma(
        embedding_function=embed_model,
        persist_directory=DB_DIR,
        collection_name=COLLECTION_NAME
    )