In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [2]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [3]:
# Step 1: Load the OpenAI API key from the .env file
import os
from dotenv import load_dotenv

# Pull the entries of the .env file into the process environment
load_dotenv()

# Read the key back out of the environment
api_key = os.environ.get("OPENAI_API_KEY")

# A missing or empty key is a hard error: nothing below can run without it
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "KMP_DUPLICATE_LIB_OK": "TRUE",
})

✅ API Key loaded successfully (will not be displayed)


In [4]:
# Step 2: Select multiple PDFs via system dialog (tkinter)
from tkinter import Tk, filedialog
from langchain_community.document_loaders import PyPDFLoader
import os

# Open a native "select files" dialog. A Tk root window has to exist for the
# dialog to work, so create one and hide it immediately.
root = Tk()
try:
    root.withdraw()
    pdf_paths = filedialog.askopenfilenames(
        title="Select PDF files",
        filetypes=[("PDF files", "*.pdf")]
    )
finally:
    # Always tear the hidden root down, even if the dialog raises,
    # so no orphaned Tk window is left behind in the kernel.
    root.destroy()

# Normalise the dialog result to a list; stop here if nothing was selected
pdf_paths = list(pdf_paths)
if not pdf_paths:
    raise SystemExit("No PDF selected. Exiting.")

print("The following files will be loaded:")
for p in pdf_paths:
    print(" -", p)

# load all, keep filename+page metadata
documents = []
for path in pdf_paths:
    docs = PyPDFLoader(path).load()
    for d in docs:
        # Store only the file name (not the full local path) as the source label
        d.metadata["source"] = os.path.basename(d.metadata.get("source", path))
    documents.extend(docs)
print(f"Total pages loaded: {len(documents)}")



The following files will be loaded:
 - C:/Users/syk_5/main_SS.pdf
 - C:/Users/syk_5/Resume.pdf
Total pages loaded: 34


In [5]:
# Step 3: Split text
# Chunking parameters are kept together so they are easy to tune
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# `docs` now holds the chunks produced from the loaded `documents`
docs = text_splitter.split_documents(documents)


In [6]:
# Step 4: Generate vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model, created with its default settings
embeddings = OpenAIEmbeddings()

# Build the FAISS index from the chunks in `docs`
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)


In [10]:
# Step 5: Build a controllable RAG chain with chat memory (LCEL) — add snippets

from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Retriever over the vector store, using MMR search (optional: more diverse retrieval)
mmr_search_kwargs = {"k": 5, "fetch_k": 30, "lambda_mult": 0.5}
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=mmr_search_kwargs,
)

# Chat model settings: temperature 0, 60 timeout, a single retry
llm_settings = {"temperature": 0, "timeout": 60, "max_retries": 1}
llm = ChatOpenAI(**llm_settings)

#SYSTEM = """You must answer ONLY using the provided context.
#If the answer is not contained in the context, say "I don't know."
#Cite sources like [filename p.X] after claims when possible."""
# NOTE(review): the system prompt is currently empty; the grounding/citation
# instructions above are commented out.
SYSTEM = ""

# Message layout: system text, then the running chat history, then the
# current question together with the retrieved context.
chat_messages = [
    ("system", SYSTEM),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "Question: {question}\n\nContext:\n{context}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

def format_docs(docs, max_chars=1200):
    """Render documents as one context string of tagged text blocks.

    Each kept document becomes ``"[<file name> p.<page>]\\n<text>"``, and the
    blocks are joined with a blank line between them.

    Args:
        docs: Iterable of objects exposing ``metadata`` (dict or None) and
            ``page_content`` (str).
        max_chars: Texts longer than this are cut to ``max_chars`` characters
            and suffixed with ``" ..."``.

    Returns:
        The joined string; empty when no documents are given.
    """
    emitted = set()
    blocks = []
    for doc in docs:
        metadata = doc.metadata or {}
        filename = Path(metadata.get("source", "doc")).name
        page_index = metadata.get("page")
        body = doc.page_content

        # Light de-dup: same file, same page and same first 120 characters
        fingerprint = (filename, page_index, hash(body[:120]))
        if fingerprint not in emitted:
            emitted.add(fingerprint)
            # An integer page index is shown one higher; anything else becomes "?"
            page_label = page_index + 1 if isinstance(page_index, int) else "?"
            clipped = body if len(body) <= max_chars else body[:max_chars] + " ..."
            blocks.append(f"[{filename} p.{page_label}]\n{clipped}")
    return "\n\n".join(blocks)

# Core pipeline, written as four named stages that are chained at the end.

def _pick_question(state):
    """Return the user's question from the stage input."""
    return state["question"]


def _pick_history(state):
    """Return the chat history carried through the pipeline."""
    return state["chat_history"]


def _pick_docs(state):
    """Return the retrieved documents carried through the pipeline."""
    return state["docs"]


def _source_entry(doc):
    """Build a {"source", "page"} record for one document (page shown one higher)."""
    return {
        "source": Path((doc.metadata or {}).get("source", "doc")).name,
        "page":   ((doc.metadata or {}).get("page") + 1) if isinstance((doc.metadata or {}).get("page"), int) else None
    }


# 1) pass through fields (chat history defaults to an empty list)
_select_inputs = RunnableMap({
    "question": _pick_question,
    "chat_history": lambda state: state.get("chat_history", []),
})

# 2) retrieve
_retrieve = RunnableMap({
    "docs":       lambda state: retriever.invoke(state["question"]),
    "question":   _pick_question,
    "chat_history": _pick_history,
})

# 3) build LLM context
_build_context = RunnableMap({
    "context":      lambda state: format_docs(state["docs"]),
    "question":     _pick_question,
    "chat_history": _pick_history,
    "docs":         _pick_docs,    # keep docs for outputs below
})

# 4) in parallel: produce answer + pass through docs + expose snippets/sources
_answer_with_evidence = RunnableMap({
    "answer":   (prompt | llm | StrOutputParser()),
    "docs":     _pick_docs,
    # raw text snippets returned to the caller
    "snippets": lambda state: [doc.page_content for doc in state["docs"]],
    # Optional convenience: structured sources (filename + page)
    "sources":  lambda state: [_source_entry(doc) for doc in state["docs"]],
})

rag_core = _select_inputs | _retrieve | _build_context | _answer_with_evidence

# memory wrapper: one in-memory chat history per session id
_store = {}


def _get_history(session_id: str):
    """Return the chat history for `session_id`, creating it on first use."""
    history = _store.get(session_id)
    if history is None:
        history = ChatMessageHistory()
        _store[session_id] = history
    return history


# Wrap the core pipeline so each call reads and records per-session history
_history_wiring = {
    "get_session_history": _get_history,
    "input_messages_key": "question",
    "history_messages_key": "chat_history",
    "output_messages_key": "answer",  # important to silence tracer expecting 'output'
}
qa = RunnableWithMessageHistory(rag_core, **_history_wiring)



In [11]:
# Step 6: Interactive loop (blank line to exit)
session_id = "default_session"
MAX_PREVIEW = 240  # max characters of each retrieved snippet to print

print("Chat mode started. Press Enter on an empty line to exit.\n")
while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Kernel interrupt or closed stdin: leave the loop like a blank line would,
        # instead of ending the cell with a traceback.
        print("\nBye.")
        break
    if query == "":
        print("Bye.")
        break

    # The session id selects which stored chat history this turn is attached to
    res = qa.invoke(
        {"question": query},
        config={"configurable": {"session_id": session_id}}
    )

    # res includes: answer (str), docs (List[Document]), snippets (List[str]), sources (List[dict])
    print("Bot:", res["answer"], "\n")

    # Optional: show normalized sources
    if "sources" in res:
        print("Sources:")
        for s in res["sources"]:
            # Page is omitted when it is missing/None
            p = f" p.{s['page']}" if s.get("page") else ""
            print(f"  - {s['source']}{p}")
        print()

    # Show retrieved snippets (trimmed for readability)
    if "snippets" in res:
        print("Retrieved snippets:")
        for i, snip in enumerate(res["snippets"], 1):
            preview = snip if len(snip) <= MAX_PREVIEW else snip[:MAX_PREVIEW] + " ..."
            print(f"  [{i}] {preview}")
        print()


Chat mode started. Press Enter on an empty line to exit.



You:  Statistically feasible?


Bot: The term "statistically feasible" in this context refers to problems that are theoretically solvable using statistical methods, but may not have computationally efficient procedures for solving them. In other words, while the statistical theory may suggest that a problem can be solved, the practical implementation of solving it computationally may be challenging or not well-established. 

Sources:
  - main_SS.pdf p.8
  - main_SS.pdf p.20
  - main_SS.pdf p.28
  - main_SS.pdf p.23
  - main_SS.pdf p.33

Retrieved snippets:
  [1] condition ∥R∥F →∞ is therefore not sufficient.
2.2 Computational Feasibility
The results in Section 2.1 provides a necessary condition for statistical detectability. There are, however, numerous
problems that are statistically feasible with ...
  [2] are correlated. This conclusion, while biologically relevant, is also certainly expected.
20
  [3] Van der Vaart, A. W. (2000). Asymptotic statistics, Volume 3. Cambridge university press.
Varshney, L. R., B. L. 

You:  Compuationally feasible?


Bot: In the context provided, "computationally feasible" refers to problems that can be solved using computational methods, even though there may not be efficient procedures established for solving them. The passage mentions that there are problems that are statistically feasible, meaning they can be theoretically solved using statistical methods, but may lack computationally efficient procedures for solving them. Examples of such problems include community detection, sparse PCA, and estimation in spiked tensor models.

The passage also discusses a statistical vs. computational gap, where the problem of independence testing is transformed into the planted clique detection problem in theoretical computer science. This transformation highlights the challenge of bridging the gap between statistical feasibility and computational feasibility in certain problems. 

Sources:
  - main_SS.pdf p.8
  - main_SS.pdf p.20
  - main_SS.pdf p.27
  - main_SS.pdf p.8
  - main_SS.pdf p.28

Retrieved snipp

You:  What is the trade-off between them?


Bot: The trade-off between statistical feasibility and computational feasibility lies in the balance between the theoretical ability to solve a problem using statistical methods and the practical ability to implement those solutions computationally. 

Statistical feasibility focuses on the theoretical aspects of solving a problem using statistical techniques, such as hypothesis testing, estimation, or inference. These methods are grounded in statistical theory and provide insights into the underlying data generating processes.

On the other hand, computational feasibility deals with the practical implementation of statistical methods to solve real-world problems. This involves considerations such as algorithm efficiency, computational resources, and scalability. While a problem may be statistically feasible in theory, it may not always be computationally feasible due to constraints such as time complexity, memory requirements, or the availability of suitable algorithms.

Finding the ri

You:  


Bye.
