In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [1]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [2]:
# Step 1: Load API Key from .env file & load API key
import os
from dotenv import load_dotenv

# Load the contents of the .env file into system environment variables
load_dotenv()

# Retrieve the key from environment variables
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

✅ API Key loaded successfully (will not be displayed)


In [3]:
# Step 2: Select multiple PDFs via system dialog (tkinter)
from tkinter import Tk, filedialog
from langchain_community.document_loaders import PyPDFLoader
import os

# open dialog
root = Tk(); root.withdraw()
pdf_paths = filedialog.askopenfilenames(
    title="Select PDF files",
    filetypes=[("PDF files", "*.pdf")]
)
root.destroy()

pdf_paths = list(pdf_paths)
if not pdf_paths:
    raise SystemExit("No PDF selected. Exiting.")

print("The following files will be loaded:")
for p in pdf_paths:
    print(" -", p)

# load all, keep filename+page metadata
documents = []
for path in pdf_paths:
    docs = PyPDFLoader(path).load()
    for d in docs:
        d.metadata["source"] = os.path.basename(d.metadata.get("source", path))
    documents.extend(docs)
print(f"Total pages loaded: {len(documents)}")



The following files will be loaded:
 - C:/Users/syk_5/Resume.pdf
 - C:/Users/syk_5/main_SS.pdf
Total pages loaded: 34


In [9]:
# --- Grid search helpers ---
from itertools import product
from statistics import mean
from typing import List, Tuple, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

def _faiss_topk_sims(vs: FAISS, query: str, k: int = 4) -> List[float]:
    """Convert FAISS (doc, distance) to similarity scores."""
    docs_scores = vs.similarity_search_with_score(query, k=k)
    sims = []
    for _, dist in docs_scores:
        try:
            sim = 1.0 / (1.0 + float(dist))
        except Exception:
            sim = 0.0
        sims.append(sim)
    return sims

def _build_vs_for_params(documents, embeddings, chunk_size: int, chunk_overlap: int) -> Tuple[FAISS, int]:
    """Split with given params and build a FAISS index. Returns (vs, num_chunks)."""
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)
    vs = FAISS.from_documents(chunks, embeddings)
    return vs, len(chunks)

def grid_search_chunk_params(
    queries: List[str],
    documents,
    embeddings,
    chunk_sizes: List[int],
    overlaps: List[int],
    k: int = 4,
) -> Dict[str, Any]:
    """Try (chunk_size, overlap) combos; score via mean top-k similarity across queries."""
    results = []
    best = None
    for cs, ov in product(chunk_sizes, overlaps):
        vs, n_chunks = _build_vs_for_params(documents, embeddings, cs, ov)
        per_q_scores = []
        for q in queries:
            sims = _faiss_topk_sims(vs, q, k=k)
            per_q_scores.append(mean(sims) if sims else 0.0)
        avg_score = mean(per_q_scores) if per_q_scores else 0.0
        row = {"chunk_size": cs, "overlap": ov, "avg_score": avg_score, "num_chunks": n_chunks}
        results.append(row)
        if (best is None) or (avg_score > best["avg_score"]):
            best = row
    results_sorted = sorted(results, key=lambda r: r["avg_score"], reverse=True)
    return {"best_params": best, "scoreboard": results_sorted}


In [10]:
# ===== Step 3 (Optional): pick chunk_size & overlap via grid search, then split =====
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# init embeddings BEFORE grid search
embeddings = OpenAIEmbeddings()

# representative queries for evaluation (small set)
eval_queries = [
    "What is statistically feasible?",
    "computationally feasible?",
    "What is the trade-off between them?"
]

DEFAULT_CHUNK_SIZE = 1000
DEFAULT_OVERLAP = 100
use_grid_search = True# False  # set True to enable grid search

if use_grid_search:
    try:
        # (ensure you already defined grid_search_chunk_params somewhere above)
        res = grid_search_chunk_params(
            queries=eval_queries,
            documents=documents,      # Step 2 output
            embeddings=embeddings,
            chunk_sizes=[400, 700, 1000],
            overlaps=[50, 100, 200],
            k=4,
        )
        best_params = res["best_params"] or {}
        CHUNK_SIZE = int(best_params.get("chunk_size", DEFAULT_CHUNK_SIZE))
        OVERLAP    = int(best_params.get("overlap", DEFAULT_OVERLAP))
        print("Grid search best:", best_params)
    except Exception as e:
        print("Grid search failed, fallback to defaults:", e)
        CHUNK_SIZE, OVERLAP = DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP
else:
    CHUNK_SIZE, OVERLAP = DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP

# Split with chosen params
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=OVERLAP
)
docs = text_splitter.split_documents(documents)

# (Optional) filter overly short chunks that hurt retrieval/refinement
docs = [d for d in docs if len(d.page_content) >= 300]

print(f"Total chunks: {len(docs)} (chunk_size={CHUNK_SIZE}, overlap={OVERLAP})")



Grid search best: {'chunk_size': 400, 'overlap': 200, 'avg_score': 0.7176904205215795, 'num_chunks': 345}
Total chunks: 309 (chunk_size=400, overlap=200)


In [12]:
# ===== Step 4: Build the vector database (reuse the SAME embeddings) =====
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(docs, embeddings)
print("FAISS index built.")


FAISS index built.


In [15]:
# --- helpers for refinement ---
import numpy as np
from typing import List, Tuple, Dict, Any
from copy import deepcopy
from langchain_core.documents import Document

def _cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    na = np.linalg.norm(a); nb = np.linalg.norm(b)
    if na == 0 or nb == 0: 
        return 0.0
    return float(np.dot(a, b) / (na * nb))

def _embed_texts(embeddings, texts: List[str]) -> np.ndarray:
    vecs = embeddings.embed_documents(texts)
    return np.array(vecs, dtype=np.float32)

def _embed_query(embeddings, query: str) -> np.ndarray:
    return np.array(embeddings.embed_query(query), dtype=np.float32)

def _generate_candidate_windows(
    text: str,
    max_shift: int = 300,
    step: int = 50,
    min_len: int = 300,
    max_len: int = 1200,
    length_scales=(0.6, 0.8, 1.0)
) -> List[str]:
    L = len(text)
    candidates, seen = [], set()
    if L <= min_len:
        return [text]
    for scale in length_scales:
        target = int(max(min_len, min(max_len, L * scale)))
        if target <= 0 or target > L:
            continue
        base = max(0, (L - target) // 2)
        starts = {0, max(0, L - target), base}
        for shift in range(-max_shift, max_shift + 1, step):
            s = max(0, min(base + shift, max(0, L - target)))
            starts.add(s)
        for s in sorted(starts):
            e = min(L, s + target)
            seg = text[s:e]
            key = (len(seg), hash(seg[:160]))
            if key not in seen:
                seen.add(key)
                candidates.append(seg)
    candidates.append(text if L <= max_len else text[:max_len])
    out, seen2 = [], set()
    for c in candidates:
        k = (len(c), hash(c[:200]))
        if k not in seen2:
            seen2.add(k)
            out.append(c)
    return out

def refine_topk_chunks(
    query: str,
    docs: List[Document],
    embeddings,
    max_shift: int = 300,
    step: int = 50,
    min_len: int = 300,
    max_len: int = 1200,
) -> Tuple[List[Document], List[Dict[str, Any]]]:
    if not docs:
        return [], []
    q_vec = _embed_query(embeddings, query)
    refined_docs, info = [], []
    for d in docs:
        base_text = d.page_content or ""
        cands = _generate_candidate_windows(
            base_text, max_shift=max_shift, step=step, min_len=min_len, max_len=max_len
        )
        cand_vecs = _embed_texts(embeddings, cands)
        sims = [_cosine_sim(q_vec, v) for v in cand_vecs]
        best_idx = int(np.argmax(sims)) if sims else 0
        best_text = cands[best_idx] if sims else base_text
        best_score = sims[best_idx] if sims else 0.0
        # original (raw or truncated) score
        try:
            orig_idx = cands.index(base_text if len(base_text) <= max_len else base_text[:max_len])
            orig_score = sims[orig_idx]
        except ValueError:
            orig_score = 0.0
        rd = deepcopy(d); rd.page_content = best_text
        refined_docs.append(rd)
        info.append({
            "source": (d.metadata or {}).get("source"),
            "page": (d.metadata or {}).get("page"),
            "orig_len": len(base_text),
            "refined_len": len(best_text),
            "orig_score": orig_score,
            "best_score": best_score,
            "improvement": best_score - orig_score,
            "candidates": len(cands),
        })
    return refined_docs, info

from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableMap, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 30, "lambda_mult": 0.5}
)

llm = ChatOpenAI(temperature=0, timeout=60, max_retries=1)
SYSTEM = """"""

prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM),
    MessagesPlaceholder("chat_history"),
    ("human", "Question: {question}\n\nContext:\n{context}")
])

def format_docs(docs, max_chars=1200):
    rows, seen = [], set()
    for d in docs:
        meta = d.metadata or {}
        name = Path(meta.get("source", "doc")).name
        page = meta.get("page")
        tag = f"[{name} p.{(page + 1) if isinstance(page, int) else '?'}]"
        text = d.page_content
        key = (name, page, hash(text[:120]))
        if key in seen:
            continue
        seen.add(key)
        if len(text) > max_chars:
            text = text[:max_chars] + " ..."
        rows.append(f"{tag}\n{text}")
    return "\n\n".join(rows)

use_refinement = True  # set False to disable refinement

rag_core = (
    RunnableMap({
        "question": lambda x: x["question"],
        "chat_history": lambda x: x.get("chat_history", []),
    })
    | RunnableMap({
        "docs_raw":    lambda x: retriever.invoke(x["question"]),
        "question":    lambda x: x["question"],
        "chat_history": lambda x: x["chat_history"],
    })
    # refinement branch (optional)
    | (
        RunnableLambda(lambda x: (lambda docs, meta: {
                "docs": docs, "refine_info": meta,
                "question": x["question"], "chat_history": x["chat_history"]
            })(*refine_topk_chunks(
                query=x["question"], docs=x["docs_raw"], embeddings=embeddings,
                max_shift=300, step=50, min_len=300, max_len=1200
            )))
        if use_refinement
        else RunnableLambda(lambda x: {
            "docs": x["docs_raw"], "refine_info": [],
            "question": x["question"], "chat_history": x["chat_history"]
        })
    )
    | RunnableMap({
        "context":      lambda x: format_docs(x["docs"]),
        "question":     lambda x: x["question"],
        "chat_history": lambda x: x["chat_history"],
        "docs":         lambda x: x["docs"],
        "refine_info":  lambda x: x["refine_info"],
    })
    | RunnableMap({
        "answer":   (prompt | llm | StrOutputParser()),
        "docs":     lambda x: x["docs"],
        "snippets": lambda x: [d.page_content for d in x["docs"]],
        "sources":  lambda x: [
            {
                "source": Path((d.metadata or {}).get("source", "doc")).name,
                "page":   ((d.metadata or {}).get("page") + 1) if isinstance((d.metadata or {}).get("page"), int) else None
            } for d in x["docs"]
        ],
        "refine_info": lambda x: x["refine_info"],
    })
)

_store = {}
def _get_history(session_id: str):
    if session_id not in _store:
        _store[session_id] = ChatMessageHistory()
    return _store[session_id]

qa = RunnableWithMessageHistory(
    rag_core,
    get_session_history=_get_history,
    input_messages_key="question",
    history_messages_key="chat_history",
    output_messages_key="answer"
)





In [16]:
# ===== Step 6: Interactive loop (blank to exit) =====
session_id = "default_session"
print("Chat mode started. Press Enter on an empty line to exit.\n")
while True:
    query = input("You: ").strip()
    if query == "":
        print("Bye.")
        break

    res = qa.invoke(
        {"question": query},
        config={"configurable": {"session_id": session_id}}
    )

    print("Bot:", res["answer"], "\n")

    if "sources" in res:
        print("Sources:")
        for s in res["sources"]:
            p = f" p.{s['page']}" if s.get("page") else ""
            print(f"  - {s['source']}{p}")
        print()

    if "snippets" in res:
        print("Retrieved snippets:")
        MAX_PREVIEW = 240
        for i, snip in enumerate(res["snippets"], 1):
            preview = snip if len(snip) <= MAX_PREVIEW else snip[:MAX_PREVIEW] + " ..."
            print(f"  [{i}] {preview}")
        print()

    if "refine_info" in res and res["refine_info"]:
        print("Refinement details (per chunk):")
        for i, inf in enumerate(res["refine_info"], 1):
            src = inf.get("source")
            pg = inf.get("page")
            print(
                f"  [{i}] {src} p.{pg+1 if isinstance(pg,int) else '?'} | "
                f"orig={inf['orig_score']:.4f} -> best={inf['best_score']:.4f} "
                f"(Δ={inf['improvement']:.4f}) | len {inf['orig_len']} -> {inf['refined_len']}"
            )
        print()


Chat mode started. Press Enter on an empty line to exit.



You:  What is statistical feasibility?


Bot: Statistical feasibility refers to the ability to conduct statistical analysis or tests within certain parameter regimes. It involves determining whether it is possible to obtain meaningful statistical results given the available data and the statistical methods being used. In the context provided, it is mentioned that there are problems that are statistically feasible with certain parameter regimes, but there may not be computationally efficient methods available to analyze them. This highlights the importance of considering both statistical and computational feasibility when conducting data analysis. 

Sources:
  - main_SS.pdf p.8
  - Resume.pdf p.1
  - main_SS.pdf p.26
  - main_SS.pdf p.23
  - main_SS.pdf p.25

Retrieved snippets:
  [1] F →∞ is therefore not sufficient.
2.2 Computational Feasibility
The results in Section 2.1 provides a necessary condition for statistical detectability. There are, however, numerous
problems that are statistically feasible with parameter re ...
 

You:  What is the difference between computational feasibility?


Bot: Computational feasibility refers to the ability to perform computational tasks or algorithms within reasonable time and resource constraints. It involves assessing whether the computational resources available are sufficient to carry out the required computations efficiently. In the context provided, it is mentioned that there are problems that are statistically feasible with certain parameter regimes, but there may not be computationally efficient methods available to analyze them. This highlights the importance of considering both statistical and computational feasibility when conducting data analysis. 

Sources:
  - main_SS.pdf p.8
  - main_SS.pdf p.3
  - main_SS.pdf p.8
  - main_SS.pdf p.15
  - main_SS.pdf p.26

Retrieved snippets:
  [1] F →∞ is therefore not sufficient.
2.2 Computational Feasibility
The results in Section 2.1 provides a necessary condition for statistical detectability. There are, however, numerous
problems that are statistically feasible with parameter re ..

You:  Which one is easier to achieve?


Bot: Based on the provided context, it is stated that "while ∥R∥F →∞ for all of these examples, it is nevertheless easier, both statistically and computationally, to..." Unfortunately, the sentence is incomplete and does not provide a clear answer to which one, statistical feasibility or computational feasibility, is easier to achieve. 

However, in general terms, the ease of achieving statistical feasibility versus computational feasibility can vary depending on the specific problem, available resources, and the complexity of the statistical methods or computational algorithms involved. In some cases, statistical feasibility may be easier to achieve if the data is well-structured and the statistical methods are well-established. On the other hand, computational feasibility may be easier if efficient algorithms and computational resources are readily available. 

Sources:
  - main_SS.pdf p.24
  - Resume.pdf p.1
  - main_SS.pdf p.23
  - main_SS.pdf p.2
  - main_SS.pdf p.24

Retrieved sn

You:  


Bye.
