In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [1]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [2]:
# Step 1: Read the OpenAI API key from a .env file
import os
from dotenv import load_dotenv

# Pull the .env entries into this process's environment variables.
load_dotenv()

# Look the key up once; stop right away when it is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "KMP_DUPLICATE_LIB_OK": "TRUE",
})

✅ API Key loaded successfully (will not be displayed)


In [3]:
# Step 2: Select multiple PDFs via system dialog (tkinter)
from tkinter import Tk, filedialog
from langchain_community.document_loaders import PyPDFLoader
import os

# Open the native file picker behind a hidden root window. The root is
# destroyed in `finally` so it does not linger if the dialog raises.
root = Tk()
try:
    root.withdraw()
    pdf_paths = list(filedialog.askopenfilenames(
        title="Select PDF files",
        filetypes=[("PDF files", "*.pdf")],
    ))
finally:
    root.destroy()

if not pdf_paths:
    raise SystemExit("No PDF selected. Exiting.")

print("The following files will be loaded:")
for p in pdf_paths:
    print(" -", p)

# Load every selected PDF and reduce each loaded document's "source" metadata
# to the bare file name; other metadata is left as the loader produced it.
# `pages` (not `docs`) is used as the scratch name so it does not collide with
# the chunk list that the splitting step stores in `docs`.
documents = []
for path in pdf_paths:
    pages = PyPDFLoader(path).load()
    for d in pages:
        d.metadata["source"] = os.path.basename(d.metadata.get("source", path))
    documents.extend(pages)
print(f"Total pages loaded: {len(documents)}")



The following files will be loaded:
 - C:/Users/syk_5/Resume.pdf
 - C:/Users/syk_5/main_SS.pdf
Total pages loaded: 34


In [4]:
# Step 3: Cut the loaded documents into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
# `docs` now holds the chunks that get embedded and indexed in the next step.
docs = text_splitter.split_documents(documents)


In [5]:
# Step 4: Generate vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Fail early with a readable message: with no chunks (for example PDFs that
# yielded no extractable text) there is nothing to embed or index.
if not docs:
    raise ValueError(
        "No text chunks to index: the selected PDFs produced no extractable text."
    )

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)


In [13]:
import math
from typing import List, Tuple, Dict, Any
from copy import deepcopy
import numpy as np
from langchain_core.documents import Document

# Cosine similarity for two numpy vectors
def _cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    na = np.linalg.norm(a); nb = np.linalg.norm(b)
    if na == 0 or nb == 0: 
        return 0.0
    return float(np.dot(a, b) / (na * nb))

def _embed_texts(embeddings, texts: List[str]) -> np.ndarray:
    """Batch-embed texts -> (N, D) numpy array."""
    vecs = embeddings.embed_documents(texts)  # list[list[float]]
    return np.array(vecs, dtype=np.float32)

def _embed_query(embeddings, query: str) -> np.ndarray:
    return np.array(embeddings.embed_query(query), dtype=np.float32)

def _generate_candidate_windows(
    text: str,
    max_shift: int = 200,         # how far to shift left/right (in characters)
    step: int = 100,              # shift stride
    min_len: int = 300,           # min window length
    max_len: int = 1200,          # cap window length
) -> List[str]:
    """
    Create small variations (sub-windows) inside a chunk by trimming a bit from
    left/right. This is 'local' refinement without original doc offsets.
    """
    L = len(text)
    if L <= min_len:
        return [text]  # too short, keep as is

    # target length: clamp to [min_len, max_len]
    target = max(min(L, max_len), min_len)

    # base centered window indices
    base_start = max(0, (L - target) // 2)
    base_end = min(L, base_start + target)

    starts = set()
    # try centered + small shifts left/right
    for shift in range(-max_shift, max_shift + 1, step):
        s = base_start + shift
        s = max(0, min(s, max(0, L - target)))
        starts.add(s)

    # also try hugging left and right edges
    starts.add(0)
    starts.add(max(0, L - target))

    candidates = []
    for s in sorted(starts):
        e = min(L, s + target)
        candidates.append(text[s:e])
    # ensure original text as fallback candidate
    candidates.append(text if L <= max_len else text[:max_len])
    # dedup
    seen, out = set(), []
    for c in candidates:
        key = (len(c), hash(c[:120]))
        if key not in seen:
            seen.add(key); out.append(c)
    return out

def _generate_candidate_windows(
    text: str,
    max_shift: int = 300,         # maximum shift left/right (in characters)
    step: int = 50,               # shift stride (smaller = finer search)
    min_len: int = 300,           # minimum candidate window length
    max_len: int = 1200,          # maximum candidate window length
    length_scales=(0.6, 0.8, 1.0) # try scaled-down windows even if L is within [min,max]
) -> List[str]:
    L = len(text)
    candidates, seen = [], set()

    # If the chunk is too short, keep it as is (cannot expand without global offsets)
    if L <= min_len:
        return [text]

    for scale in length_scales:
        # Compute target length based on scale, clamped to [min_len, max_len]
        target = int(max(min_len, min(max_len, L * scale)))
        if target <= 0 or target > L:
            continue

        # Use center window as baseline, then shift left/right
        base = max(0, (L - target) // 2)
        starts = {0, max(0, L - target), base}
        for shift in range(-max_shift, max_shift + 1, step):
            s = max(0, min(base + shift, max(0, L - target)))
            starts.add(s)

        # Generate candidate windows
        for s in sorted(starts):
            e = min(L, s + target)
            seg = text[s:e]
            key = (len(seg), hash(seg[:160]))
            if key not in seen:
                seen.add(key)
                candidates.append(seg)

    # Always ensure the original or truncated version is included
    candidates.append(text if L <= max_len else text[:max_len])

    # Deduplicate by (length, partial hash)
    out, seen2 = [], set()
    for c in candidates:
        k = (len(c), hash(c[:200]))
        if k not in seen2:
            seen2.add(k)
            out.append(c)

    return out



def refine_topk_chunks(
    query: str,
    docs: List[Document],
    embeddings,
    max_shift: int = 200,
    step: int = 100,
    min_len: int = 300,
    max_len: int = 1200,
) -> Tuple[List[Document], List[Dict[str, Any]]]:
    """
    Refine top-k documents by generating candidate sub-windows inside each chunk,
    embedding them, and selecting the one with the highest cosine similarity to the query.

    The baseline ("orig") for a chunk is its full text, or its first `max_len`
    characters when it is longer. The baseline is always scored alongside the
    candidate windows, so `orig_score` is a real similarity and `improvement`
    is never computed against a placeholder value.

    Args:
        query: The user question; embedded once and compared to every window.
        docs: Retrieved documents to refine. An empty list returns ([], []).
        embeddings: Object providing `embed_query` and `embed_documents`.
        max_shift, step, min_len, max_len: Forwarded to
            `_generate_candidate_windows`.

    Returns:
        refined_docs: deep copies of the inputs with page_content replaced by the best window
        info: one dict per document with keys "source", "page", "orig_len",
            "refined_len", "orig_score", "best_score", "improvement", "candidates"
    """
    if not docs:
        return [], []

    q_vec = _embed_query(embeddings, query)

    refined_docs: List[Document] = []
    info: List[Dict[str, Any]] = []

    for d in docs:
        base_text = d.page_content or ""
        # baseline: raw chunk, or truncated if > max_len
        baseline = base_text if len(base_text) <= max_len else base_text[:max_len]

        cands = list(_generate_candidate_windows(
            base_text, max_shift=max_shift, step=step, min_len=min_len, max_len=max_len
        ))
        # Make sure the baseline is scored even if the generator did not emit it,
        # instead of silently reporting an original score of 0.0.
        if baseline not in cands:
            cands.append(baseline)
        orig_idx = cands.index(baseline)

        # embed candidate windows and compute similarity scores
        cand_vecs = _embed_texts(embeddings, cands)
        sims = [_cosine_sim(q_vec, v) for v in cand_vecs]

        # best candidate (first one wins on ties)
        best_idx = int(np.argmax(sims))
        best_text = cands[best_idx]
        best_score = sims[best_idx]
        orig_score = sims[orig_idx]

        # keep refined document; the input document is not modified
        rd = deepcopy(d)
        rd.page_content = best_text
        refined_docs.append(rd)

        # store detailed info
        meta = d.metadata or {}
        info.append({
            "source": meta.get("source"),
            "page": meta.get("page"),
            "orig_len": len(base_text),
            "refined_len": len(best_text),
            "orig_score": orig_score,
            "best_score": best_score,
            "improvement": best_score - orig_score,
            "candidates": len(cands),
        })

    return refined_docs, info



# Step 5: Build a controllable RAG chain with chat memory (LCEL) — add snippets

from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Retriever over the vector store; MMR is optional and gives more diverse hits.
_mmr_search_kwargs = {"k": 5, "fetch_k": 30, "lambda_mult": 0.5}
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs=_mmr_search_kwargs,
)

# Chat model used to write the answers.
llm = ChatOpenAI(
    temperature=0,
    timeout=60,
    max_retries=1,
)

# The system prompt is currently empty. A stricter, grounded variant is kept
# here for reference (disabled):
#   You must answer ONLY using the provided context.
#   If the answer is not contained in the context, say "I don't know."
#   Cite sources like [filename p.X] after claims when possible.
SYSTEM = ""

# Message order: system prompt, prior chat turns, then the question + context.
_prompt_messages = [
    ("system", SYSTEM),
    MessagesPlaceholder("chat_history"),
    ("human", "Question: {question}\n\nContext:\n{context}"),
]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)

def format_docs(docs, max_chars=1200):
    """Render documents as "[file p.N]" tagged text blocks separated by blank lines.

    Each document contributes a tag line built from its "source" file name
    (default "doc") and 1-based page number ("?" when the page is not an int),
    followed by its text cut to `max_chars` characters plus " ..." when longer.
    Documents whose (file name, page, hash of the first 120 characters) was
    already emitted are skipped.
    """
    emitted = set()
    blocks = []
    for doc in docs:
        metadata = doc.metadata or {}
        file_name = Path(metadata.get("source", "doc")).name
        page_index = metadata.get("page")
        page_label = page_index + 1 if isinstance(page_index, int) else "?"
        body = doc.page_content

        # light de-dup on file, page and the leading characters of the text
        fingerprint = (file_name, page_index, hash(body[:120]))
        if fingerprint not in emitted:
            emitted.add(fingerprint)
            shown = body[:max_chars] + " ..." if len(body) > max_chars else body
            blocks.append(f"[{file_name} p.{page_label}]\n{shown}")
    return "\n\n".join(blocks)

# Core pipeline:
from langchain_core.runnables import RunnableMap, RunnableLambda, RunnablePassthrough

# ... (your llm, prompt, format_docs, memory setup as before)

def _pick(key):
    """Build a step that forwards `x[key]` unchanged."""
    return lambda x: x[key]


def _refine_step(x):
    """Refine the retrieved docs inside their own chunks; pass question/history through."""
    refined_docs, meta = refine_topk_chunks(
        query=x["question"],
        docs=x["docs_raw"],
        embeddings=embeddings,     # reuse the existing OpenAIEmbeddings()
        max_shift=200,
        step=100,
        min_len=300,
        max_len=1200,
    )
    return {
        "docs": refined_docs,
        "refine_info": meta,
        "question": x["question"],
        "chat_history": x["chat_history"],
    }


def _source_entries(x):
    """List {"source", "page"} per refined doc; page is 1-based, or None if not an int."""
    entries = []
    for d in x["docs"]:
        meta = d.metadata or {}
        page = meta.get("page")
        entries.append({
            "source": meta.get("source"),
            "page": (page + 1) if isinstance(page, int) else None,
        })
    return entries


rag_core = (
    # 1) normalise the input: question plus (possibly absent) chat history
    RunnableMap({
        "question": _pick("question"),
        "chat_history": lambda x: x.get("chat_history", []),
    })
    # 2) retrieve the initial top-k docs for the question
    | RunnableMap({
        "docs_raw": lambda x: retriever.invoke(x["question"]),
        "question": _pick("question"),
        "chat_history": _pick("chat_history"),
    })
    # 3) local refinement (window shift/trim inside each chunk)
    | RunnableLambda(_refine_step)
    # 4) turn the refined docs into the LLM context string
    | RunnableMap({
        "context": lambda x: format_docs(x["docs"]),
        "question": _pick("question"),
        "chat_history": _pick("chat_history"),
        "docs": _pick("docs"),
        "refine_info": _pick("refine_info"),
    })
    # 5) side by side: answer + docs + snippets + sources + refinement meta
    | RunnableMap({
        "answer": (prompt | llm | StrOutputParser()),
        "docs": _pick("docs"),
        "snippets": lambda x: [d.page_content for d in x["docs"]],
        "sources": _source_entries,
        "refine_info": _pick("refine_info"),
    })
)

# Chat memory: one in-process message history per session id
_store = {}


def _get_history(session_id: str):
    """Return the history stored for `session_id`, creating an empty one on first use."""
    try:
        return _store[session_id]
    except KeyError:
        history = _store[session_id] = ChatMessageHistory()
        return history


# Which keys of the chain's input/output dicts carry the question, the prior
# turns, and the reply ("answer" is the key rag_core's final map produces).
_message_keys = {
    "input_messages_key": "question",
    "history_messages_key": "chat_history",
    "output_messages_key": "answer",
}

qa = RunnableWithMessageHistory(
    rag_core,
    get_session_history=_get_history,
    **_message_keys,
)



In [14]:
# Step 6: Interactive loop (blank line to exit)
session_id = "default_session"
MAX_PREVIEW = 240  # max characters of each refined snippet shown in the console


def _print_chat_result(res, max_preview=MAX_PREVIEW):
    """Print one chat turn: the answer, then sources, snippet previews and
    per-chunk refinement scores for whichever of those keys `res` contains."""
    print("Bot:", res["answer"], "\n")

    # Show normalized sources
    if "sources" in res:
        print("Sources:")
        for s in res["sources"]:
            p = f" p.{s['page']}" if s.get("page") else ""
            print(f"  - {s['source']}{p}")
        print()

    # Show refined snippets (trimmed)
    if "snippets" in res:
        print("Refined snippets:")
        for i, snip in enumerate(res["snippets"], 1):
            preview = snip if len(snip) <= max_preview else snip[:max_preview] + " ..."
            print(f"  [{i}] {preview}")
        print()

    # Show refinement meta (scores, lengths)
    if "refine_info" in res:
        print("Refinement details (per chunk):")
        for i, inf in enumerate(res["refine_info"], 1):
            src = inf.get("source")
            pg = inf.get("page")
            print(f"  [{i}] {src} p.{pg+1 if isinstance(pg,int) else '?'} | "
                  f"orig={inf['orig_score']:.4f} -> best={inf['best_score']:.4f} "
                  f"(Δ={inf['improvement']:.4f}) | len {inf['orig_len']} -> {inf['refined_len']}")
        print()


print("Chat mode started. Press Enter on an empty line to exit.\n")
while True:
    try:
        query = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt at the prompt ends the chat the same
        # way a blank line does, instead of surfacing a traceback.
        query = ""
    if query == "":
        print("Bye.")
        break

    res = qa.invoke(
        {"question": query},
        config={"configurable": {"session_id": session_id}}
    )
    _print_chat_result(res)
    


Chat mode started. Press Enter on an empty line to exit.



You:  What is statistical feasibility?


Bot: Statistical feasibility refers to the ability to perform statistical analysis or inference on a given problem or dataset within certain parameter regimes. It involves determining whether statistical methods can be effectively applied to the data at hand to draw meaningful conclusions. In the context provided, it is mentioned that there are problems that are statistically feasible, meaning that statistical analysis can be conducted, but there may not be computationally efficient procedures available to solve them. 

Sources:
  - main_SS.pdf p.8
  - main_SS.pdf p.28
  - main_SS.pdf p.23
  - Resume.pdf p.1
  - main_SS.pdf p.19

Refined snippets:
  [1] condition ∥R∥F →∞ is therefore not sufficient.
2.2 Computational Feasibility
The results in Section 2.1 provides a necessary condition for statistical detectability. There are, however, numerous
problems that are statistically feasible with ...
  [2] Van der Vaart, A. W. (2000). Asymptotic statistics, Volume 3. Cambridge university pres

You:  Comparison with compuational feasibility?


Bot: Statistical feasibility and computational feasibility are related concepts but focus on different aspects of a problem:

1. Statistical feasibility: Refers to the ability to perform statistical analysis or inference on a given problem or dataset within certain parameter regimes. It involves determining whether statistical methods can be effectively applied to the data at hand to draw meaningful conclusions. In the context provided, it is mentioned that there are problems that are statistically feasible, meaning that statistical analysis can be conducted.

2. Computational feasibility: Refers to the ability to solve a problem using computational methods within reasonable time and resources. It involves determining whether there are computationally efficient procedures available to solve a given problem. In the context provided, it is mentioned that there are problems that are statistically feasible but may not have known computationally efficient procedures for solving them.

In su

You:  


Bye.
