In [1]:
!pip install -U langchain langchain-community openai faiss-cpu tiktoken pypdf



In [1]:
# Step 0: Import dependencies
import os
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [2]:
# Step 1: Load API Key from .env file & load API key
import os
from dotenv import load_dotenv

# Load the contents of the .env file into system environment variables
load_dotenv()

# Retrieve the key from environment variables
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment variables.")

print("✅ API Key loaded successfully (will not be displayed)")

# Windows-specific: avoid MKL/OpenMP conflicts
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

✅ API Key loaded successfully (will not be displayed)


In [3]:
# Step 2: Select multiple PDFs via system dialog (tkinter)
from tkinter import Tk, filedialog
from langchain_community.document_loaders import PyPDFLoader
import os

# open dialog
root = Tk(); root.withdraw()
pdf_paths = filedialog.askopenfilenames(
    title="Select PDF files",
    filetypes=[("PDF files", "*.pdf")]
)
root.destroy()

pdf_paths = list(pdf_paths)
if not pdf_paths:
    raise SystemExit("No PDF selected. Exiting.")

print("The following files will be loaded:")
for p in pdf_paths:
    print(" -", p)

# load all, keep filename+page metadata
documents = []
for path in pdf_paths:
    docs = PyPDFLoader(path).load()
    for d in docs:
        d.metadata["source"] = os.path.basename(d.metadata.get("source", path))
    documents.extend(docs)
print(f"Total pages loaded: {len(documents)}")



The following files will be loaded:
 - C:/Users/syk_5/main_SS.pdf
 - C:/Users/syk_5/Resume.pdf
Total pages loaded: 34


In [4]:
# Step 3: Split text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)


In [5]:
# Step 4: Generate vector database
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)


In [16]:
# Step 5: Build a controllable RAG chain with chat memory (LCEL)

from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableMap, RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# 1) Use your existing vectorstore to create a retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # tune k later

# 2) Chat model is better for multi-turn
#llm = ChatOpenAI(temperature=0)  # you can set model=... if needed
llm = ChatOpenAI(temperature=0, timeout=60, max_retries=1)

# 3) Prompt with a strict system rule and a chat history placeholder
#SYSTEM = """You must answer ONLY using the provided context.
#If the answer is not contained in the context, say "I don't know."
#Cite sources like [filename p.X] after claims when possible."""
SYSTEM = """"""
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM),
    MessagesPlaceholder("chat_history"),
    ("human", "Question: {question}\n\nContext:\n{context}")
])

# 4) Format retrieved docs (filename + page for traceability, trimmed for length)
def format_docs(docs, max_chars=1200):
    rows, seen = [], set()
    for d in docs:
        meta = d.metadata or {}
        name = Path(meta.get("source", "doc")).name
        page = meta.get("page")
        tag = f"[{name} p.{(page + 1) if isinstance(page, int) else '?'}]"
        text = d.page_content
        key = (name, page, hash(text[:120]))  # light de-dup
        if key in seen:
            continue
        seen.add(key)
        if len(text) > max_chars:
            text = text[:max_chars] + " ..."
        rows.append(f"{tag}\n{text}")
    return "\n\n".join(rows)

# 5) LCEL chain: retrieve -> stitch -> prompt -> LLM -> parse
# ✅ FIX: Extract fields instead of passing the whole dict through
rag_core = (
    # Step 1: Pass through the original question and chat history
    RunnableMap({
        "question": lambda x: x["question"],
        "chat_history": lambda x: x.get("chat_history", []),
    })
    # Step 2: Retrieve relevant documents using the question
    | RunnableMap({
        "docs": lambda x: retriever.invoke(x["question"]),   # Use .invoke to avoid deprecation warnings
        "question": lambda x: x["question"],
        "chat_history": lambda x: x["chat_history"],
    })
    # Step 3: Format retrieved documents into a context string for the LLM
    | RunnableMap({
        "context": lambda x: format_docs(x["docs"]),
        "question": lambda x: x["question"],
        "chat_history": lambda x: x["chat_history"],
        "docs": lambda x: x["docs"],  # Keep docs in the pipeline for later source display
    })
    # Step 4: In parallel:
    #   - Generate the answer using the prompt + LLM + output parser
    #   - Pass the docs through unchanged so they are available in the final output
    | RunnableMap({
        "answer": (prompt | llm | StrOutputParser()),
        "docs":   lambda x: x["docs"],
    })
)


# 6) Add per-session chat history (true multi-turn memory)
_store = {}  # in-memory; swap with Redis/SQLite for persistence

def _get_history(session_id: str):
    if session_id not in _store:
        _store[session_id] = ChatMessageHistory()
    return _store[session_id]

qa = RunnableWithMessageHistory(
    rag_core,
    get_session_history=_get_history,
    input_messages_key="question",
    history_messages_key="chat_history",
    output_messages_key="answer",  # <-- add this line
)




In [17]:
# Step 6: Interactive loop (blank line to exit)
session_id = "default_session"  # change if you need multiple concurrent sessions

print("Chat mode started. Press Enter on an empty line to exit.\n")
while True:
    query = input("You: ").strip()
    if query == "":
        print("Bye.")
        break

    # Invoke the chain with session-bound memory
    res = qa.invoke(
        {"question": query},
        config={"configurable": {"session_id": session_id}}
    )

    # res is {"answer": str, "docs": List[Document]}
    print("Bot:", res["answer"], "\n")

    # (Optional) print sources for transparency
    for d in res["docs"]:
        meta = d.metadata or {}
        name = Path(meta.get("source", "doc")).name
        page = meta.get("page")
        p = (page + 1) if isinstance(page, int) else "?"
        print(f"  [source] {name} p.{p}")
    print()



Chat mode started. Press Enter on an empty line to exit.



You:  Statistically feasible?


Bot: The term "statistically feasible" in this context refers to problems for which there exist statistical methods or procedures that can theoretically solve the problem under certain parameter regimes. However, it is important to note that just because a problem is statistically feasible does not necessarily mean that there are computationally efficient procedures available to solve it. In the context provided, examples of statistically feasible but computationally challenging problems include community detection, sparse PCA, and estimation in spiked tensor models. 

  [source] main_SS.pdf p.8
  [source] main_SS.pdf p.28
  [source] main_SS.pdf p.28
  [source] main_SS.pdf p.5



You:  Compuationally feasible?


Bot: In the context provided, the term "computationally feasible" refers to problems for which there exist efficient algorithms or procedures that can be used to solve them within a reasonable amount of time and computational resources. The text mentions that there are numerous problems that are statistically feasible but lack computationally efficient procedures for solving them. Examples include community detection, sparse PCA, and estimation in spiked tensor models.

The text also discusses the transformation of the independence testing problem to the planted clique detection problem, which is known to be computationally hard. By designing runtime-efficient algorithms, the researchers were able to identify structural patterns between graphs and successfully conduct statistical and computational feasible independence tests on data sets such as C.elegans and Wikipedia data. 

  [source] main_SS.pdf p.8
  [source] main_SS.pdf p.26
  [source] main_SS.pdf p.2
  [source] Resume.pdf p.1



You:  What is the trade-off between them?


Bot: The trade-off between statistical feasibility and computational feasibility is a common consideration in data analysis and research. 

Statistical feasibility refers to the ability to theoretically solve a problem using statistical methods or procedures under certain parameter regimes. This involves ensuring that the statistical methods used are appropriate for the data and that the conclusions drawn are valid and reliable.

Computational feasibility, on the other hand, refers to the ability to solve a problem efficiently using computational resources. This involves developing algorithms or procedures that can handle large datasets and complex analyses in a reasonable amount of time.

The trade-off between statistical feasibility and computational feasibility often arises when dealing with complex data analysis problems. In some cases, statistical methods that are theoretically sound may be computationally intensive and difficult to implement on large datasets. On the other hand, 

You:  What issues might happen for multiple graphs?


Bot: When dealing with multiple graphs, such as three or more graphs, several issues may arise, especially in the context of independence testing for inhomogeneous random graphs. Some of the key issues include:

1. Joint Distribution Complexity: When working with multiple graphs, the joint distributions of the graphs cannot always be specified using only the marginal distributions and pairwise edge correlations. This complexity arises because the relationships between the graphs are more intricate and cannot be fully captured by pairwise correlations alone.

2. Statistical vs. Computational Trade-off: Analyzing multiple graphs can lead to a trade-off between statistical detectability and computational complexity. In some cases, the correlations between the graphs may be statistically detectable, but the algorithms required to analyze them may have running times that are quasi-polynomial in the number of vertices. Balancing statistical power with computational efficiency becomes a chall

You:  Why not involved Omnibus?


Bot: The use of Omnibus matrix in the context of recovering latent positions from multiple random graphs is a powerful technique that can overcome limitations of existing approaches that only handle the same distribution. The Omnibus matrix approach allows for the integration of information from multiple graphs with different distributions, providing a more comprehensive and robust analysis.

However, the specific reason why the Omnibus approach was not involved in the research project described in the resume and related documents is not explicitly stated. Possible reasons could include:

1. Scope of the Research: The research project may have focused on specific aspects of inference and pattern recognition on multiple MRI scans, and the use of the Omnibus matrix approach may not have been deemed necessary or relevant to the research objectives.

2. Methodological Considerations: The researchers may have chosen alternative methods or algorithms that were more suitable for the specific 

You:  


Bye.
