In [1]:
import os
from langchain_community.document_loaders import NotebookLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_chunk_ipynb(ipynb_path):
    loader = NotebookLoader(
        ipynb_path,
        include_outputs=True,
        max_output_length=500
    )
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(documents)

def load_and_chunk_pdf(pdf_path):
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        add_start_index=True,
    )
    return splitter.split_documents(documents)

# 🔁 Recursively walk through ./lectures and process .ipynb and .pdf
ipynb_dir = './lectures/'
all_chunks = []

for root, dirs, files in os.walk(ipynb_dir):
    for filename in files:
        file_path = os.path.join(root, filename)
        rel_path = os.path.relpath(file_path, start=ipynb_dir)

        if filename.endswith(".ipynb"):
            print(f"Processing notebook: {rel_path}")
            chunks = load_and_chunk_ipynb(file_path)

        elif filename.endswith(".pdf"):
            print(f"Processing PDF: {rel_path}")
            chunks = load_and_chunk_pdf(file_path)

        else:
            continue  # Skip non-supported files

        for chunk in chunks:
            chunk.metadata["source"] = rel_path
        all_chunks.extend(chunks)

print(f"\n✅ Total chunks created: {len(all_chunks)}")


Processing notebook: DSCI_554/appendix_blocking_and_stratification.ipynb
Processing notebook: DSCI_554/lecture1_multiple_comparisons.ipynb
Processing notebook: DSCI_554/lecture7_obs_sampling_schemes.ipynb
Processing notebook: DSCI_554/appendix-reg-mindmap.ipynb
Processing notebook: DSCI_554/appendix-causality-exp-cheatsheet.ipynb
Processing notebook: DSCI_554/lecture3_randomization_and_blocking.ipynb
Processing notebook: DSCI_554/appendix-reg-cheatsheet.ipynb
Processing notebook: DSCI_554/appendix-greek-alphabet.ipynb
Processing notebook: DSCI_554/lecture5_more_power_early_stopping.ipynb
Processing notebook: DSCI_554/lecture8_match_constrasts_wrapup.ipynb
Processing notebook: DSCI_554/lecture2_simpson_confounding.ipynb
Processing notebook: DSCI_554/appendix-dist-cheatsheet.ipynb
Processing notebook: DSCI_554/lecture4_more_blocking_and_power.ipynb
Processing notebook: DSCI_554/appendix-prob-cheatsheet.ipynb
Processing notebook: DSCI_554/lecture6_obs_stratifying_modelling.ipynb
Processin

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# You'll need to choose one of these for your LLM
# from langchain_community.llms import HuggingFacePipeline # For local LLMs
# from langchain_openai import ChatOpenAI # For OpenAI API
# from langchain_google_genai import ChatGoogleGenerativeAI # For Google Gemini API


# --- 1. Load the Embeddings Model (MUST be the same as used for indexing) ---
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Save chunks to Chroma DB
vector_store = Chroma.from_documents(
    documents=all_chunks,
    embedding=embeddings,
    collection_name="example_collection",
    persist_directory="./chroma_langchain_db"
)

print("Vector store saved successfully.")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from .autonotebook import tqdm as notebook_tqdm


Vector store saved successfully.


In [6]:
# --- 3. Define the Retriever ---
# This converts your vector store into a retriever
# search_kwargs={"k": 3} means it will retrieve the top 3 most relevant chunks
retriever = vector_store.as_retriever(search_kwargs={"k": 3})


In [7]:
from langchain_openai import ChatOpenAI
import os
# Set your OpenAI API key as an environment variable (e.g., in your .bashrc or .zshrc)
# export OPENAI_API_KEY="your_api_key_here"
if os.environ.get("OPENAI_API_KEY"):
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)
    print("Using OpenAI GPT-3.5-turbo.")
else:
    print("OPENAI_API_KEY environment variable not set. Cannot use OpenAI LLM.")
    llm = None

Using OpenAI GPT-3.5-turbo.


In [8]:
# --- 5. Set up the RAG Chain ---
if llm:
    # Define a custom prompt template for the LLM
    # This guides the LLM on how to use the context.
    template = """Use the following pieces of context from my lecture notes to answer the question at the end.
    If you don't know the answer based on the provided context, just say that you don't know.
    Do not make up an answer or use outside knowledge.
    Be concise and clear in your response.

    Context:
    {context}

    Question: {question}

    Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    # Create the RetrievalQA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff", # 'stuff' means it will combine all retrieved docs into one prompt.
        retriever=retriever,
        return_source_documents=True, # This will show you which chunks were used
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT} # Pass your custom prompt
    )

    # --- 6. Ask a Question! ---
    query = "What is confounding variables?" # Example query
    # Replace with a question relevant to your lecture content

    print(f"\nAsking: {query}")
    response = qa_chain.invoke({"query": query})

    print("\n--- Generated Answer ---")
    print(response["result"])

    print("\n--- Sources Used ---")
    if response["source_documents"]:
        for doc in response["source_documents"]:
            # LangChain's Document objects have .page_content and .metadata
            print(f"  Source: {doc.metadata.get('source', 'Unknown File')}")
            # print(f"  Content Snippet: {doc.page_content[:200]}...") # Uncomment to see a snippet of the chunk
    else:
        print("No source documents were retrieved for this query.")
else:
    print("\nRAG chain cannot be run because no LLM was successfully initialized.")



Asking: What is confounding variables?

--- Generated Answer ---
Confounding variables are third factors associated with the exposure that independently affect the risk of developing the disease, and they can affect the relationship between the exposure and the outcome in a regression framework.

--- Sources Used ---
  Source: appendix-causality-exp-cheatsheet.ipynb
  Source: appendix-causality-exp-cheatsheet.ipynb
  Source: lecture7_obs_sampling_schemes.ipynb
