In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [2]:
# imports for langchain, Chroma, and plotly

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from openai import OpenAI

from langchain_community.embeddings import HuggingFaceEmbeddings


In [3]:
import uuid
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import MessagesState, StateGraph, START
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser


In [4]:
# Cost matters for this project, so a low-cost model is used.
# The client below targets an OpenAI-compatible endpoint on localhost.

MODEL = "gpt-oss:20b"
openai = OpenAI(
    api_key="ollama",  # presumably a placeholder value for the local server — confirm
    base_url="http://localhost:11434/v1",
)

# Smoke test: send one trivial prompt and print the reply
response = openai.chat.completions.create(
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    model=MODEL,
)
print(response.choices[0].message.content)

# Directory name used later when the Chroma vector store is persisted
db_name = "vector_db"

2 + 2 equals **4**.


In [5]:
# Read in documents using LangChain's loaders.
# Every Markdown file in each sub-folder of the knowledge base is loaded, and the
# sub-folder's name is recorded on each document as metadata["doc_type"].

# Keep directories only: glob("kvanti-data/*") also matches plain files sitting directly
# in the folder, and DirectoryLoader refuses a path that is not a directory.
# Sorting makes the load order (and therefore the chunk order) reproducible.
folders = sorted(path for path in glob.glob("kvanti-data/*") if os.path.isdir(path))

# With thanks to CG and Jon R, students on the course, for this fix needed for some users
text_loader_kwargs = {'encoding': 'utf-8'}
# If that doesn't work, some Windows users might need to uncomment the next line instead
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

# Fail here with a clear message instead of much later inside the splitter or vector store
if not documents:
    raise FileNotFoundError(
        f"No .md documents found under 'kvanti-data/' (working directory: {os.getcwd()})"
    )

In [6]:
# Cut the loaded documents into overlapping chunks ready for embedding
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(documents)

In [7]:
# Number of chunks produced by the splitter (displayed as this cell's output)
len(chunks)

45

In [8]:
# Gather the distinct doc_type labels carried by the chunks and list them
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
print(f"Document types found: {', '.join(doc_types)}")

Document types found: Hungarian


In [9]:


# Embedding model: sentence-transformers/all-MiniLM-L6-v2 loaded through HuggingFaceEmbeddings
# NOTE(review): the recorded output echoes the next line, which looks like a warning
# raised on it (possibly a deprecation) — confirm and migrate if so.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Delete the existing collection if the persist directory is already present
# (presumably so that re-running this cell does not add the same chunks twice — confirm)
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create the Chroma vectorstore from the chunks, persisted under `db_name`
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name
)

# `_collection` is a private attribute of the store; it is read here only to report the count
print(f"Vectorstore created with {vectorstore._collection.count()} documents")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Vectorstore created with 45 documents


In [10]:
# Get one vector and find how many dimensions it has

# `_collection` is a private attribute of the store; later cells reuse `collection`
collection = vectorstore._collection
# Fetch a single record with its embedding included, then take that embedding
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 384 dimensions


In [11]:
# Prework: pull the stored vectors, texts and metadata back out of Chroma for plotting.
# NOTE: this cell rebinds `documents` (now the stored texts) and `doc_types` (now one
# label per vector), so earlier cells that used those names must be re-run from the
# loading cell onwards before they are executed again.

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]

# One colour per distinct doc_type, assigned in sorted order and cycling through the
# palette when there are more types than colours. The previous lookup only knew the
# single type 'Hungarian' and raised ValueError for any other label.
palette = ['blue', 'green', 'red', 'orange']
color_by_type = {t: palette[i % len(palette)] for i, t in enumerate(sorted(set(doc_types)))}
colors = [color_by_type[t] for t in doc_types]

In [12]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, perplexity=5, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering a point shows its type and the first 100 characters
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles of a 2D figure belong to layout.xaxis / layout.yaxis. `scene` configures
# 3D subplots only, so titles placed there never showed up on this plot.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Same vectors again, this time projected onto three t-SNE components

tsne = TSNE(n_components=3, perplexity=5, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Start from an empty figure and attach the 3D scatter trace to it
fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    mode='markers',
))

# Title, 3D axis labels, canvas size and margins
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [14]:
# ============================================================
# CELL 1 — Imports and Environment Setup
# ============================================================

# Route OpenAI-compatible clients to the local Ollama server via environment variables
os.environ.update({
    "OPENAI_API_KEY": "ollama",
    "OPENAI_BASE_URL": "http://localhost:11434/v1",
})

MODEL = "gpt-oss:20b"

In [15]:
# ============================================================
# CELL 2 — Model and Retriever
# ============================================================

# Initialize the chat model via the OpenAI-compatible interface
# (presumably it picks up OPENAI_API_KEY / OPENAI_BASE_URL set in the previous cell — confirm)
llm = ChatOpenAI(model=MODEL, temperature=0.5)

# Retriever over the Chroma `vectorstore` created in an earlier cell;
# k=3 is passed through as a search argument for every query
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


In [16]:
# ============================================================
# CELL 3 — Define LangGraph Workflow
# ============================================================

# Graph builder whose state follows the MessagesState schema
workflow = StateGraph(state_schema=MessagesState)

def call_rag_model(state: MessagesState):
    """Answer the latest user question using retrieved context and the prior conversation.

    Args:
        state: Graph state; ``state["messages"]`` is the message list of the
            current thread and must contain at least one ``HumanMessage``.

    Returns:
        ``{"messages": response}``, where ``response`` is the value returned by
        ``llm.invoke``.

    Raises:
        IndexError: If ``state["messages"]`` contains no ``HumanMessage``.
    """
    messages = state["messages"]

    # Locate the most recent user message; everything before it is earlier conversation
    human_positions = [i for i, m in enumerate(messages) if isinstance(m, HumanMessage)]
    last_human = human_positions[-1]
    question = messages[last_human].content
    history = list(messages[:last_human])

    # Retrieve relevant documents from Chroma
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)

    # Define and format prompt
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are a data scientist aid for students to help with quanitative methods. Your job is to provide context to their question and help fix their R code. Greet the students on the start of the interraction. If you do not know the answer to the question says so. Válaszolj magyarul. Use the context below to answer."),
        ("human", "Question: {question}\n\nContext:\n{context}")
    ])
    system_msg, human_msg = prompt.format_messages(question=question, context=context)

    # Send the earlier turns along with the new question. Previously only the system
    # prompt and the latest question reached the model, so the conversation kept in the
    # graph state was never used and follow-up questions lost their context.
    # The history is not trimmed here, so very long chats can grow the prompt.
    response = llm.invoke([system_msg, *history, human_msg])
    return {"messages": response}

# Wire the graph: register the node, then route the entry point to it
workflow.add_node("model", call_rag_model)
workflow.add_edge(START, "model")


<langgraph.graph.state.StateGraph at 0x32c964c50>

In [17]:
# ============================================================
# CELL 4 — Memory and Graph Compilation
# ============================================================

# MemorySaver is the checkpointer that stores conversation state between turns
# (presumably in process memory only, i.e. lost on kernel restart — confirm)
memory = MemorySaver()

# Compile the workflow into an executable app
app = workflow.compile(checkpointer=memory)

# Each thread_id identifies one conversation in the checkpointer.
# NOTE(review): a single thread_id is generated here and stored in the module-level
# `config`, which `chat` passes on every call — so every chat session served by this
# kernel appears to share one conversation thread. Confirm whether that is intended.
thread_id = str(uuid.uuid4())
config = {"configurable": {"thread_id": thread_id}}


In [18]:
# ============================================================
# CELL 5 — Gradio Chat Interface
# ============================================================

def chat(message, history):
    """Gradio chat callback: run one user turn through the LangGraph app.

    Args:
        message: Text of the user's latest message.
        history: Chat history supplied by Gradio. Unused here; the graph is
            invoked with the module-level ``config`` instead.

    Returns:
        The ``content`` of the last message in the graph's final state, or a
        string starting with ``"Error:\\n"`` describing the exception if
        anything raised.
    """
    import traceback

    try:
        # invoke() returns the final state directly, so there is no need to drain a
        # stream and rely on a loop variable that is unbound when nothing is yielded.
        final_state = app.invoke({"messages": [HumanMessage(content=message)]}, config)
        return final_state["messages"][-1].content
    except Exception as e:
        # Report the failure in the chat window instead of raising into Gradio
        return "Error:\n" + "".join(traceback.format_exception_only(type(e), e))

# Build the chat UI around `chat` and start the Gradio server.
# The title previously read "Kvantitaív" (missing "t").
gr.ChatInterface(
    fn=chat,
    type="messages",
    title="Kvantitatív Módszerek",
    description="Kérdezz elméleti vagy R kód kérdéseket az anyaggal kapcsolatban!",
).launch()


# Quantitative Methods (English)
# Chat with your local knowledge base. Context retrieval + persistent memory using LangGraph.

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [42]:
from langchain_core.callbacks import StdOutCallbackHandler

# Create a callback handler that prints events to stdout;
# it is attached to the model and passed per call in the debug cells below
callback = StdOutCallbackHandler()


In [43]:
from langchain_openai import ChatOpenAI

# NOTE(review): this rebinds the global `llm` that `call_rag_model` looks up at call
# time, so after running this cell the chat app also uses this instance
# (temperature 0.7, callback attached) — confirm that is intended, or use a separate name.
llm = ChatOpenAI(
    model=MODEL,
    temperature=0.7,
    callbacks=[callback],   # <-- attach callback
)


In [44]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate

def debug_rag_pipeline(question):
    """Run retrieval, prompt building and generation for one question, printing each stage.

    Args:
        question: Query text given to the retriever and inserted into the prompt.

    Returns:
        The ``content`` of the model's reply.
    """
    # Stage 1: retrieval, with a one-line label and a 250-character preview per hit
    print("\n--- RETRIEVAL STEP ---")
    retrieved = retriever.invoke(question)
    print(f"Retrieved {len(retrieved)} docs")
    for rank, doc in enumerate(retrieved, start=1):
        fallback = doc.metadata.get('doc_type', 'unknown')
        label = doc.metadata.get('source', fallback)
        print(f"[{rank}] {label}")
        preview = doc.page_content[:250].replace("\n", " ")
        print(preview + "...\n")

    context = "\n\n".join([doc.page_content for doc in retrieved])

    # Stage 2: prompt construction (only the system message is echoed)
    print("\n--- PROMPT CONSTRUCTION ---")
    template = ChatPromptTemplate.from_messages([
        ("system", "You are a helpful assistant. Use the context below to answer."),
        ("human", "Question: {question}\n\nContext:\n{context}"),
    ])
    formatted = template.format_messages(question=question, context=context)
    print(formatted[0].content + "\n")

    # Stage 3: generation, with the stdout callback passed for this call
    print("\n--- MODEL INVOCATION ---")
    reply = llm.invoke(formatted, config={"callbacks": [callback]})

    print("\n--- RAW MODEL RESPONSE ---")
    answer = reply.content
    print(answer)
    return answer


In [45]:
# Run the debug pipeline on a sample question and print the returned answer.
# NOTE(review): this rebinds the global `response` first assigned in the model smoke-test cell.
response = debug_rag_pipeline("Who received the prestigious IIOTY award in 2023?")
print("\nFinal Answer:\n", response)



--- RETRIEVAL STEP ---
Retrieved 5 docs
[1] knowledge-base/employees/Alex Chen.md
## Annual Performance History - **2020:**     - Completed onboarding successfully.     - Met expectations in delivering project milestones.     - Received positive feedback from the team leads.  - **2021:**     - Achieved a 95% success rate in projec...

[2] knowledge-base/employees/Samuel Trenton.md
## Annual Performance History - **2023:** Rating: 4.5/5     *Samuel exceeded expectations, successfully leading a cross-departmental project on AI-driven underwriting processes.*  - **2022:** Rating: 3.0/5     *Some challenges in meeting deadlines an...

[3] knowledge-base/employees/Oliver Spencer.md
## Annual Performance History - **2018**: **3/5** - Adaptable team player but still learning to take initiative. - **2019**: **4/5** - Demonstrated strong problem-solving skills, outstanding contribution on the claims project. - **2020**: **2/5** - S...

[4] knowledge-base/employees/Jordan K. Bishop.md
## Annual