In [32]:
import os
import glob
import numpy as np
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
import gradio as gr
import time
from sklearn.manifold import TSNE
import plotly.graph_objects as go

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_ollama import ChatOllama

# What will we do

- Part A: We will divide our documents into CHUNKS
- Part B: We will encode our CHUNKS into VECTORS and put in Chroma
- Part C: We will visualize our vectors

## 1. Devide our documents into CHUNKS

In [39]:
# First load all our knowledge-base folder
folders = glob.glob("knowledge-base/**/*")
print(f"Found {len(folders)} files in the knowledge base") # 17 files: 

## How many characters in all the documents?
entire_knowledge_base = ""

for file_path in folders:
    with open(file_path, 'r', encoding='utf-8') as f:
        entire_knowledge_base += f.read()
        entire_knowledge_base += "\n\n"

print(f"Total characters in knowledge base: {len(entire_knowledge_base):,}") # 101,404 words

Found 17 files in the knowledge base
Total characters in knowledge base: 101,404


In [40]:
# Read in documents using LangChain's loaders
# Take everything in all the sub-folders of our knowledgebase

folders = glob.glob("knowledge-base/*")

#text_loader_kwargs = {'encoding': 'utf-8'}
# N·∫øu d√≤ng tr√™n kh√¥ng ho·∫°t ƒë·ªông, ng∆∞·ªùi d√πng Windows c√≥ th·ªÉ d√πng d√≤ng d∆∞·ªõi thay th·∫ø
text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print("Total documents loaded:", len(documents))

Total documents loaded: 17


In [7]:
documents[0]

Document(metadata={'source': 'knowledge-base/visas/D-4 General Trainee Visa.md', 'doc_type': 'visas'}, page_content='# Visa H·ªçc T·∫≠p D-4 (D-4 General Trainee Visa)\n\n## T·ªïng Quan\nVisa D-4 l√† lo·∫°i visa d√†nh cho c√°c ho·∫°t ƒë·ªông h·ªçc t·∫≠p, nghi√™n c·ª©u v√† ƒë√†o t·∫°o ng·∫Øn h·∫°n t·∫°i H√†n Qu·ªëc, bao g·ªìm h·ªçc ti·∫øng H√†n, c√°c kh√≥a ƒë√†o t·∫°o ngh·ªÅ v√† ch∆∞∆°ng tr√¨nh trao ƒë·ªïi sinh vi√™n.\n\n## Ph√¢n Lo·∫°i Visa D-4\n\n### D-4-1: H·ªçc Ti·∫øng H√†n\n- Kh√≥a h·ªçc ti·∫øng H√†n t·∫°i c√°c tr∆∞·ªùng ƒë·∫°i h·ªçc\n- Th·ªùi gian: 3 th√°ng - 2 nƒÉm\n\n### D-4-2: Nghi√™n C·ª©u\n- Ch∆∞∆°ng tr√¨nh nghi√™n c·ª©u ng·∫Øn h·∫°n\n- Th·ª±c t·∫≠p nghi√™n c·ª©u\n\n### D-4-6: ƒê√†o T·∫°o Ngh·ªÅ\n- C√°c kh√≥a ƒë√†o t·∫°o k·ªπ nƒÉng ngh·ªÅ nghi·ªáp\n- Ch∆∞∆°ng tr√¨nh ƒë√†o t·∫°o k·ªπ thu·∫≠t\n\n### D-4-7: Ch∆∞∆°ng Tr√¨nh Kh√¥ng C·∫•p B·∫±ng\n- Ch∆∞∆°ng tr√¨nh trao ƒë·ªïi sinh vi√™n\n- Kh√≥a h·ªçc ng·∫Øn h·∫°n\n\n## ƒê·ªëi T∆∞·ª£ng √Åp D·ª•ng\n- Sinh vi√™n mu·ªën h·ªçc ti·∫øng H

In [41]:
# Devide into CHUNKS
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # Slightly smaller chunks for better retrieval
    chunk_overlap=200,  # Reduced overlap for performance
    separators=["\n\n", "\n", ". ", " ", ""]  # Better separation
)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")
print(f"First chunk:\n\n{chunks[0]}")

Created 128 chunks
First chunk:

page_content='# Visa H·ªçc T·∫≠p D-4 (D-4 General Trainee Visa)

## T·ªïng Quan
Visa D-4 l√† lo·∫°i visa d√†nh cho c√°c ho·∫°t ƒë·ªông h·ªçc t·∫≠p, nghi√™n c·ª©u v√† ƒë√†o t·∫°o ng·∫Øn h·∫°n t·∫°i H√†n Qu·ªëc, bao g·ªìm h·ªçc ti·∫øng H√†n, c√°c kh√≥a ƒë√†o t·∫°o ngh·ªÅ v√† ch∆∞∆°ng tr√¨nh trao ƒë·ªïi sinh vi√™n.

## Ph√¢n Lo·∫°i Visa D-4

### D-4-1: H·ªçc Ti·∫øng H√†n
- Kh√≥a h·ªçc ti·∫øng H√†n t·∫°i c√°c tr∆∞·ªùng ƒë·∫°i h·ªçc
- Th·ªùi gian: 3 th√°ng - 2 nƒÉm

### D-4-2: Nghi√™n C·ª©u
- Ch∆∞∆°ng tr√¨nh nghi√™n c·ª©u ng·∫Øn h·∫°n
- Th·ª±c t·∫≠p nghi√™n c·ª©u

### D-4-6: ƒê√†o T·∫°o Ngh·ªÅ
- C√°c kh√≥a ƒë√†o t·∫°o k·ªπ nƒÉng ngh·ªÅ nghi·ªáp
- Ch∆∞∆°ng tr√¨nh ƒë√†o t·∫°o k·ªπ thu·∫≠t

### D-4-7: Ch∆∞∆°ng Tr√¨nh Kh√¥ng C·∫•p B·∫±ng
- Ch∆∞∆°ng tr√¨nh trao ƒë·ªïi sinh vi√™n
- Kh√≥a h·ªçc ng·∫Øn h·∫°n

## ƒê·ªëi T∆∞·ª£ng √Åp D·ª•ng
- Sinh vi√™n mu·ªën h·ªçc ti·∫øng H√†n
- Ng∆∞·ªùi tham gia ch∆∞∆°ng tr√¨nh ƒë√†o t·∫°o ngh·ªÅ
- Sinh vi√™n trao ƒë·ªïi qu·ªëc t·∫

## 2. Encode CHUNKS into vectors and store in Chroma

we already setup and save hugging face into .env file now time to use that

In [7]:
load_dotenv(override=True)

True

In [42]:
# choose an embedding model
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [43]:
# ƒê·∫∑t t√™n cho database vector (c√≥ th·ªÉ t√πy ch·ªçn)
db_name = "vector_db"

# Ki·ªÉm tra n·∫øu database Chroma ƒë√£ t·ªìn t·∫°i, th√¨ x√≥a collection ƒë·ªÉ kh·ªüi ƒë·ªông l·∫°i t·ª´ ƒë·∫ßu or remove
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()



In [44]:
# T·∫°o vector store b·∫±ng Chroma
vectorstore = Chroma.from_documents(
    documents=chunks,              # Danh s√°ch c√°c ƒëo·∫°n vƒÉn b·∫£n ƒë√£ chia nh·ªè
    embedding=embeddings,          # H√†m embedding (HuggingFace)
    persist_directory=db_name      # Th∆∞ m·ª•c l∆∞u tr·ªØ database
)
# Ki·ªÉm tra s·ªë l∆∞·ª£ng document ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o vector store
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 128 documents


In [45]:
# #L·∫•y ra b·ªô s∆∞u t·∫≠p vector t·ª´ vectorstore
collection = vectorstore._collection

In [46]:
# ------investiage our vectors-----------------
# #L·∫•y 1 embedding t·ª´ database
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]

# #Ki·ªÉm tra s·ªë chi·ªÅu (s·ªë ph·∫ßn t·ª≠ trong vector)
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")
# -------------------------------------------------

The vectors have 384 dimensions


In [None]:
# sample_embedding # look like this

## 3. Visualization

In [47]:
# Prepare
result = collection.get(include=['embeddings', 'documents', 'metadatas']) # L·∫•y to√†n b·ªô vector, t√†i li·ªáu v√† metadata t·ª´ collection
vectors = np.array(result['embeddings']) # ƒê∆∞a embedding v√†o m·∫£ng numpy
documents = result['documents'] # L∆∞u l·∫°i vƒÉn b·∫£n
doc_types = [metadata['doc_type'] for metadata in result['metadatas']] # Tr√≠ch lo·∫°i t√†i li·ªáu t·ª´ metadata (gi·∫£ s·ª≠ c√≥ 'doc_type')
colors = [['blue', 'green', 'red', 'orange'][['company', 'employees', 'visas', 'schools'].index(t)] for t in doc_types] # G√°n m√†u s·∫Øc t√πy theo lo·∫°i t√†i li·ªáu

In [48]:
# Gi·∫£m s·ªë chi·ªÅu c·ªßa vector xu·ªëng 3D b·∫±ng t-SNE (T-distributed Stochastic Neighbor Embedding)

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# T·∫°o bi·ªÉu ƒë·ªì scatter 2D
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Lo·∫°i: {t}<br>VƒÉn b·∫£n: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='Bi·ªÉu ƒë·ªì 3D Chroma Vector Store',
    scene=dict(xaxis_title='x', yaxis_title='y'),
    width=900,
    height=700,
    margin=dict(r=10, b=10, l=10, t=40)
)

fig.show()

# Chatbot n√¥·ªã b·ªô - C√¥ng ty KorStudy!

In [49]:
retriever = vectorstore.as_retriever()

MODEL = "llama3.2"
llm = ChatOllama(temperature=0.7, model=MODEL)

In [50]:
retriever.invoke("Who is Lan?")

[Document(id='49154c24-d794-4241-8328-ebee1673eef1', metadata={'doc_type': 'employees', 'source': 'knowledge-base/employees/Alex Kim.md'}, page_content='## L·ªãch L√†m Vi·ªác\n\n### Th·ª© 2-6\n- **8:00-12:00**: T∆∞ v·∫•n tr·ª±c ti·∫øp t·∫°i vƒÉn ph√≤ng\n- **13:00-17:00**: X·ª≠ l√Ω h·ªì s∆° v√† li√™n h·ªá tr∆∞·ªùng ƒë·ªëi t√°c\n- **17:00-18:00**: T∆∞ v·∫•n online cho h·ªçc sinh t·ªânh xa\n\n### Th·ª© 7-CN\n- **Th·ª© 7**: T∆∞ v·∫•n v√† workshop cho ph·ª• huynh\n- **Ch·ªß nh·∫≠t**: T∆∞ v·∫•n online v√† emergency support\n\n## Feedback T·ª´ H·ªçc Sinh\n\n> *"Anh Alex r·∫•t t·∫≠n t√¢m v√† hi·ªÉu r√µ t√¢m l√Ω h·ªçc sinh. Nh·ªù anh m√† em ƒë√£ v√†o ƒë∆∞·ª£c tr∆∞·ªùng m∆° ∆∞·ªõc l√† Korea University. Anh lu√¥n ƒë·ªông vi√™n em trong nh·ªØng l√∫c kh√≥ khƒÉn nh·∫•t."*  \n> **- Nguy·ªÖn Minh H∆∞∆°ng, Korea University Business School 2023**\n\n> *"Alex ƒë√£ gi√∫p em t·ª´ vi·ªác ch·ªçn tr∆∞·ªùng ph√π h·ª£p ƒë·∫øn chu·∫©n b·ªã h·ªì s∆° m·ªôt c√°ch chi ti·∫øt. T·ª∑ l·ªá th√†nh c√¥ng c·ªßa anh ·∫•y l√

In [51]:
llm.invoke("Who is Lan?")

AIMessage(content='There are several individuals and characters named "Lan" across different fields, so it\'s possible that you\'re referring to one of the following:\n\n1. Lan Lampe: A Danish professional footballer who plays as a midfielder for the Denmark women\'s national team.\n2. Lan Laryon: A South Korean rapper and singer under the stage name "Lan".\n3. Lan Lan: A Chinese actress, singer, and television personality.\n\nWithout more context or information, it\'s difficult to determine which "Lan" you\'re referring to. If you could provide more details or clarify who Lan is in your context, I\'d be happy to try and provide a more specific answer.', additional_kwargs={}, response_metadata={'model': 'llama3.2', 'created_at': '2025-12-26T13:55:01.308485Z', 'done': True, 'done_reason': 'stop', 'total_duration': 27061007712, 'load_duration': 6883404432, 'prompt_eval_count': 29, 'prompt_eval_duration': 3184219505, 'eval_count': 139, 'eval_duration': 16976131599, 'logprobs': None, 'mode

In [52]:
SYSTEM_PROMPT_TEMPLATE = """
B·∫°n l√† chuy√™n gia t∆∞ v·∫•n du h·ªçc H√†n Qu·ªëc t·∫°i trung t√¢m Korea Study. 
Nhi·ªám v·ª• c·ªßa b·∫°n l√† tr·∫£ l·ªùi c√°c c√¢u h·ªèi li√™n quan ƒë·∫øn trung t√¢m, nh√¢n vi√™n, tr∆∞·ªùng h·ªçc v√† th√¥ng tin visa m·ªôt c√°ch ng·∫Øn g·ªçn v√† ch√≠nh x√°c. 
N·∫øu c√≥ th√¥ng tin li√™n quan trong ng·ªØ c·∫£nh ƒë∆∞·ª£c cung c·∫•p, h√£y s·ª≠ d·ª•ng ƒë·ªÉ tr·∫£ l·ªùi c√¢u h·ªèi.
N·∫øu b·∫°n kh√¥ng bi·∫øt c√¢u tr·∫£ l·ªùi, h√£y n√≥i r√µ r·∫±ng b·∫°n kh√¥ng bi·∫øt. Tuy·ªát ƒë·ªëi kh√¥ng b·ªãa th√¥ng tin n·∫øu kh√¥ng c√≥ ng·ªØ c·∫£nh ph√π h·ª£p.
Ng·ªØ c·∫£nh:
{context}
"""

In [53]:
def answer_question(question: str, history):
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    response = llm.invoke([SystemMessage(content=system_prompt), HumanMessage(content=question)])
    return response.content

In [54]:
answer_question("Who is Lan", [])

'Lan l√† b√† Nguy·ªÖn Th·ªã Lan, ng∆∞·ªùi ƒëang gi·ªØ ch·ª©c v·ª• Gi√°m ƒê·ªëc ƒêi·ªÅu h√†nh (CEO) c·ªßa trung t√¢m Korea Study t·∫°i Vi·ªát Nam. C√¥ l√† m·ªôt chuy√™n gia t∆∞ v·∫•n du h·ªçc H√†n Qu·ªëc c√≥ kinh nghi·ªám v√† tr√¨nh ƒë·ªô cao, v·ªõi b·∫±ng c·∫•p t·ª´ ƒê·∫°i h·ªçc Yonsei ·ªü Seoul, H√†n Qu·ªëc, v√† ch·ª©ng ch·ªâ t·ª´ c√°c t·ªï ch·ª©c qu·ªëc t·∫ø nh∆∞ ICEF v√† PMP.\n\nB√† Lan ƒë√£ c√≥ h∆°n 15 nƒÉm kinh nghi·ªám trong lƒ©nh v·ª±c t∆∞ v·∫•n du h·ªçc v√† gi√°o d·ª•c qu·ªëc t·∫ø, v√† ƒë√£ gi√∫p h√†ng ng√†n h·ªçc sinh Vi·ªát Nam ƒë·∫°t ƒë∆∞·ª£c m·ª•c ti√™u du h·ªçc m∆° ∆∞·ªõc c·ªßa m√¨nh. C√¥ c≈©ng l√† m·ªôt ng∆∞·ªùi c√≥ ki·∫øn th·ª©c s√¢u v·ªÅ h·ªá th·ªëng gi√°o d·ª•c H√†n Qu·ªëc, ƒë·∫∑c bi·ªát l√† c√°c tr∆∞·ªùng ƒë·∫°i h·ªçc danh gi√° nh∆∞ Korea University, Seoul National University, v√† nh·ªØng th√¥ng tin c·∫≠p nh·∫≠t t·ª´ c√°c t·ªï ch·ª©c n√†y.\n\nB√† Lan c≈©ng c√≥ kh·∫£ nƒÉng giao ti·∫øp t·ªët v·ªõi nhi·ªÅu ng√¥n ng·ªØ, bao g·ªìm Ti·∫øng Vi·ªát (b·∫£n ng·ªØ), Ti·∫øng H√†n

# Yayy!!!!! But ... wait! ---> What should we do next ? ü§î

In [55]:
# Test questions file 
from test_questions import save_tests_to_jsonl, print_test_summary

save_tests_to_jsonl()
print_test_summary()

Saved 40 test questions to tests.jsonl

=== Test Question Summary ===
Total: 40 questions

company_info: 8 questions
employee_info: 7 questions
school_info: 10 questions
visa_info: 8 questions
complex: 7 questions


In [None]:
# Test Single Question (Quick Check)
from test_questions import load_tests_from_jsonl
from evaluate_rag import evaluate_retrieval, evaluate_answer_with_llm

# Load tests
tests = load_tests_from_jsonl()

# Pick one test
test = tests[0]
print(f"Question: {test.question}")
print(f"Expected keywords: {test.keywords}")

# Evaluate retrieval
retrieval_metrics, chunks = evaluate_retrieval(
    question=test.question,
    expected_keywords=test.keywords,
    retriever=retriever,  # Your retriever from setup
    k=10
)

print(f"\nRetrieval Results:")
print(f"  MRR: {retrieval_metrics.mrr:.3f}")
print(f"  nDCG: {retrieval_metrics.ndcg:.3f}")
print(f"  Keywords found: {retrieval_metrics.keywords_found}/{retrieval_metrics.total_keywords}")
print(f"  Coverage: {retrieval_metrics.keyword_coverage:.1f}%")

# Generate answer
generated_answer = answer_question(test.question, [])
print(f"\nGenerated Answer:\n{generated_answer}")

# Evaluate answer (using LLM judge)
answer_metrics = evaluate_answer_with_llm(
    question=test.question,
    generated_answer=generated_answer,
    reference_answer=test.reference_answer,
    llm=llm  
)

print(f"\nAnswer Evaluation:")
print(f"  Accuracy: {answer_metrics.accuracy:.2f}/5")
print(f"  Completeness: {answer_metrics.completeness:.2f}/5")
print(f"  Relevance: {answer_metrics.relevance:.2f}/5")
print(f"  Feedback: {answer_metrics.feedback}")

Question: Korea Study ƒë∆∞·ª£c th√†nh l·∫≠p nƒÉm n√†o?
Expected keywords: ['2018', 'th√†nh l·∫≠p', 'nƒÉm']

Retrieval Results:
  MRR: 1.000
  nDCG: 0.973
  Keywords found: 3/3
  Coverage: 100.0%

Generated Answer:
Korea Study ƒë∆∞·ª£c th√†nh l·∫≠p v√†o nƒÉm 2018.

Answer Evaluation:
  Accuracy: 5.00/5
  Completeness: 4.00/5
  Relevance: 5.00/5
  Feedback: The generated answer is factually correct, but it lacks some details from the reference answer. It provides a brief overview of Korea Study's establishment year and its mission, which is relevant to answering the question.


In [57]:
from evaluate_rag import run_full_evaluation, analyze_by_category

# Run full evaluation (automated with LLM judge)
results = run_full_evaluation(
    retriever=retriever,           # Your retriever
    answer_function=answer_question,  # Your answer function
    use_manual_eval=False,         # Use LLM for evaluation
    llm=llm,                       # Your LLM
    test_file="tests.jsonl"
)

# Analyze by category
analyze_by_category(results)


Running evaluation on 40 test questions


Test 1/40: company_info
Question: Korea Study ƒë∆∞·ª£c th√†nh l·∫≠p nƒÉm n√†o?
  MRR: 1.000
  nDCG: 0.973
  Coverage: 100.0%
  Accuracy: 5.00/5
  Completeness: 4.00/5
  Relevance: 5.00/5

Test 2/40: company_info
Question: VƒÉn ph√≤ng Korea Study ·ªü ƒë√¢u?
  MRR: 0.100
  nDCG: 0.130
  Coverage: 20.0%


KeyboardInterrupt: 