## RAG Chatbot

In [2]:
%pwd


'/home/ubunchuu/Documents/WORKSPACE/data_engineer/FASTAPI_DEPLOY/notebook'

In [3]:
import os

os.chdir("../")

In [4]:
%pwd

'/home/ubunchuu/Documents/WORKSPACE/data_engineer/FASTAPI_DEPLOY'

In [7]:
# Di chuyển vào thư mục backend
os.chdir("backend")

In [8]:
%pwd

'/home/ubunchuu/Documents/WORKSPACE/data_engineer/FASTAPI_DEPLOY/backend'

### Langchain Library

In [31]:

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


### Extract data

In [12]:
# Extract text from PDF files
def load_pdf_files(data):

    loader = DirectoryLoader(
        data
        , glob="*.pdf"
        , loader_cls=PyPDFLoader
    )

    documents = loader.load()

    return documents

In [13]:

extracted_data = load_pdf_files("data")


In [14]:

len(extracted_data)


187

In [15]:


extracted_data[0:3]



[Document(metadata={'producer': 'Skia/PDF m80', 'creator': 'Acrobat Pro DC 20.9.20067', 'creationdate': '2020-06-11T10:54:37+07:00', 'author': 'Holistics.io', 'moddate': '2020-07-02T14:28:01+07:00', 'subject': 'Restructure your knowledge of the complex data analytics landscape, and learn how to build scalable analytics & BI stacks in the modern cloud era.', 'title': 'The Analytics Setup Guidebook', 'source': 'data/the-analytics-stack-guidebook.pdf', 'total_pages': 187, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'Skia/PDF m80', 'creator': 'Acrobat Pro DC 20.9.20067', 'creationdate': '2020-06-11T10:54:37+07:00', 'author': 'Holistics.io', 'moddate': '2020-07-02T14:28:01+07:00', 'subject': 'Restructure your knowledge of the complex data analytics landscape, and learn how to build scalable analytics & BI stacks in the modern cloud era.', 'title': 'The Analytics Setup Guidebook', 'source': 'data/the-analytics-stack-guidebook.pdf', 'total_pages': 187, 'pa

In [16]:
from typing import List
from langchain.schema import Document

def filter_to_minial_docs(docs: list[Document]) -> List[Document]:
    minimal_docs : List[Document] = []

    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content
                , metadata = {"source": src}
            )
        )

    return minimal_docs


In [17]:
minimal_docs = filter_to_minial_docs(extracted_data)

In [18]:


minimal_docs[0:5]



[Document(metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content=''),
 Document(metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content="The Analytics Setup Guidebook\nWe are Holistics. We've been making data analytics tools for over four \nyears, and helped more than a hundred companies build their business \nintelligence capabilities, sometimes from scratch.\nA huge chunk of our time is spent educating and assisting companies \nas they migrate to this new world of cloud-based business intelligence \ntools. For the first time ever, we're putting that experience up for the \nworld to read.\nwww.holistics.io\nBesides the guidebook, we also regularly share our thoughts on data \nanalytics, business intelligence and how we build Holistics.\nwww.holistics.io/blog\nWritten by: Huy Nguyen, Ha Pham, Cedric Chin\nLast edited: July 2nd, 2020\nDesigned and Published by: My Nguyen, Dung Pham, Son Hoang, Tuan Nguyen, Khai To, \nAnthony T. Do\nCopyright © 20

### Chunking

In [25]:


def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400
        , chunk_overlap = 50
        , separators = ["\n\n", "\n", " ", ""]
    )
    texts = text_splitter.split_documents(minimal_docs)
    return texts



In [26]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

texts_chunk[0:3]

Number of chunks: 633


[Document(metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content="The Analytics Setup Guidebook\nWe are Holistics. We've been making data analytics tools for over four \nyears, and helped more than a hundred companies build their business \nintelligence capabilities, sometimes from scratch.\nA huge chunk of our time is spent educating and assisting companies \nas they migrate to this new world of cloud-based business intelligence"),
 Document(metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content="tools. For the first time ever, we're putting that experience up for the \nworld to read.\nwww.holistics.io\nBesides the guidebook, we also regularly share our thoughts on data \nanalytics, business intelligence and how we build Holistics.\nwww.holistics.io/blog\nWritten by: Huy Nguyen, Ha Pham, Cedric Chin\nLast edited: July 2nd, 2020"),
 Document(metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content="Last edited: July 2nd, 2020

### Embedding model and vector embedding

In [55]:

from dotenv import load_dotenv
import os

load_dotenv()


PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY


In [57]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embedding = GoogleGenerativeAIEmbeddings(model="text-embedding-004")

In [58]:

vector = embedding.embed_query("My name is Nghia")
vector


[0.005305747967213392,
 -0.013471610844135284,
 -0.029214469715952873,
 -0.009849978610873222,
 0.011281467042863369,
 0.0015289145521819592,
 -0.0009948324877768755,
 0.009864372201263905,
 -0.0392080694437027,
 0.020263168960809708,
 -0.027062397450208664,
 0.03467141464352608,
 0.07491064071655273,
 -0.0022499903570860624,
 -0.055616673082113266,
 -0.038980141282081604,
 0.00039098644629120827,
 0.025406500324606895,
 -0.10897557437419891,
 -0.0021657897159457207,
 0.021640561521053314,
 -0.016585247591137886,
 0.046728480607271194,
 -0.010267126373946667,
 -0.03791150450706482,
 -0.0427946001291275,
 -0.0026853724848479033,
 -0.0293100755661726,
 -0.03925672546029091,
 -0.05560600385069847,
 0.08324326574802399,
 0.0668097585439682,
 -0.029392259195446968,
 -0.007091793231666088,
 0.019204506650567055,
 0.06693758070468903,
 0.009926259517669678,
 -0.007509930990636349,
 0.041988614946603775,
 -0.08321689814329147,
 -0.06103595718741417,
 0.038014061748981476,
 -0.00489478698000311

### Vector Database

#### Indexing

In [59]:

from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)



In [60]:


pc



<pinecone.pinecone.Pinecone at 0x7745b3a0b280>

In [61]:


from pinecone import ServerlessSpec

index_name = "analytic-stack"

if not pc.has_index(index_name):

    pc.create_index(
        name = index_name,
        dimension = 768, # Dimension of embedding
        metric = "cosine" ,# Cosine similarity
        spec = ServerlessSpec(
            cloud = "aws",
            region = "us-east-1"
        )
    )


index = pc.Index(index_name)
   



  from .autonotebook import tqdm as notebook_tqdm


In [62]:

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = texts_chunk,
    embedding = embedding,
    index_name = index_name
)



In [63]:
# Load Existing index
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into your Pinecone index

docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embedding
)



### Retriever

In [64]:

retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [66]:
retrieved_docs = retriever.invoke("What is Postgres, ETL?")
retrieved_docs

[Document(id='0142ef8a-d48a-4a7d-90e3-c03e58b340a4', metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content='with different subsets of data, but we do not wish this pain on even our \nworst enemies, so we do not wish it on you. \nYour central analytics database is usually powered by a data \nwarehouse, which is a type of database that is optimized for analytical \nworkloads. The process by which such consolidation happens is \ncommonly called ETL (Extract Transform Load).'),
 Document(id='76c83fca-ff2a-46e3-8dfc-62e72ca975cb', metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content='commonly called ETL (Extract Transform Load).\nChapter 2 of the book will go into more detail about this step.\n21\nThe Analytics Setup Guidebook – High-level Overview of an Analytics Setup'),
 Document(id='7f9688da-fb4e-49f1-a3dc-055d43ff0548', metadata={'source': 'data/the-analytics-stack-guidebook.pdf'}, page_content="Second, it allows analysts to write transforma

### Generate

In [67]:


from langchain_google_genai import ChatGoogleGenerativeAI

chatModel = ChatGoogleGenerativeAI(model="gemini-2.5-flash", api_key=GOOGLE_API_KEY)



In [68]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [69]:
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [70]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)



In [71]:
response = rag_chain.invoke({"input": "what is Holistic?"})
print(response["answer"])



Based on the provided text, Holistics is a tool or company that helps set up, run, and maintain an analytics stack. It is designed to operate without the need for data engineering assistance.
