In [2]:
from langchain.memory import ConversationBufferMemory
from langchain_chroma import Chroma
import pandas as pd
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import os, getpass

from agents import RAGAgentLangChainOpenAI

In [3]:
# Ask for the OpenAI key interactively only when the environment does not
# already hold a non-empty value; getpass reads it without echoing the input.
if os.environ.get("OPENAI_API_KEY", "") == "":
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [4]:
def load_excel_and_convert_to_text(path: str) -> list[str]:
    """Read an Excel sheet and render every data row as one line of text.

    Each row becomes a single string of the form
    ``"<column>: <value>, <column>: <value>, ..."`` with the columns in
    sheet order.

    Args:
        path: Location of the workbook; passed straight to ``pd.read_excel``.

    Returns:
        One string per data row, in sheet order. Empty when the sheet has
        no data rows.
    """
    df = pd.read_excel(path)
    columns = list(df.columns)

    # itertuples hands back each cell with its own column's dtype. iterrows
    # would first pack the whole row into a single Series, which upcasts ints
    # to floats in all-numeric sheets (2007 -> "2007.0") and returns a Series
    # instead of a scalar for duplicated column names.
    return [
        ", ".join(f"{col}: {value}" for col, value in zip(columns, values))
        for values in df.itertuples(index=False, name=None)
    ]

In [5]:
def create_vectorstore_from_text_chunks(chunks: list[str], persist_directory: str = 'excel_db') -> "Chroma":
    """Embed the given text chunks and store them in a persisted Chroma index.

    Any collection already persisted at ``persist_directory`` is deleted
    first, so calling this again rebuilds the index instead of adding to it.

    Args:
        chunks: Plain-text snippets; each one is wrapped in a ``Document``.
        persist_directory: Directory Chroma persists to. Defaults to
            ``'excel_db'`` (relative to the working directory), the location
            this function has always written to.

    Returns:
        The Chroma vector store built from the (split) documents.
    """
    docs = [Document(page_content=chunk) for chunk in chunks]

    # NOTE(review): CharacterTextSplitter is used with its default separator,
    # so single-line row texts are presumably passed through unsplit --
    # confirm this is intended (RecursiveCharacterTextSplitter is imported
    # in this notebook but never used).
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    split_docs = splitter.split_documents(docs)

    embeddings = OpenAIEmbeddings()

    # The existence check must look at the same directory the store lives in.
    # It previously tested '../../db/excel_db' while reading and writing
    # 'excel_db', so the old collection was not cleared unless that unrelated
    # directory happened to exist.
    if os.path.exists(persist_directory):
        Chroma(persist_directory=persist_directory, embedding_function=embeddings).delete_collection()

    vectorstore = Chroma.from_documents(
        documents=split_docs,
        embedding=embeddings,
        persist_directory=persist_directory,
    )
    return vectorstore

In [7]:
# Source workbook, relative to this notebook's working directory.
EXCEL_PATH = "../../resources/Out_9.xlsx"

# Turn every spreadsheet row into a text chunk, then embed and index the chunks.
chunks = load_excel_and_convert_to_text(EXCEL_PATH)
vector_store = create_vectorstore_from_text_chunks(chunks)

In [25]:
# Chat model that writes the final answers.
llm_openai = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.7,
)

In [29]:
# Conversation buffer exposed to the chain under the "chat_history" key,
# returned as message objects rather than one flattened string.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

In [30]:
# Answering prompt: {context} receives the retrieved rows, {question} the user query.
QA_TEMPLATE = """Use the information below to answer the user's question.
If you can't find a suitable answer, just say you don't know.
Don't give wrong or incorrect answers.

{context}

Câu hỏi: {question}
"""
custom_prompt = ChatPromptTemplate.from_template(QA_TEMPLATE)

# Wire the LLM, the vector store, the prompt and the memory into the RAG agent.
ragLangChainOpenAI = RAGAgentLangChainOpenAI(
    llm=llm_openai,
    vector_store=vector_store,
    prompt_template=custom_prompt,
    memory=memory,
)

In [31]:
# Example query; the call is left as the last expression so the cell renders its result.
canada_question = "Tell me something about the data of Canada"
ragLangChainOpenAI.invoke(canada_question)



[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mHuman: Use the information below to answer the user's question.
If you can't find a suitable answer, just say you don't know.
Don't give wrong or incorrect answers.

Unnamed: 0: 251, country: Canada, continent: Americas, year: 2007, lifeExp: 80.653, pop: 33390141, gdpPercap: 36319.23501, iso_alpha: CAN, iso_num: 124

Unnamed: 0: 242, country: Canada, continent: Americas, year: 1962, lifeExp: 71.3, pop: 18985849, gdpPercap: 13462.48555, iso_alpha: CAN, iso_num: 124

Unnamed: 0: 250, country: Canada, continent: Americas, year: 2002, lifeExp: 79.77, pop: 31902268, gdpPercap: 33328.96507, iso_alpha: CAN, iso_num: 124

Unnamed: 0: 244, country: Canada, continent: Americas, year: 1972, lifeExp: 72.88, pop: 22284500, gdpPercap: 18970.57086, iso_alpha: CAN, iso_num: 124

Câu hỏi: Tell me something a

{'question': 'Tell me something about the data of Canada',
 'chat_history': [HumanMessage(content='Tell me something about the data of Canada', additional_kwargs={}, response_metadata={}),
  AIMessage(content='The data for Canada includes information from different years, showcasing various indicators such as life expectancy, population, and GDP per capita. For instance, in 2007, Canada had a life expectancy of 80.653 years, a population of 33,390,141, and a GDP per capita of approximately $36,319.24. In contrast, in 1962, the life expectancy was lower at 71.3 years, with a population of 18,985,849 and a GDP per capita of about $13,462.49. This data indicates an increase in life expectancy and GDP per capita over the years, reflecting improvements in health and economic conditions in Canada.', additional_kwargs={}, response_metadata={})],
 'answer': 'The data for Canada includes information from different years, showcasing various indicators such as life expectancy, population, and GDP