In [1]:
import time
import pandas as pd
from langchain_ollama import OllamaLLM
from langchain_ollama import OllamaEmbeddings
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
# 1. Data Preparation
df = pd.read_csv("product_data.csv")

In [3]:
# 2. Initialize Components
llm = OllamaLLM(model="llama3.1", temperature=0.75)
embeddings = OllamaEmbeddings(
    model="bge-m3",
    base_url="http://localhost:11434"
)

In [4]:
# 3. Create Detailed Text Description
def create_detailed_text(df):
    texts = []
    for index, row in df.iterrows():
        text = f"Product Details - "
        for col in df.columns:
            text += f"{col}: {row[col]}. "
        texts.append(text)
    return texts

In [5]:
# 4. Prepare Documents
texts = create_detailed_text(df)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
splits = text_splitter.create_documents(texts)

In [6]:
# 5. Create Vector Store
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

In [7]:
# 6. Create Prompt Template
template = """
Answer the question based on the following information:

Context Information:
{context}

Question: {question}

If there is no relevant information in the data, please clearly state "No information about this product in the database".
Please answer based on actual data, do not guess or make up information.

Answer:
"""

PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"]  
)

In [8]:
# 7. Setup QA Chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={
        "prompt": PROMPT
    }
)

In [15]:
# 8. Create Agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=False,
    agent_type="zero-shot-react-description",  
    max_iterations=20,
    allow_dangerous_code=True
)

In [10]:

# 9. Create Universal Query Function
def query_data(query_text):
    """
    Universal data query function
    Args:
        query_text: User's query question
    Returns:
        Query results and related context
    """
    try:
        # Use RAG system to get context
        context = qa_chain.invoke({
            "query": query_text
        })
            
        # Build enhanced prompt
        enhanced_prompt = f"""
        Answer the question based on the retrieved relevant information: {context}
        Please provide a concise answer, if calculations or analysis are needed, please base them on the data using pandas methods.
        """
        
        # Use agent to process query
        response = agent.invoke(enhanced_prompt)
        
        return {
            "answer": response["output"] if isinstance(response, dict) else response,
            "context": context['source_documents']
        }
    except Exception as e:
        return {
            "error": str(e),
            "suggestion": "Please try a more specific question or check if the query is related to the dataset"
        }
        
        

In [11]:
# 10. Query Result Display Function
def display_query_result(query_text):
    """
    Function to display query results
    Args:
        query_text: User's query question
    """
    print("\n" + "="*50)
    print(f"📝 Query: {query_text}")
    print("="*50)
    
    result = query_data(query_text)
    
    if "error" in result:
        print(f"❌ Error: {result['error']}")
        print(f"💡 Suggestion: {result['suggestion']}")
    else:
        print("\n📊 Answer:")
        print("-"*30)
        print(f"{result['answer']}")
        print("\n📚 Related Context:")
        print("-"*30)
        for i, doc in enumerate(result['context'], 1):
            print(f"{i}. {doc.page_content}\n")
    print("="*50 + "\n")





In [12]:
display_query_result("What is the average price?")


📝 Query: What is the average price?

📊 Answer:
------------------------------
$194.99

📚 Related Context:
------------------------------
1. Product Details - product_id: 7. product_name: Sneakers. category: Clothing. price: 59.99. stock_quantity: 60. rating: 4.2.

2. Product Details - product_id: 7. product_name: Sneakers. category: Clothing. price: 59.99. stock_quantity: 60. rating: 4.2.

3. Product Details - product_id: 7. product_name: Sneakers. category: Clothing. price: 59.99. stock_quantity: 60. rating: 4.2.




In [16]:
display_query_result("What is the most expensive product in the dataset?")


📝 Query: What is the most expensive product in the dataset?

📊 Answer:
------------------------------
The most expensive product in the dataset is $899.99, which is a Laptop.

📚 Related Context:
------------------------------
1. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.

2. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.

3. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.




In [17]:
# Usage Examples
# You can perform various queries
queries = [
    "How many different product categories are there?",
    "Which product has the highest inventory?"
]

for query in queries:
    display_query_result(query)
    time.sleep(0.5)


📝 Query: How many different product categories are there?

📊 Answer:
------------------------------
The final answer is 3.

📚 Related Context:
------------------------------
1. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.

2. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.

3. Product Details - product_id: 6. product_name: Headphones. category: Electronics. price: 19.99. stock_quantity: 150. rating: 4.4.



📝 Query: Which product has the highest inventory?
❌ Error: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: `It looks like we've hit a few syntax errors along the way. Let's simplify the code and make it more readable.

Here's what you can do to find which product

In [18]:
df

Unnamed: 0,product_id,product_name,category,price,stock_quantity,rating
0,1,Smartphone,Electronics,299.99,50,4.5
1,2,Laptop,Electronics,899.99,30,4.7
2,3,Coffee Maker,Home Appliances,49.99,100,4.3
3,4,Blender,Home Appliances,24.99,75,4.1
4,5,T-Shirt,Clothing,9.99,200,4.0
5,6,Headphones,Electronics,19.99,150,4.4
6,7,Sneakers,Clothing,59.99,60,4.2


In [19]:
df['price'].mean()

194.99

In [20]:
df['price'].max()

899.99

In [23]:

len(df['category'].unique())

3