In [46]:
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.vectorstores.utils import filter_documents
from dotenv import load_dotenv
import shutil

import json
import os

In [47]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail early with a readable message when the key is missing. The previous
# `os.environ[...] = os.getenv(...)` was a no-op when the key was set and raised
# an opaque TypeError (environ values must be str) when it was not.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it before running this notebook."
    )

In [48]:
# Read the source text file with LangChain's TextLoader; `docs` holds whatever
# `loader.load()` returns for that file.
loader = TextLoader("data.txt")  # <- replace with your file
docs = loader.load()

In [49]:
# Single chat model instance shared by the tagging, query-tagging and answering
# chains defined later in this notebook.
llm = ChatOpenAI(temperature=0)

LLM for Tagging

In [50]:
from langchain.prompts import ChatPromptTemplate
# Instruction text sent to the model for each document chunk. `{input}` is the
# chunk text; the doubled braces render as literal braces in the final prompt.
TAGGING_TEMPLATE = """
You will be given a document. Extract the 'company' and 'year' mentioned.

Document:
{input}

Return in JSON: {{ "company": ..., "year": ... }}
"""

tagging_prompt = ChatPromptTemplate.from_template(TAGGING_TEMPLATE)

In [51]:
from langchain.chains import LLMChain
# Chain that asks the LLM for the 'company' and 'year' of one chunk of text.
tag_chain = LLMChain(llm=llm, prompt=tagging_prompt)

tagged_docs = []
skipped_chunks = 0

# NOTE(review): `chunks` is not defined in any cell above this one; it presumably
# comes from the document-splitting section of the notebook — confirm that cell
# runs before this one on a fresh kernel.
for chunk in chunks:
    tag_result = tag_chain.run(input=chunk.page_content)
    try:
        # The prompt requests JSON, so parse it as JSON. eval() would execute
        # whatever text the model returned and also chokes on JSON literals
        # such as null/true/false.
        tag_data = json.loads(tag_result)
    except (json.JSONDecodeError, TypeError):
        # Best-effort tagging: a chunk whose reply is not valid JSON is left out.
        skipped_chunks += 1
        continue

    if not isinstance(tag_data, dict):
        # Valid JSON but not an object (e.g. a list or bare string): nothing to read tags from.
        skipped_chunks += 1
        continue

    # Store only the tags the model actually produced. JSON null now parses to
    # None, and some vector stores reject None metadata values — TODO confirm
    # for the Chroma version in use.
    tags = {
        key: tag_data[key]
        for key in ("company", "year")
        if tag_data.get(key) is not None
    }
    chunk.metadata.update(tags)
    tagged_docs.append(chunk)

# Surface what the old bare `except: pass` used to hide.
if skipped_chunks:
    print(f"Skipped {skipped_chunks} chunk(s) whose tagging output was not a JSON object.")

Build vector store with metadata

In [53]:
PERSIST_DIR = "chroma_db"
embeddings = OpenAIEmbeddings()

# Start from an empty collection on every run. This is done through Chroma's own
# API instead of shutil.rmtree: removing the directory while a Chroma client
# created by an earlier run of this cell is still alive in the kernel is the
# likely cause of the "attempt to write a readonly database" error captured
# in this cell's output.
Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embeddings,
).delete_collection()

# Embed the tagged chunks and store them (with their metadata) on disk.
vectordb = Chroma.from_documents(
    documents=tagged_docs,
    embedding=embeddings,
    persist_directory=PERSIST_DIR,
)
vectordb.persist()

InternalError: Query error: Database error: error returned from database: (code: 1032) attempt to write a readonly database

Create chain to extract tags from user query

In [None]:
# Instruction text used to pull 'company' and 'year' out of a user question.
# `{query}` is the question; the doubled braces render as literal braces.
QUERY_TAG_TEMPLATE = """
Analyze the query and extract 'company' and 'year'. Return in JSON.

Query:
{query}

Return: {{ "company": ..., "year": ... }}
"""

query_tag_prompt = ChatPromptTemplate.from_template(QUERY_TAG_TEMPLATE)

# Chain that runs the prompt above against the shared chat model.
query_tag_chain = LLMChain(prompt=query_tag_prompt, llm=llm)

Filter query and answer

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain
def ask_question(query):
    """Answer `query` from the vector store, filtering by company/year when possible.

    The query is first sent through `query_tag_chain` to extract a company and
    a year. When both are present they become a metadata filter on the
    retriever; otherwise retrieval runs unfiltered. The retrieved documents and
    the query are then passed to a stuff-documents chain.

    Args:
        query: The user's question as a string.

    Returns:
        Whatever `qa_chain.invoke` returns (the captured cell output suggests
        this is the answer text — confirm for the installed LangChain version).
    """
    tag_result = query_tag_chain.run(query=query)

    # Parse the model's reply as JSON rather than eval()-ing it: the reply is
    # derived from user input and must never be executed as Python.
    try:
        query_info = json.loads(tag_result)
    except (json.JSONDecodeError, TypeError):
        query_info = None
    # Anything that is not a JSON object falls back to "no tags extracted".
    if not isinstance(query_info, dict):
        query_info = {"company": None, "year": None}

    # fallback to no-filter if missing
    filter_dict = {}
    if query_info.get("company") and query_info.get("year"):
        filter_dict = {
            "$and": [
                {"company": query_info["company"]},
                {"year": query_info["year"]}
            ]
        }

    retriever = vectordb.as_retriever(search_kwargs={"filter": filter_dict} if filter_dict else {})

    prompt = ChatPromptTemplate.from_template("""
Answer the user's question based on the provided documents.

Question: {input}
Documents:
{context}
""")

    qa_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

    relevant_docs = retriever.get_relevant_documents(query)
    response = qa_chain.invoke({"input": query, "context": relevant_docs})
    return response

# 6. Ask a question
# Example run: send one sample question through ask_question and print the reply.
query = "What did Tesla announced in 2024?"
answer = ask_question(query)
print("Answer:", answer)

Answer: In 2024, Tesla announced the release of the Model Y, which saw huge demand in Europe.


Split and Embed Documents