In [None]:
from core.retriever import DocumentRetriever
from core.models import ResponseGenerator
from core.database import ChatDatabase
import uuid

class Chatbot:
    """One chat session: retrieve supporting documents, answer, and log the turn.

    Each instance owns a random session id plus three collaborators (a chat
    database, a document retriever and a response generator) that are created
    in the constructor.
    """

    def __init__(self):
        """Create a fresh session id and build the database, retriever and generator."""
        self.session_id = str(uuid.uuid4())
        self.db = ChatDatabase()
        # Constructing the retriever also loads the processed documents.
        self.retriever = DocumentRetriever()
        self.generator = ResponseGenerator()

    def chat(self, user_input):
        """Answer ``user_input`` and record the exchange under this session.

        Args:
            user_input: The user's message / question.

        Returns:
            A ``(response, docs)`` tuple: the generator's response object
            (its ``.content`` attribute is the text that gets stored) and the
            documents the retriever returned for the question.
        """
        matched_docs = self.retriever.retrieve_documents(user_input)
        reply = self.generator.generate_response(user_input, matched_docs)

        # Only the reply text is persisted, not the whole response object.
        self.db.save_chat(self.session_id, user_input, reply.content)

        return reply, matched_docs

    def get_history(self):
        """Return whatever the database has stored for this session id."""
        return self.db.get_chat_history(self.session_id)


In [None]:
import sqlite3
import os
from dotenv import load_dotenv

# Read settings from a local .env file (if one exists) into the environment.
load_dotenv()
# Location of the SQLite chat-history file; uses "data/chat_history.db" when
# the DB_PATH environment variable is not set.
DB_PATH = os.getenv("DB_PATH", "data/chat_history.db")

class ChatDatabase:
    """SQLite-backed log of chat exchanges, grouped by session id.

    Rows live in a single ``chat_history`` table with an autoincrement ``id``,
    the ``session_id``, the ``user_input``, the ``bot_response`` and a
    ``timestamp`` that defaults to the insertion time.
    """

    def __init__(self, db_path=None):
        """Open the database (creating file, folder and table as needed).

        Args:
            db_path: Path of the SQLite file. When omitted, the module-level
                ``DB_PATH`` is used. ``":memory:"`` gives a throwaway
                in-memory database.
        """
        if db_path is None:
            db_path = DB_PATH

        # sqlite3.connect() creates a missing file but not a missing parent
        # directory, so make sure the folder exists first. ``":memory:"`` and
        # bare file names have no directory part and are skipped.
        parent_dir = os.path.dirname(db_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # check_same_thread=False lets threads other than the creating one
        # use the connection; this class performs no locking of its own.
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.create_table()

    def create_table(self):
        """Create the ``chat_history`` table if it does not exist yet."""
        # Using the connection as a context manager commits on success and
        # rolls back if the statement raises.
        with self.conn:
            self.conn.execute("""
                CREATE TABLE IF NOT EXISTS chat_history (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    session_id TEXT,
                    user_input TEXT,
                    bot_response TEXT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            """)

    def save_chat(self, session_id, user_input, bot_response):
        """Append one user/bot exchange to the given session.

        Args:
            session_id: Identifier of the conversation the row belongs to.
            user_input: Text the user sent.
            bot_response: Text the bot answered with.
        """
        with self.conn:
            self.conn.execute(
                "INSERT INTO chat_history (session_id, user_input, bot_response) VALUES (?, ?, ?)",
                (session_id, user_input, bot_response),
            )

    def get_chat_history(self, session_id):
        """Return the session's exchanges, oldest first.

        Args:
            session_id: Identifier of the conversation to read.

        Returns:
            A list of ``(user_input, bot_response)`` tuples; empty when the
            session has no rows.
        """
        # Without ORDER BY, SQL gives no ordering guarantee; the autoincrement
        # id reflects insertion order.
        cursor = self.conn.execute(
            "SELECT user_input, bot_response FROM chat_history WHERE session_id = ? ORDER BY id",
            (session_id,),
        )
        try:
            return cursor.fetchall()
        finally:
            cursor.close()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()


In [21]:
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain.schema import HumanMessage, AIMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
import os

# Read settings from a local .env file (if one exists) into the environment.
load_dotenv()
# API key for the LLM provider; None when LLM_API_KEY is not set.
LLM_API_KEY = os.getenv("LLM_API_KEY")
# Model name to request; falls back to the string "default-llm" when LLM_MODEL is unset.
# NOTE(review): "default-llm" looks like a placeholder rather than a real model id — confirm.
LLM_MODEL = os.getenv("LLM_MODEL", "default-llm")

class ResponseGenerator:
    """Produces answers from a Groq chat model using retrieved text as context."""

    def __init__(self):
        """Create the chat-model client from the module-level LLM_API_KEY and LLM_MODEL."""
        self.llm = ChatGroq(api_key=LLM_API_KEY, model=LLM_MODEL)

    def generate_response(self, query, retrieved_docs):
        """Ask the model to answer ``query`` given the retrieved documents.

        Args:
            query: The user's question.
            retrieved_docs: Iterable of objects exposing ``page_content``;
                ``None`` or an empty iterable yields an empty context.

        Returns:
            The message object returned by the chat model (callers read its
            ``.content`` attribute for the answer text).
        """
        # Chunks are separated by a blank line so they stay distinguishable
        # inside the prompt.
        context = "\n\n".join(doc.page_content for doc in (retrieved_docs or []))
        messages = [
            SystemMessage(content="You are an AI assistant trained to provide accurate answers based on context."),
            HumanMessage(content=f"Context:\n{context}\n\nQuestion: {query}\nAnswer:")
        ]

        # Calling the model object directly (``self.llm(messages)``) is a
        # deprecated LangChain entry point; ``invoke`` is the supported
        # equivalent and returns the same message object.
        return self.llm.invoke(messages)


In [22]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

# Read settings from a local .env file (if one exists) into the environment.
load_dotenv()
# Directory where the Chroma vector store is persisted; uses "data/chroma_db"
# when the VECTOR_STORE_PATH environment variable is not set.
VECTOR_STORE_PATH = os.getenv("VECTOR_STORE_PATH", "data/chroma_db")
# Folder scanned for PDF files to index (not configurable via the environment).
PROCESSED_FOLDER = "data/processed"

class DocumentRetriever:
    """Indexes the PDFs in ``PROCESSED_FOLDER`` into Chroma and searches them."""

    def __init__(self):
        """Create the embedding model and build/load the vector store."""
        self.db = None
        self.embeddings = HuggingFaceEmbeddings()
        self.load_database()

    @staticmethod
    def _dedupe_chunks(chunks):
        """Drop repeated chunks and derive a stable id for each remaining one.

        The id is a SHA-256 digest of the chunk's ``source`` and ``page``
        metadata plus its text, so the same chunk gets the same id on every
        run.

        Args:
            chunks: Iterable of objects with ``metadata`` (dict or None) and
                ``page_content`` (str).

        Returns:
            ``(unique_chunks, ids)``: two equally long lists in first-seen order.
        """
        import hashlib  # local import: only this helper needs it

        unique = {}
        for chunk in chunks:
            meta = chunk.metadata or {}
            key = "\x00".join(
                (str(meta.get("source", "")), str(meta.get("page", "")), chunk.page_content)
            )
            # backslashreplace keeps hashing from failing on odd characters
            # that PDF text extraction can produce.
            chunk_id = hashlib.sha256(key.encode("utf-8", errors="backslashreplace")).hexdigest()
            unique.setdefault(chunk_id, chunk)
        return list(unique.values()), list(unique.keys())

    def load_database(self):
        """Load the processed PDFs and build (or just open) the vector store.

        PDFs found in ``PROCESSED_FOLDER`` are split into 500-character chunks
        with 50 characters of overlap and written to the Chroma store at
        ``VECTOR_STORE_PATH``. When there is nothing to index, the existing
        persisted store is opened instead.
        """
        os.makedirs(PROCESSED_FOLDER, exist_ok=True)

        docs = []
        # Sorted for a reproducible load order; extension match is
        # case-insensitive so "*.PDF" files are not silently skipped.
        for file_name in sorted(os.listdir(PROCESSED_FOLDER)):
            if file_name.lower().endswith(".pdf"):
                loader = PyPDFLoader(os.path.join(PROCESSED_FOLDER, file_name))
                docs.extend(loader.load())

        texts, ids = [], []
        if docs:
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            texts, ids = self._dedupe_chunks(text_splitter.split_documents(docs))

        if texts:
            os.makedirs(VECTOR_STORE_PATH, exist_ok=True)

            # Explicit, content-derived ids make re-indexing idempotent.
            # Without them every new DocumentRetriever() appended the same
            # chunks to the persisted collection again, so searches returned
            # duplicate hits. Entries written earlier under random ids are
            # not cleaned up by this change.
            self.db = Chroma.from_documents(
                texts, self.embeddings, ids=ids, persist_directory=VECTOR_STORE_PATH
            )
            # Some Chroma wrappers no longer expose persist(); call it only
            # where it is still available.
            persist = getattr(self.db, "persist", None)
            if callable(persist):
                persist()
        else:
            self.db = Chroma(persist_directory=VECTOR_STORE_PATH, embedding_function=self.embeddings)

    def retrieve_documents(self, query, k=3):
        """Return the ``k`` stored chunks most similar to ``query``.

        Args:
            query: Text to search for.
            k: Number of results to request (defaults to 3).

        Returns:
            The result of the vector store's ``similarity_search``.
        """
        return self.db.similarity_search(query, k=k)

In [23]:
# Start a chat session; constructing Chatbot also builds its database,
# retriever and generator.
bot = Chatbot()

In [24]:
# Ask a sample question; chat() returns the model's response object together
# with the documents that were retrieved for it.
query = "What is the amount of tax I need to pay for my income?"
response, docs = bot.chat(query)

User Input What is the amount of tax I need to pay for my income?
Bot Response content="I can't answer that, as I don't have information about your specific income. However, I can tell you that according to Article 2 of the provided text, the Office of the Official Participant may claim a refund of Tax incurred by it on the import or supply of Goods or Services. It doesn't mention anything about the amount of tax that needs to be paid for income." additional_kwargs={} response_metadata={'token_usage': {'completion_tokens': 78, 'prompt_tokens': 376, 'total_tokens': 454, 'completion_time': 0.048962544, 'prompt_time': 0.060837974, 'queue_time': 0.278886054, 'total_time': 0.109800518}, 'model_name': 'llama-3.2-3b-preview', 'system_fingerprint': 'fp_a926bfdce1', 'finish_reason': 'stop', 'logprobs': None} id='run-6d07c4aa-9373-4c8f-a232-48a16a5dc80f-0' usage_metadata={'input_tokens': 376, 'output_tokens': 78, 'total_tokens': 454}


In [25]:
# Show the response object returned by bot.chat() for the query above.
response

AIMessage(content="I can't answer that, as I don't have information about your specific income. However, I can tell you that according to Article 2 of the provided text, the Office of the Official Participant may claim a refund of Tax incurred by it on the import or supply of Goods or Services. It doesn't mention anything about the amount of tax that needs to be paid for income.", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 78, 'prompt_tokens': 376, 'total_tokens': 454, 'completion_time': 0.048962544, 'prompt_time': 0.060837974, 'queue_time': 0.278886054, 'total_time': 0.109800518}, 'model_name': 'llama-3.2-3b-preview', 'system_fingerprint': 'fp_a926bfdce1', 'finish_reason': 'stop', 'logprobs': None}, id='run-6d07c4aa-9373-4c8f-a232-48a16a5dc80f-0', usage_metadata={'input_tokens': 376, 'output_tokens': 78, 'total_tokens': 454})

In [26]:
# Show the documents that were retrieved as context for the query above.
docs

[Document(metadata={'page': 2, 'page_label': '3', 'source': 'data/upload\\279926_cabinet_decision_1_2020__refund_of_vat_paid_on_goods_and_services_connected_with_expo_2020.pdf'}, page_content='work for those family members. \nConsideration All that is received or expected to be received for the supply \nof Goods or Services, whether in money or other acceptable \nforms of payment. \n \nArticle 2 – Refund of Tax \n1. The Office of the Official Participant may claim a refund of Tax incurred by it on \nthe import or supply of Goods or Services provided that the Goods and Services \nare any of the following: \na. In direct connection with the construction, installation, alteration,'),
 Document(metadata={'page': 2, 'page_label': '3', 'source': 'data/upload\\279926_cabinet_decision_1_2020__refund_of_vat_paid_on_goods_and_services_connected_with_expo_2020.pdf'}, page_content='work for those family members. \nConsideration All that is received or expected to be received for the supply \nof Go