In [11]:
import bs4
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama.llms import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
import os

In [2]:
async def get_website_docs(urls_path='./data/ITFAQ_websit.txt'):
    """Load the web pages whose URLs are listed in a text file.

    The file at ``urls_path`` is read line by line; each line is stripped of
    surrounding whitespace and empty lines are dropped. The remaining URLs
    are handed to ``WebBaseLoader`` and every document yielded by its
    ``alazy_load()`` async iterator is collected.

    Args:
        urls_path: Path to a text file containing one URL per line.

    Returns:
        A list of the loaded documents, in the order they were yielded.
    """
    # Read and normalise the URL list, ignoring blank lines.
    with open(urls_path, 'r') as url_file:
        stripped_lines = (line.strip() for line in url_file)
        targets = [entry for entry in stripped_lines if entry]

    # Drain the loader's async iterator into a plain list.
    pages = [page async for page in WebBaseLoader(web_paths=targets).alazy_load()]

    print(f"Number of websites loaded: {len(pages)}")
    return pages

def get_document_docs(doc_folder = './data/policies/'):
    """Load every PDF and DOCX file found directly inside ``doc_folder``.

    Entries are visited in sorted name order so the result does not depend
    on the order ``os.listdir`` happens to return. Entries whose name does
    not end in ``.pdf`` or ``.docx`` (compared case-insensitively) are
    skipped.

    Args:
        doc_folder: Directory to scan (not recursive).

    Returns:
        A flat list of everything returned by each loader's ``load()`` call,
        in file-name order.
    """
    doc_docs = []
    for file in sorted(os.listdir(doc_folder)):
        lowered = file.lower()
        path = os.path.join(doc_folder, file)
        if lowered.endswith('.pdf'):
            loader = PyPDFLoader(path)
        elif lowered.endswith('.docx'):
            loader = Docx2txtLoader(path)
        else:
            # Unsupported entry: without this skip, `loader` would either be
            # unbound (first entry) or still point at the previous file,
            # which would then be loaded a second time.
            continue
        doc_docs.extend(loader.load())

    print(f"number of documents loaded: {len(doc_docs)}")
    return doc_docs

In [3]:
# Load the local PDF/DOCX files from the default folder (./data/policies/).
doc_docs = get_document_docs()

number of documents loaded: 70


In [4]:
# Fetch the pages listed in the default URL file. get_website_docs is a
# coroutine function, so it is awaited directly at the top level of the cell.
web_docs = await get_website_docs()

Fetching pages: 100%|##################################################################| 10/10 [00:03<00:00,  2.95it/s]


Number of websites loaded: 10


In [5]:
# Split local and web documents together into 500-character chunks that
# overlap by 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunked_documents = text_splitter.split_documents([*doc_docs, *web_docs])

In [6]:
# Embedding model served through Ollama; used when building the vector store.
# NOTE(review): "deepseek-r1" is the same model name used for generation —
# confirm it is the intended embedding model rather than a dedicated one.
embeddings = OllamaEmbeddings(model="deepseek-r1")

In [7]:
# Ollama-served model that generates the answers; passed to the document chain.
llm = OllamaLLM(model="deepseek-r1")

In [8]:
# Build a FAISS vector store from the chunks, embedding them with `embeddings`.
vector_store = FAISS.from_documents(chunked_documents, embeddings)

In [12]:
# Prompt for the question-answering chain. `{context}` and `{input}` are the
# template variables filled in when the retrieval chain runs. The context
# block is wrapped in a properly closed <context>...</context> pair so the
# model can tell where the retrieved text ends.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the question based on the provided context only.
    Please provide the most accurate response based on the question
    <context>
    {context}
    </context>
    Question: {input}
    """
)

In [13]:
# Retriever over the FAISS store (default search settings).
retriever = vector_store.as_retriever()
# Chain that inserts the retrieved documents into `prompt` and calls the LLM.
doc_chain = create_stuff_documents_chain(llm, prompt)
# End-to-end chain: retrieve documents, then answer with `doc_chain`.
retriever_chain = create_retrieval_chain(retriever, doc_chain)