In [23]:
import os
import chromadb
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.schema.runnable import RunnablePassthrough
from langchain.chains import create_retrieval_chain
from transformers import pipeline

# Install necessary dependencies for Google Colab
!pip install chromadb langchain python-docx sentence-transformers transformers torch unstructured

# Load a local transformer model for embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def get_embedding(text):
    return embedding_model.embed_documents([text])[0]

# Load a single DOCX document
def load_document(doc_path):
    if doc_path.endswith(".docx"):
        loader = UnstructuredWordDocumentLoader(doc_path)
        return loader.load()
    else:
        raise ValueError("Unsupported file format. Please provide a .docx file.")

# Split text into smaller chunks
def split_documents(documents, chunk_size=500, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

# Initialize vector database
def setup_vectorstore(docs, persist_directory="./chroma_db"):
    vectordb = Chroma.from_documents(
        documents=docs, embedding=embedding_model, persist_directory=persist_directory
    )
    return vectordb

# Load local transformer model for QA
qa_pipeline = pipeline("question-answering", model="facebook/bart-large-cnn")
llm = HuggingFacePipeline(pipeline=qa_pipeline)

def retrieve_and_answer(vectorstore, query):
    retriever = vectorstore.as_retriever()

    # Updated Retrieval Chain
    retrieval_chain = create_retrieval_chain(retriever, RunnablePassthrough())  # Pass the LLM, not RunnablePassthrough directly
    # Prepare the input as a dictionary
    result = retrieval_chain.invoke({"input": query})
    #retrieval_chain = create_retrieval_chain(retriever, llm)  # Pass the LLM, not RunnablePassthrough directly
    # Prepare the input as a dictionary
    #result = retrieval_chain.invoke({"input":query})


    return result

if __name__ == "__main__":
    doc_path = "/content/ChristUniversity.docx"  # Change this to your DOCX file path
    documents = load_document(doc_path)
    split_docs = split_documents(documents)
    vectorstore = setup_vectorstore(split_docs)
    print("✅ Vector database setup complete!")

    # Example query
    query = "What is the document about?"
    answer = retrieve_and_answer(vectorstore, query)
    print("Answer:", answer)




Some weights of BartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


✅ Vector database setup complete!
Answer: {'input': 'What is the document about?', 'context': [Document(metadata={'source': '/content/ChristUniversity.docx'}, page_content='holistic development programs that foster both intellectual and personal growth. With its state-of-the-art campus, modern infrastructure, and focus on research and innovation, Christ University attracts students from across the globe, offering a dynamic environment for learning and growth.'), Document(metadata={'source': '/content/ChristUniversity.docx'}, page_content='holistic development programs that foster both intellectual and personal growth. With its state-of-the-art campus, modern infrastructure, and focus on research and innovation, Christ University attracts students from across the globe, offering a dynamic environment for learning and growth.'), Document(metadata={'source': '/content/ChristUniversity.docx'}, page_content='holistic development programs that foster both intellectual and personal growth. Wi