In [1]:
import torch
torch.cuda.empty_cache()

!pip install -qU langchain rank_bm25 bitsandbytes accelerate peft safetensors sentencepiece unstructured tiktoken langchain_community langchain_chroma langchain-huggingface huggingface-hub sentence_transformers chromadb langchainhub transformers
!pip install -qU flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m939.2 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m436.4/436.4 kB[0m [31m33.1 MB/s[0m eta [36m0

In [24]:
from langchain_huggingface import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from IPython.display import display, Markdown
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Initialize embeddings
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model_kwargs = {"device": "cuda"}
embedding_encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Initialize vector store and retriever
vectorstore = Chroma(
    persist_directory="/content/drive/MyDrive/UWA/Sem 4/Capstone/Project/vector1",
    embedding_function=hf
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Initialize the language model
model_name = "microsoft/Phi-3.5-mini-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda",
    trust_remote_code=True,
    attn_implementation="flash_attention_2"
)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1000)
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
# Function to calculate the total number of tokens in the vector database
def count_total_tokens_in_vectorstore(vectorstore, tokenizer):
    # Retrieve all documents from the vector store
    all_docs = vectorstore.get()['documents']

    total_tokens = 0

    # Iterate over each document and calculate the number of tokens
    for doc in all_docs:
        tokens_in_doc = len(tokenizer.encode(doc))  # Tokenize the document content (which is a string)
        total_tokens += tokens_in_doc

    return total_tokens

# Count the total number of tokens in the vector store
#total_tokens_in_vectorstore = count_total_tokens_in_vectorstore(vectorstore, tokenizer)
#print(f"Total number of tokens in the vector database: {total_tokens_in_vectorstore}")


# Step 2: Set up BM25 retriever (if the raw text is available)
# Assuming vectorstore stores text content, we initialize BM25Retriever
bm25_retriever = None

def initialize_bm25_retriever(docs):
    """Initialize BM25Retriever with raw document text."""
    global bm25_retriever
    # Check if docs have page_content and convert if necessary
    if docs:
        formatted_docs = [doc.page_content if hasattr(doc, "page_content") else doc for doc in docs]
        bm25_retriever = BM25Retriever.from_texts(formatted_docs)  # Create BM25 retriever




Total number of tokens in the vector database: 1052988

In [39]:
# Define the RAG Chat Model class
class RAGChatModel:
    def __init__(self, retriever, llm, tokenizer, max_token_limit=count_total_tokens_in_vectorstore(vectorstore, tokenizer)//10):
        self.retriever = retriever
        self.llm = llm
        self.tokenizer = tokenizer
        self.max_token_limit = max_token_limit
        self.current_token_count = 0
        self.template_standard = """
        <|system|>
        Answer the question and all the page numbers where this information is found based in the information provided in the context.
        Providing all the relevant page numbers is essential.

        Context: {context}

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """
        self.template_exceeded = """
        <|system|>
        Answer the question in detail; warn that information is not taken from the prescribed textbook and provide the page numbers where they can find the correct information in the prescribed textbook.

        Context: {context}
        Providing all the relevant page numbers is essential.
        <|end|>

        <|user|>
        Question: {question}
        <|end|>

        <|assistant|>
        """

    def num_tokens_from_string(self, string: str) -> int:
        """Returns the number of tokens in a text string using the tokenizer."""
        return len(self.tokenizer.encode(string))

    def format_docs(self, docs, full_content=True):
        """Format the documents to be used as context in the prompt."""
        if full_content:
            return "\n\n".join(f"Information in Page number: {(doc['metadata'].get('page', 0)+1)}\n{doc['page_content']}" for doc in docs)
        else:
            return "Information available in prescribed textbook " + ", ".join(f"Page number: {doc['metadata'].get('page', 0)}" for doc in docs)

    def get_prompt(self, docs, question):
        """Generate the prompt based on token count and context formatting."""
        # Format the context with full content
        context = self.format_docs(docs, full_content=True)
        total_tokens_in_context = self.num_tokens_from_string(context)

        # Add tokens to the running total
        self.current_token_count += total_tokens_in_context

        # Decide whether to use full content or only page numbers
        if self.current_token_count > self.max_token_limit:
            print("Token limit exceeded. Information from prescribed textbook will not be used.")
            # Reformat context to include only page numbers
            context = self.format_docs(docs, full_content=False)
            template = self.template_exceeded
        else:
            template = self.template_standard

        # Create the prompt
        prompt = template.format(context=context, question=question)
        return prompt

    def extract_clean_answer(self, raw_output):
        """Extract only the answer from the raw output."""
        assistant_tag = "<|assistant|>"
        if assistant_tag in raw_output:
            clean_answer = raw_output.split(assistant_tag)[-1].strip()
            return clean_answer
        return raw_output.strip()

    def ask_question(self, question):
        """Main function to retrieve relevant docs and generate a response."""
        # Retrieve relevant documents from vectorstore
        docs = self.retriever.invoke(question)

        # Initialize BM25 retriever if not already initialized
        initialize_bm25_retriever(docs)

        # Re-rank documents using BM25 (if initialized)
        if bm25_retriever:
            bm25_results = bm25_retriever.get_relevant_documents(question)
            docs = bm25_results  # Re-rank the retrieved docs with BM25

        # Generate prompt based on token count
        prompt = self.get_prompt(docs, question)

        # Pass the prompt to the LLM
        result = self.llm.generate([prompt])

        # Extract the generated text
        raw_answer = result.generations[0][0].text

        # Get the clean answer
        clean_answer = self.extract_clean_answer(raw_answer)

        # Display the answer
        display(Markdown(clean_answer))

# Initialize the RAGChatModel
rag_chat_model = RAGChatModel(retriever, llm, tokenizer)

# Start the interactive chat
print("Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):")
while True:
    print("\n\n")
    question = input("Your question: ")
    if question.lower() == 'exit':
        print("Exiting the chat.")
        break
    rag_chat_model.ask_question(question)

Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):



Your question: what is entropy from the perspective of a molecule?


  bm25_results = bm25_retriever.get_relevant_documents(question)


TypeError: 'Document' object is not subscriptable

In [32]:
# Initialize the RAGChatModel
rag_chat_model = RAGChatModel(retriever, llm, tokenizer)

In [33]:
# Start the interactive chat
print("Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):")
while True:
  print("\n\n")
  question = input("Your question: ")
  if question.lower() == 'exit':
    print("Exiting the chat.")
    break
  rag_chat_model.ask_question(question)

Welcome to the RAG Chat Model! Ask any question (type 'exit' to quit):



Your question: what is entropy from the perspective of a molecule?


AttributeError: 'str' object has no attribute 'page_content'

what is entropy from the perspective of a molecule? page - 1053,1112

