## 1. Importing necessary libraries and functions, and defining Global Variables

#### Importing Libraries and functions

In [5]:
import fitz
from tqdm.autonotebook import tqdm
from spacy.lang.en import English
import re
from spacy.lang.en import STOP_WORDS
import pandas as pd
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
from sentence_transformers import util
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig

#### Defining Global Variables

In [6]:
# variable storing the path of the dataset(PDF) that is to be loaded for the RAG model to fetch the data from
PDF_PATH = "Dataset.pdf"

# variable storing value of the chunk i.e. the number of sentences to be clubbed together to form a paragraph in a page
SENTENCE_CHUNK_SIZE = 5

# variable storing the name of the model that will be used to embed the sentence chunks to numerical values
EMBEDDING_MODEL = "all-mpnet-base-v2"

# variable storing the query the is to be passed into the RAG model
QUERY = "What is a Binary Tree?"

# variable storing the value i.e. the number of top answers to fetch based on the highest similarity scores
K = 3

# variable storing the name of the LLM that will be used for the text generation
LLM_MODEL_NAME = "google/gemma-2b-it"

## 2. Data Acquisition

#### Removing Stop words

In [7]:
def remove_stop_words(text: str) -> str:

    '''
    :param text:
    :return:

    This functions takes in a string and removes stop words from it.
    The stop words are defined in the global variable STOP_WORDS imported from spacy.lang.en
    '''

    # Split the text by space into a list of words
    text = text.split()

    # Initialize a list for storing the non-stop words
    cleaned_text = list()

    # Iterate over the split text
    for word in text:

        # If the current word is not in stop words add it to the cleaned text list
        if word not in STOP_WORDS:
            cleaned_text.append(word)

    # Return
    return " ".join(cleaned_text)

#### Converting the text to a desired format

In [8]:
def text_formatter(text: str) -> str:

    '''
    :param text:
    :return:

    The function takes in a string and formats according to the instructions provided
    '''

    # Instruction replacing newline character with space
    cleaned_text = text.replace("\n", " ").strip()

    # Instruction replacing dual spaces with single spaces
    cleaned_text = cleaned_text.replace("  ", " ")

    # Instruction converting the whole text to lowercase text
    cleaned_text = cleaned_text.lower()

    # Instruction replacing HTML comments with null string
    cleaned_text = re.sub(r"<!--.*?-->", "", cleaned_text)

    # Instruction replacing HTML tags with null string
    cleaned_text = re.sub(r"<.*?>", "", cleaned_text)

    # Instruction to remove the stop words from the text
    cleaned_text = remove_stop_words(cleaned_text)

    # Return
    return cleaned_text

#### Converting sentences to embedding vectors

In [9]:
def sentence_embedder(sentences: list[str]) -> list[list[float]]:

    '''
    :param sentences:
    :return:

    This function takes in a list of sentence chunks and returns a list of embeddings for each sentence.
    The model used for sentence embedding is defined in GLOBAL VARIABLES as EMBEDDING_MODEL.
    The function iterates over the sentence chunks, then converts each sentence chunk to an embedding and returns a list of embeddings for each sentence.
    '''

    embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL)
    embeddings = list()
    for sentence in sentences:
        embedding = embedding_model.encode(sentence,
                                           batch_size=32,
                                           convert_to_tensor=True,
                                           show_progress_bar=True)
        embedding = np.stack(embedding.tolist(), axis=0)
        embedding = torch.tensor(embedding)
        embedding = embedding.type(torch.float32)
        embeddings.append(embedding)
    return embeddings

#### Opening and Reading the PDF for the RAG model

In [10]:
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    '''
    :param pdf_path:
    :return:

    This function is used to open the pdf file and store it alongside its metadata in a list of dictionaries representing each page.
    '''

    # Open the PDF document
    doc = fitz.open(pdf_path)

    # List to store pages and their text data
    pages_and_text = []

    # Initialize spaCy model for sentence segmentation
    nlp = English()
    nlp.add_pipe("sentencizer")

    # Iterate over each page in the document
    for page_number, page in tqdm(enumerate(doc), total=len(doc)):
        text = page.get_text()

        # Format the text (Assuming `text_formatter` is defined elsewhere)
        formatted_text = text_formatter(text)

        # Split formatted text into sentences
        sentences = list(nlp(formatted_text).sents)
        sentences = [str(sentence) for sentence in sentences]

        # Skip pages with 1 or fewer sentences
        if len(sentences) <= 1:
            continue

        # Chunk sentences into groups (Assuming SENTENCE_CHUNK_SIZE is defined)
        sentence_chunks = [sentences[i: i + SENTENCE_CHUNK_SIZE] for i in range(0, len(sentences), SENTENCE_CHUNK_SIZE)]
        sentence_chunks = [" ".join(chunk) for chunk in sentence_chunks]

        # Generate embeddings for sentence chunks (Assuming `sentence_embedder` is defined)
        sentence_embeddings = sentence_embedder(sentence_chunks)

        # Ensure embeddings are NumPy arrays
        sentence_embeddings = [np.array(emb) for emb in sentence_embeddings]

        # Add page data to the list
        pages_and_text.append({
            "page_number": page_number + 1,
            "raw_text": text,
            "formatted_text": formatted_text,
            "number_of_sentences": len(sentences),
            "number_of_words": len(formatted_text.split()),
            "number_of_tokens": len(formatted_text) / 4,  # Rough estimate
            "sentences": sentences,
            "sentence_chunks": sentence_chunks,
            "number_of_sentence_chunks": len(sentence_chunks),
            "sentence_chunk_embeddings": sentence_embeddings
        })

    return pages_and_text

## 3. Fetch similarity

#### Fetching the embeddings of the data and converting them to a desired format of (number of documents, number of embeddings)

In [11]:
# Function to extract embeddings and convert to PyTorch tensor
def get_data_embeddings(pages_and_text: list[dict]) -> torch.Tensor:

    '''
    :param pages_and_text:
    :return:

    This function takes in a list of pages and returns a tensor of embeddings for each page.
    '''

    # List to store all embeddings
    pages_and_text_embeddings = []

    # Iterate through pages and collect sentence chunk embeddings
    for page in tqdm(pages_and_text):
        page_embeddings = []

        for chunk_embedding in page["sentence_chunk_embeddings"]:
            # Convert to list if it's a tensor
            if isinstance(chunk_embedding, torch.Tensor):
                chunk_embedding = chunk_embedding.tolist()
            page_embeddings.append(chunk_embedding)

        # Append the page's embeddings
        pages_and_text_embeddings.append(page_embeddings)

    # Ensure all embeddings have the same dimension
    if pages_and_text_embeddings:
        embedding_dim = len(pages_and_text_embeddings[0][0])  # Get the first embedding's size

        # Pad or truncate embeddings if necessary
        pages_and_text_embeddings = [
            [np.pad(chunk, (0, max(0, embedding_dim - len(chunk))), mode='constant')[:embedding_dim]
             for chunk in page]
            for page in pages_and_text_embeddings
        ]

    # Flatten nested list
    flat_embeddings = [chunk for page in pages_and_text_embeddings for chunk in page]

    # Convert to NumPy array
    pages_and_text_embeddings = np.array(flat_embeddings, dtype=np.float32)

    # Convert to a PyTorch tensor
    pages_and_text_embeddings = torch.tensor(pages_and_text_embeddings, dtype=torch.float32)
    print(f"LENGTH OF PAGES_AND_TEXT_EMBEDDINGS: {len(pages_and_text_embeddings)} AFTER")
    return pages_and_text_embeddings

#### Getting the similarity scores using dot product between the query and the document embeddings

In [12]:
def get_similarity_score_by_query(query: str, pages_and_text_embeddings: list[list[float]]) -> list[dict]:

    '''
    :param query:
    :param pages_and_text_embeddings:
    :return:

    This function takes in the query and the embedding of the document as returns the similarity score between the query and the pages of the documents.
    '''

    # Initializing the embedding model for converting the query to numbers
    embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL)

    # Converting the query to numbers using the embedding model
    query_embedding = embedding_model.encode(query, convert_to_tensor=True).to("cpu")

    # Finding the dot scores between the query embeddings and the document embeddings
    dot_score = util.dot_score(query_embedding, pages_and_text_embeddings)[0]

    # Return
    return dot_score

#### Getting the top k similarity scores

In [13]:
def get_top_k_similarity_scores(dot_scores: list[float], k: int):

    '''
    :param dot_scores:
    :param k:
    :return:

    Function that returns the top k similarity scores.
    '''

    # Return the top k similarity scores
    return torch.topk(dot_scores, k=k)

#### Getting the items having the top k similarity scores

In [14]:
def get_top_k_similar_data(dot_scores, pages_and_text):

    '''
    :param dot_scores:
    :param pages_and_text:
    :return:

    The function fetches the pages that have the highest similarity scores.
    '''

    context = list()
    for index in dot_scores[1]:
        print(f"Fetching data from page number: {index.item()}")
        context.append(pages_and_text[index.item()//SENTENCE_CHUNK_SIZE]["formatted_text"])
    return context

## 4. Augmentation

In [15]:
def hugging_face_login():

    '''
    :return:

    This model is used to log in to huggingface.co.
    '''

    from huggingface_hub import notebook_login
    notebook_login()

#### Loading the LLM model for augmentation

In [16]:
def load_llm_model(model_name: str):

    '''
    :param model_name:
    :return:

    This function takes in the name of the model and loads it into a model instance.
    '''

    # Load the model from the transformers (HuggingFace)
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=False,
    )

    # Return
    return model

#### Getting the model information

In [17]:
def get_model_info(model):

    '''
    :param model:
    :return:

    This model takes in a model instance and returns a dictionary containing information about the model.
    '''

    model_information = dict()
    try:
        model_information["model_name"] = LLM_MODEL_NAME
    except:
        pass
    try:
        model_information["vocab_size"] = model.config.vocab_size
    except:
        pass
    try:
        model_information["max_position_embeddings"] = model.config.max_position_embeddings
    except:
        pass
    try:
        model_information["num_hidden_layers"] = model.config.num_hidden_layers
    except:
        pass
    try:
        model_information["num_attention_heads"] = model.config.num_attention_heads
    except:
        pass
    try:
        model_information["hidden_size"] = model.config.hidden_size
    except:
        pass
    try:
        model_information["num_labels"] = model.config.num_labels
    except:
        pass
    try:
        model_information["num_key_value_heads"] = model.config.num_key_value_heads
    except:
        pass
    try:
        model_information["layer_norm_eps"] = model.config.layer_norm_eps
    except:
        pass
    try:
        mem_params = sum([param.nelement() for param in model.parameters()])
        mem_buffers = sum([buf.nelement() for buf in model.buffers()])
        model_mem_bytes = mem_params + mem_buffers
        model_information["model_memory_bytes"] = model_mem_bytes
        model_information["model_memory_in_megabytes"] = model_mem_bytes / (1024 * 1024)
        model_information["model_memory_in_gigabytes"] = model_mem_bytes / (1024 * 1024 * 1024)
    except:
        pass
    try:
        model_information["number_of_parameters"] = sum([param.numel() for param in model.parameters()])
    except:
        pass
    return model_information

#### Augmenting the prompt for instructing the LLM in a better way

In [18]:
def prompt_formatter(query: str, related_pages_and_text: list[str], llm_model_name) -> str:

    '''
    :param query:
    :param related_pages_and_text:
    :param llm_model_name:
    :return:

    This function is responsible for formatting the query into a prompt for the LLM, this prompt will consist of:
    1. Format the LLM will answer in
    2. Context using which the LLM will answer the query
    3. Query
    '''

    # Initializing the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

    # Joining all the similar content into a single paragraph
    context = "\n -".join(related_pages_and_text)

    # Creating a prompt format in which the data will be filled
    base_prompt = """Based on the following context items, please answer the query
    Context items:
    {context}
    Query: {query}
    Answer:"""

    # Augmenting the base prompt to create the final prompt that would be passed to the LLM
    base_prompt = base_prompt.format(context=context, query=query)

    # Creating the dialogue template
    dialogue_template = [{
        "role": "user",
        "content": base_prompt
    }]

    # Applying the prompt to the dialogue template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Return
    return prompt

#### Providing the prompt and retrieving the answer from it

In [19]:
def ask_question(prompt: str, llm_model, llm_model_name, temperature: int=0.4, do_sample: bool=True, max_new_tokens: int=256):

    '''
    :param prompt:
    :param llm_model:
    :param llm_model_name:
    :param temperature:
    :param do_sample:
    :param max_new_tokens:
    :return:

    This function is responsible for asking the question using the LLM and returning the answer after the augmentation based on the similar context.
    '''

    # Initializing the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

    # Fetching the mapping from the tokenizer in PyTorch tensor format
    input_ids = tokenizer(prompt, return_tensors="pt")

    # Generating the output by passing in the prompt and context alongside various other parameters to the LLM model
    outputs = llm_model.generate(**input_ids, temperature=temperature, do_sample=do_sample, max_new_tokens=max_new_tokens)

    # Decoding the encoded (embedded) output
    output_text = tokenizer.decode(outputs[0])

    # Return
    return output_text

# PROJECT FLOW

In [20]:
pages_and_text = open_and_read_pdf(pdf_path=PDF_PATH)

  0%|          | 0/105 [00:00<?, ?it/s]
Batches:   0%|          | 0/1 [00:00<?, ?it/s][A
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s][A

Batches: 100%|██████████| 1/1 [00:00<00:00, 45.38it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 52.50it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 59.21it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 93.51it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 72.95it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 39.93it/s]
  sentence_embeddings = [np.array(emb) for emb in sentence_embeddings]
  1%|          | 1/105 [00:09<16:04,  9.28s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00, 101.80it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 110.94it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 105.09it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 64.15it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 108.78it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 96.15it/s]

Batches: 100%|██████████| 1/1 [00:00<00:00, 71.90it/s]
  

KeyboardInterrupt: 

In [None]:
data_embeddings = get_data_embeddings(pages_and_text=pages_and_text)

In [None]:
query = QUERY
dot_scores = get_similarity_score_by_query(query=query, pages_and_text_embeddings=data_embeddings)

In [None]:
k = K
top_k_similarity_scores = get_top_k_similarity_scores(dot_scores=dot_scores, k=k)

In [None]:
similar_top_k_context = get_top_k_similar_data(top_k_similarity_scores, pages_and_text)

In [None]:
hugging_face_login()

In [None]:
llm_model = load_llm_model(model_name=LLM_MODEL_NAME)

In [None]:
get_model_info(llm_model)

In [None]:
prompt = prompt_formatter(QUERY, similar_top_k_context, LLM_MODEL_NAME)
print(prompt)

In [None]:
ask_question(prompt, llm_model, LLM_MODEL_NAME)