# 1. IMPORTING LIBRARIES, FUNCTION AND DEFINING GLOBAL VARIABLES

## 1.1 Importing Libraries

In [1]:
# Library needed to load the pdf
import fitz

# Library needed to process the text using regular expressions
import re

# Library needed to display or process the data in forms of dataframes
import pandas as pd

# Library needed to handle the operations in deep learning
import torch

# Library needed to convert the data into arrays for faster processing
import numpy as np

# Library to handle operating system related operations
import os

## 1.2 Importing Functions

In [2]:
# (OPTIONAL) Function to beautify the waiting process with a loading bar
from tqdm.notebook import tqdm as tqdm

# Function to process the text in English
from spacy.lang.en import English

# Function to convert paragraphs to sentences
from sentence_transformers import SentenceTransformer

# Function to provide utility services to process the text such as tokenization, sentencizer
from sentence_transformers import util

# Functions for loading the LLM model
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig

# Function for fetching the paths to pdfs
from glob import glob

## 1.3 Defining Global Variables

In [3]:
# Global variable consisting of all the stop words
from spacy.lang.en import STOP_WORDS

# Global variable telling about the number of sentences in each chunk stored in the dictionary
SENTENCE_CHUNKS = 10

# Global variable storing the name of the model that is used for the embedding
EMBEDDING_MODEL = 'all-mpnet-base-v2'

# Global variable storing the names of the pdfs that are to be loaded to be fed into the RAG model
PDF_PATHS = list()

# Global variable storing the integer telling to fetch the top k similar records for further processing
K = 50

# Global variable storing the name of the LLM model that will be used for augmenting the similar data
LLM_MODEL = 'google/gemma-2b-it'

# (FOR TESTING) Global variable storing the query that user wants to ask
QUERY = "What is overfitting in machine learning? Explain in 500 words"

# Setting up the device agnostic code
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Global variable for setting the temperature of the LLM model i.e how much data should LLM generate
TEMPERATURE = 0.3

## Global variable defining the length of tokens that the LLM has to generate
MAX_NEW_TOKENS = 512

# 2. DATA ACQUISITION

## 2.1 Getting the paths to all the pdfs in the `Dataset` folder

In [None]:
PDF_PATHS = glob('.\\Dataset\\*.pdf')

In [None]:
for idx, path in tqdm(enumerate(PDF_PATHS), total=len(PDF_PATHS)):
    print(f"{idx+1}. {path[10:-4]}")

## 2.2 Opening all the documents

In [None]:
documents = list()
for path in tqdm(PDF_PATHS, total=len(PDF_PATHS)):
    doc = fitz.open(path)
    documents.append(doc)

In [None]:
for doc in tqdm(documents, total=len(documents)):
    print(doc)

## 2.3 Getting the text from all the documents

In [None]:
pages = dict()
for doc in tqdm(documents, total=len(documents)):
    for page_number, page in tqdm(enumerate(doc), total=len(doc)):
        page_number = len(pages)
        pages[page_number] = page.get_text()

In [None]:
for page_number, page in tqdm(pages.items(), total=len(documents)):
    print(f"{page_number}. {pages[page_number]}")
    print()

## 2.4 Getting the metadata of each page

In [None]:
pages_and_metadata = list()
for page_number, page in tqdm(pages.items(), total=len(pages)):
    metadata = dict()
    metadata['page_number'] = page_number
    metadata['raw_text'] = page
    metadata['number_of_characters'] = len(page)
    metadata['number_of_tokens'] = len(page)/4
    metadata['number_of_words'] = len(page.split())
    pages_and_metadata.append(metadata)

In [None]:
df = pd.DataFrame(pages_and_metadata)

In [None]:
df.describe().round(2)

In [None]:
print(f"The number of tokens this model is being trained on are: {df["number_of_tokens"].sum()/1000000:.2f} million tokens")

In [None]:
print(f"The number of pages in the database are: {len(pages_and_metadata)}")

## 2.4 Preprocessing the `raw_text` from metadata

In [None]:
def convert_to_lowercase(text):
    new_text = text.lower()
    return new_text

In [None]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in STOP_WORDS:
            new_text.append(word)
    return " ".join(new_text)

In [None]:
def remove_html_tags(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [None]:
def remove_newlines(text):
    new_text = re.sub(r"\n+", " ", text)
    return new_text

In [None]:
def remove_multiple_spaces(text):
    new_text = text.replace("  ", " ")
    return new_text

In [None]:
def remove_comments(text):
    new_text = re.sub(r"<!--.*?-->", "", text)
    return new_text

In [None]:
def remove_unnecessary_text(text):
    new_text = text.replace("answer:","").replace("question", "").replace(":","").replace("  "," ")
    return new_text

In [None]:
def preprocess_text(text):
    text = convert_to_lowercase(text)
    text = remove_stopwords(text)
    text = remove_html_tags(text)
    text = remove_newlines(text)
    text = remove_multiple_spaces(text)
    text = remove_comments(text)
    text = remove_unnecessary_text(text)
    return text

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page["formatted_text"] = preprocess_text(page["raw_text"])

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["formatted_text"])
    print()

## 2.5 Converting the paragraphs to sentences

In [None]:
nlp = English()
nlp.add_pipe('sentencizer')
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = nlp(page["formatted_text"]).sents
    sentences = list(set([str(sentence).strip() for sentence in sentences if len(str(sentence).split())>10]))
    pages_and_metadata[page["page_number"]]["sentences"] = sentences

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentences"])
    print()

## 2.6 Update the metadata

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    pages_and_metadata[page["page_number"]]["number_of_sentences"] = len(page["sentences"])

In [None]:
for key in pages_and_metadata[0].keys():
    print(key)

## 2.7 Converting sentences to sentence_chunks

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    sentences = pages_and_metadata[page["page_number"]]["sentences"]
    sentence_chunk = [sentences[i : i+SENTENCE_CHUNKS] for i in range(0, len(sentences), SENTENCE_CHUNKS)]
    page["sentence_chunk"] = sentence_chunk

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(page["sentence_chunk"])
    print()

## 2.8 Converting sentence_chunks into sentence embeddings

In [None]:
embedding_model = SentenceTransformer(model_name_or_path=EMBEDDING_MODEL).to(device)

In [None]:
for page in pages_and_metadata:
    embeddings = list()
    print(f"Processing Page {page['page_number']}")
    for sentence in page["sentences"]:
        # sentence = sentence.to(device)
        embedding = embedding_model.encode(sentence, batch_size=512, convert_to_tensor=True, show_progress_bar=False, device=device)
        embedding = np.stack(embedding.tolist(), axis=0)
        embedding = torch.tensor(embedding)
        embedding = embedding.type(torch.float32)
        embeddings.append(embedding)
    sentence_embeddings = [np.array(embedding) for embedding in embeddings]
    pages_and_metadata[page["page_number"]]["embeddings"] = sentence_embeddings

In [None]:
for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    print(np.array(page["embeddings"]).shape)

## 2.9 Checking the metadata present for use

In [None]:
for key in pages_and_metadata[0].keys():
    print(key)

# 3. FETCHING SIMILAR CONTENT

## 3.1 Getting the data embeddings

In [None]:
pages_and_metadata_embeddings = []

for page in tqdm(pages_and_metadata, total=len(pages_and_metadata)):
    page_embeddings = []
    for chunk_embedding in pages_and_metadata[page["page_number"]]["embeddings"]:
        if isinstance(chunk_embedding, torch.Tensor):
            chunk_embedding = chunk_embedding.tolist()
        page_embeddings.append(chunk_embedding)
    pages_and_metadata_embeddings.append(page_embeddings)

## 3.2 Converting each embedding into the same dimensions

In [None]:
if pages_and_metadata_embeddings:
    embedding_dim = len(pages_and_metadata_embeddings[0][0])
    pages_and_metadata_embeddings = [
            [np.pad(chunk, (0, max(0, embedding_dim - len(chunk))), mode='constant')[:embedding_dim]
             for chunk in page]
            for page in pages_and_metadata_embeddings
        ]

## 3.3 Flattening the nested list of embeddings and the sentence to fetch by index

In [None]:
flat_embeddings = [chunk for page in pages_and_metadata_embeddings for chunk in page]
flat_data = [sentence for page in pages_and_metadata for sentence in page["sentences"]]

## 3.4 Saving the flattened embeddings and the flattened data

In [None]:
df = pd.DataFrame(flat_embeddings)
df.to_csv("embeddings_v4.csv", index=False)

df = pd.DataFrame(flat_data)
df.to_csv("data_v4.csv", index=False)

## 3.5 Loading the flattened embeddings and flattened data

In [4]:
flat_embeddings = pd.read_csv("embeddings_v4.csv").to_numpy()
flat_data = pd.read_csv("data_v4.csv")["0"].tolist()

In [5]:
print(f"The size of the flat_embeddings is {len(flat_embeddings)}")
print(f"The size of the flat_data is {len(flat_data)}")

The size of the flat_embeddings is 39293
The size of the flat_data is 39293


## 3.6 Converting embeddings to numpy array

In [6]:
pages_and_metadata_embeddings = np.array(flat_embeddings, dtype=np.float32)

## 3.7 Converting the numpy array embeddings to torch tensors

In [7]:
pages_and_metadata_embeddings = torch.tensor(pages_and_metadata_embeddings, dtype=torch.float32).to(device)

## 3.8 Getting the similarity score by query

In [8]:
embedding_model = SentenceTransformer(EMBEDDING_MODEL)
query_embeddings = embedding_model.encode(QUERY, convert_to_tensor=True).to(device)
dot_score = util.dot_score(query_embeddings, pages_and_metadata_embeddings)[0]

## 3.9 Getting the top k similar scores

In [9]:
top_scores, top_indices = torch.topk(dot_score, k=K)

In [10]:
print(f"Top scores: {top_scores}")
print(f"Top indices: {top_indices}")

Top scores: tensor([0.7010, 0.6388, 0.6295, 0.6056, 0.6056, 0.6056, 0.5908, 0.5883, 0.5883,
        0.5865, 0.5865, 0.5824, 0.5809, 0.5809, 0.5684, 0.5684, 0.5669, 0.5636,
        0.5628, 0.5588, 0.5574, 0.5574, 0.5533, 0.5527, 0.5523, 0.5522, 0.5515,
        0.5514, 0.5509, 0.5448, 0.5448, 0.5445, 0.5439, 0.5436, 0.5414, 0.5392,
        0.5389, 0.5348, 0.5346, 0.5327, 0.5323, 0.5305, 0.5295, 0.5272, 0.5267,
        0.5264, 0.5262, 0.5258, 0.5257, 0.5257], device='cuda:0')
Top indices: tensor([ 1444,  3262, 28329, 38917, 28202, 34307, 38980,   669, 35475,  1407,
        28328,  1557, 28198, 34303,  7270, 13283,  4531, 38879, 12143, 27897,
         7214, 13227, 27921, 28136,  5486, 28000,  8058,  7032,  3345,   641,
        35447, 38156,  2654, 37872, 38978,  1560,  1418, 34178, 28609, 12096,
        38882, 36657, 32895, 34766, 21797,  7002, 32897,  6190, 37852, 15709],
       device='cuda:0')


## 3.10 Getting the top k content based on the scores

In [11]:
context = list()
for index in top_indices:
    print(f"Fetching data from page {index}")
    context.append(flat_data[index.item()])

Fetching data from page 1444
Fetching data from page 3262
Fetching data from page 28329
Fetching data from page 38917
Fetching data from page 28202
Fetching data from page 34307
Fetching data from page 38980
Fetching data from page 669
Fetching data from page 35475
Fetching data from page 1407
Fetching data from page 28328
Fetching data from page 1557
Fetching data from page 28198
Fetching data from page 34303
Fetching data from page 7270
Fetching data from page 13283
Fetching data from page 4531
Fetching data from page 38879
Fetching data from page 12143
Fetching data from page 27897
Fetching data from page 7214
Fetching data from page 13227
Fetching data from page 27921
Fetching data from page 28136
Fetching data from page 5486
Fetching data from page 28000
Fetching data from page 8058
Fetching data from page 7032
Fetching data from page 3345
Fetching data from page 641
Fetching data from page 35447
Fetching data from page 38156
Fetching data from page 2654
Fetching data from page 37

In [12]:
print(context)

['overfitting comes light data associated complexity, means associated parameters relative number observations.', 'main reasons long time understood, introduce concepts terms utilized extensively machine learning topics.', 'follow steve nouri ai data science posts https//lnkd.in/gzu463x overfitting, statistical model describes random error noise instead underlying relationship.', 'simple restatement fundamental problem machine learning possibility overfitting training data carrying noise data test set, providing inaccurate generalizations.', 'lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.', 'lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.', 'decision trees prone overfitting, pruning tree helps reduce size minimizes chances overfitting.', 'overfitting evaluation 83 (from weiss, s., kulikowski, c., computer systems learn, morgan kaufmann, 1991) training errors validation errors 1 2 3 4 5 6 7 8 9 0.2 0.4 0.6 

# 4. Augmentation

## 4.1 Login to HuggingFace CLI

In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 4.2 Loading the LLM model

In [14]:
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=LLM_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## 4.3 Augmenting the prompt for instructing the LLM in a better way

In [15]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
context = "\n -".join(context)
base_prompt = f'''Bases on the following context items, please answer the query
Context Items:
{context}
Query:
{QUERY}
Answer:'''

In [16]:
base_prompt = base_prompt.format(context=context, query=QUERY)

## 4.4 Creating the dialogue template for the LLM

In [17]:
dialogue_template = [{
    "role": "user",
    "content": base_prompt,
}]

## 4.5 Applying the prompt to the dialogue template

In [18]:
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

## 4.6 Providing the prompt and retrieving the answer from the LLM model

In [19]:
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
input_ids = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**input_ids, temperature=TEMPERATURE, do_sample=True, max_new_tokens=MAX_NEW_TOKENS)
output_text = tokenizer.decode(outputs[0])

In [20]:
print(output_text)

<bos><bos><start_of_turn>user
Bases on the following context items, please answer the query
Context Items:
overfitting comes light data associated complexity, means associated parameters relative number observations.
 -main reasons long time understood, introduce concepts terms utilized extensively machine learning topics.
 -follow steve nouri ai data science posts https//lnkd.in/gzu463x overfitting, statistical model describes random error noise instead underlying relationship.
 -simple restatement fundamental problem machine learning possibility overfitting training data carrying noise data test set, providing inaccurate generalizations.
 -lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.
 -lot data overﬁtting avoided, overﬁtting happens relatively small dataset, try learn it.
 -decision trees prone overfitting, pruning tree helps reduce size minimizes chances overfitting.
 -overfitting evaluation 83 (from weiss, s., kulikowski, c., computer syst

In [21]:
idx = output_text.find("Answer")

In [22]:
answer = output_text[idx+7:]

In [23]:
answer = answer.replace("**", "")
answer = answer.replace("<start_of_turn>model","")
answer = re.sub("<.*?>", "", answer)
# answer = answer[]

In [24]:
print(f"The cleaned answer is: {answer}")

The cleaned answer is: 

Sure, here's a 500-word explanation of overfitting in machine learning:

Overfitting is a major problem in machine learning (ML) that occurs when a model becomes too closely fit to the training data and fails to generalize well to new, unseen data. This can result in poor predictive performance, even on the same data that the model was trained on.

Key characteristics of overfitting:

* The model performs well on the training data but performs poorly on unseen data.
* The model becomes too complex and captures irrelevant features, leading to overfitting.
* The model's predictions become too specific to the training data and fail to capture the underlying patterns in the data.
* The model becomes insensitive to changes in the data, resulting in poor performance on unseen data.

Causes of overfitting:

* High training data quality: Overfitting can be exacerbated when the training data is of high quality, as it provides the model with more data points to learn fro