Local RAG pipeline built from scratch (not using LlamaIndex or LangChain)

# Libraries Installation

In [None]:
import os
!pip install -U torch --timeout=1000
!pip install PyMuPDF --timeout=1000
!pip install tqdm --timeout=1000
!pip install sentence-transformers --timeout=1000
!pip install accelerate --timeout=1000 # for quantization model loading
!pip install bitsandbytes --timeout=1000 # for quantizing models
!pip install flash-attn --no-build-isolation --timeout=1000# for faster attention mechanism = faster LLM inference

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.10 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.10-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.10-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m106.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.10 PyMuPDFb-1.24.10
Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# PDF Data Processing

The process is as follows:
1. Break the doc into pages
2. Break the pages into sentences
3. Break the sentences into chunks (to avoid exceeding the embedding model's token capacity)

Tokens can be thought of as pieces of words. Before the API processes the request, the input is broken down into tokens. These tokens are not cut up exactly where the words start or end - tokens can include trailing spaces and even sub-words. Here are some helpful rules of thumb for understanding tokens in terms of lengths:

1. **1 token ~= 4 chars in English**
2. 1 token ~= ¾ words
3. 100 tokens ~= 75 words

In [None]:
doc_path = 'nutrition.pdf'
import pymupdf
from tqdm import tqdm


def open_and_read_pdf(pdf_path):
  """Read a PDF and return one record of cleaned text plus rough statistics per page.

  Args:
    pdf_path: Path of the PDF file to open.

  Returns:
    A list with one dict per page, holding:
      - "page_number": 1-based page index
      - "page_char_count", "page_word_count", "page_sentence_count_raw",
        "page_token_count": statistics computed on the raw extracted text
        (token count uses the 1 token ~= 4 chars rule of thumb)
      - "text": the cleaned page text (whitespace collapsed, lowercased)
  """
  def clean_text(text):
    # Turn newlines into spaces, collapse whitespace runs, then lowercase.
    return " ".join(text.replace("\n", " ").split()).lower()

  # Open the file named by the argument, not the notebook-level doc_path.
  doc = pymupdf.open(pdf_path)
  pages_and_texts = []
  for page_number, page in tqdm(enumerate(doc)):
    text = page.get_text()
    pages_and_texts.append({"page_number": page_number + 1,
                            "page_char_count": len(text),
                            "page_word_count": len(text.split(" ")),
                            "page_sentence_count_raw": len(text.split(". ")),
                            "page_token_count": len(text) / 4,  # 1 token ~= 4 chars
                            "text": clean_text(text)})
  return pages_and_texts


pages_and_texts = open_and_read_pdf(doc_path)

1208it [00:01, 610.66it/s]


In [None]:
import pandas as pd

df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,1,31,5,1,7.75,human nutrition: 2020 edition
1,2,0,1,1,0.0,
2,3,322,43,1,80.5,human nutrition: 2020 edition university of ha...
3,4,214,31,2,53.5,human nutrition: 2020 edition by university of...
4,5,799,115,2,199.75,contents preface university of hawai‘i at māno...


In [None]:
avg_token_count=df['page_token_count'].mean()
print('The avg nb of tokens in a page is: ',avg_token_count)

The avg nb of tokens in a page is:  287.25289735099335


In [None]:
max_token_count=df['page_token_count'].max()
print('The max nb of tokens in a page is: ',max_token_count)

The max nb of tokens in a page is:  577.25


In [None]:
from spacy.lang.en import English

def break_into_sentences_spacy(pages_and_texts):
  """Split every page's text into sentences with spaCy, updating the records in place.

  Each dict in `pages_and_texts` gains two keys:
    - "sentences": the page's sentences as plain strings
    - "page_sentence_count_spacy": how many sentences were found
  """
  sentence_splitter = English()
  # The sentencizer component detects sentence boundaries.
  sentence_splitter.add_pipe("sentencizer")
  for page in tqdm(pages_and_texts):
      # Convert each sentence span to a plain string as it is collected.
      page_sentences = [str(span) for span in sentence_splitter(page["text"]).sents]
      page["sentences"] = page_sentences
      page["page_sentence_count_spacy"] = len(page_sentences)
break_into_sentences_spacy(pages_and_texts)

100%|██████████| 1208/1208 [00:01<00:00, 918.92it/s]


In [None]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy
0,1,31,5,1,7.75,human nutrition: 2020 edition,[human nutrition: 2020 edition],1
1,2,0,1,1,0.0,,[],0
2,3,322,43,1,80.5,human nutrition: 2020 edition university of ha...,[human nutrition: 2020 edition university of h...,1
3,4,214,31,2,53.5,human nutrition: 2020 edition by university of...,[human nutrition: 2020 edition by university o...,1
4,5,799,115,2,199.75,contents preface university of hawai‘i at māno...,[contents preface university of hawai‘i at mān...,2


In [None]:
avg_page_sentence_count_spacy=df['page_sentence_count_spacy'].mean()
print('The avg_page_sentence_count_spacy is: ',avg_page_sentence_count_spacy)

The avg_page_sentence_count_spacy is:  10.350165562913908


On average each of our pages has 10 sentences.
And an average total of 287 tokens per page.
So our groups of 10 sentences will also be ~287 tokens long.

In [None]:
sentence_chunk_size = 10

def split_into_chunks(pages_and_texts):
  """Group each page's sentences into consecutive chunks, updating the records in place.

  Each dict in `pages_and_texts` gains "sentence_chunks" (a list of sentence
  lists, each at most `sentence_chunk_size` long) and "num_chunks".
  """
  def split_list(input_list: list, sentence_chunk_size: int):
    # Walk the list in strides of sentence_chunk_size; the last piece may be shorter.
    pieces = []
    for start in range(0, len(input_list), sentence_chunk_size):
      pieces.append(input_list[start:start + sentence_chunk_size])
    return pieces

  for page in tqdm(pages_and_texts):
      groups = split_list(input_list=page["sentences"], sentence_chunk_size=sentence_chunk_size)
      page["sentence_chunks"] = groups
      page["num_chunks"] = len(groups)
split_into_chunks(pages_and_texts)

100%|██████████| 1208/1208 [00:00<00:00, 297814.57it/s]


In [None]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,1,31,5,1,7.75,human nutrition: 2020 edition,[human nutrition: 2020 edition],1,[[human nutrition: 2020 edition]],1
1,2,0,1,1,0.0,,[],0,[],0
2,3,322,43,1,80.5,human nutrition: 2020 edition university of ha...,[human nutrition: 2020 edition university of h...,1,[[human nutrition: 2020 edition university of ...,1
3,4,214,31,2,53.5,human nutrition: 2020 edition by university of...,[human nutrition: 2020 edition by university o...,1,[[human nutrition: 2020 edition by university ...,1
4,5,799,115,2,199.75,contents preface university of hawai‘i at māno...,[contents preface university of hawai‘i at mān...,2,[[contents preface university of hawai‘i at mā...,1


In [None]:
import re

def create_chunks(pages_and_texts):
  """Flatten the per-page sentence groups into one record per chunk.

  Args:
    pages_and_texts: list of page dicts, each holding "page_number" and
      "sentence_chunks" (a list of lists of sentence strings).

  Returns:
    A list of dicts with keys "page_number", "sentence_chunk" (the chunk as a
    single string), "chunk_char_count", "chunk_word_count" and
    "chunk_token_count" (estimated with 1 token ~= 4 characters).
  """
  chunks = []
  for item in tqdm(pages_and_texts):
      for sentence_chunk in item["sentence_chunks"]:
          chunk_dict = {}
          chunk_dict["page_number"] = item["page_number"]

          # Join the sentences with a space. The sentence strings do not end with
          # whitespace (see spaCy Span.text), so "".join glued them together, and
          # the ".A" -> ". A" repair below cannot help because the page text is
          # lowercased upstream.
          joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
          # Kept for input that still contains capitals: ".A" -> ". A"
          joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)
          chunk_dict["sentence_chunk"] = joined_sentence_chunk

          # Get stats about the chunk
          chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
          chunk_dict["chunk_word_count"] = len(joined_sentence_chunk.split(" "))
          chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters

          chunks.append(chunk_dict)
  return chunks
chunks=create_chunks(pages_and_texts)

100%|██████████| 1208/1208 [00:00<00:00, 38737.27it/s]


In [None]:
len(chunks)

1845

In [None]:
chunks_df = pd.DataFrame(chunks)
chunks_df.head()

Unnamed: 0,page_number,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,1,human nutrition: 2020 edition,29,4,7.25
1,3,human nutrition: 2020 edition university of ha...,308,42,77.0
2,4,human nutrition: 2020 edition by university of...,210,30,52.5
3,5,contents preface university of hawai‘i at māno...,765,115,191.25
4,6,lifestyles and nutrition university of hawai‘i...,939,142,234.75


Assuming that chunks with at most 20 tokens (≈15 words) are irrelevant (page titles, etc.), let's filter them out.

In [None]:
min_token_length = 20
# Keep only the chunks whose estimated token count exceeds the threshold,
# then convert back to a list of record dicts.
long_enough = chunks_df["chunk_token_count"] > min_token_length
chunks = chunks_df.loc[long_enough].to_dict(orient="records")

In [None]:
chunks_df = pd.DataFrame(chunks)
chunks_df.to_csv('chunks.csv', index=False)

In [None]:
len(chunks)

1748

# Embedding the Data

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",device="cuda")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
"""text_chunks = [item["sentence_chunk"] for item in chunks]
text_chunk_embeddings = embedding_model.encode(text_chunks,batch_size=32,convert_to_tensor=True)"""
embedding_model.to("cuda")
# Encode all chunks in batches rather than one call per chunk, so the GPU
# processes many texts per forward pass.
text_chunks = [item["sentence_chunk"] for item in chunks]
text_chunk_embeddings = embedding_model.encode(text_chunks, batch_size=32, show_progress_bar=True)
# Attach each embedding vector (one row per chunk, in input order) to its record.
for item, embedding in zip(chunks, text_chunk_embeddings):
    item["embedding"] = embedding

100%|██████████| 1748/1748 [00:38<00:00, 45.18it/s]


In [None]:
text_chunks_and_embeddings_df = pd.DataFrame(chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
len(text_chunks_and_embeddings_df['embedding'][0])

768

Our model turns text inputs up to 384 tokens long into embedding vectors of size 768.

# HERE !!!

In [None]:
"""import pandas as pd
text_chunks_and_embedding_df2 = pd.read_csv('/content/text_chunks_and_embeddings_df.csv')
len(text_chunks_and_embedding_df2['embedding'][0])"""


In [None]:
# torch is otherwise first imported in the LLM section further down, so import
# it here to keep this cell runnable on a fresh kernel.
import torch

# Stack the per-chunk embedding vectors into a single 2-D tensor and move it to the GPU.
embeddings = torch.stack([torch.tensor(embedding) for embedding in text_chunks_and_embeddings_df['embedding']]).to("cuda")


In [None]:
embeddings.shape

torch.Size([1748, 768])

Since our dataset is relatively small (1748 examples), I'm not going to use a vector database to store them.

# Semantic Search

The 2 most common vector similarity metrics you'll come across are the dot product and cosine similarity. The **main difference** between them is that **cosine similarity has a normalization step**.

For text similarity, you generally want to use cosine similarity as you are after the semantic measurements (direction) rather than magnitude.

In **our case**, our embedding model all-mpnet-base-v2 outputs **normalized outputs** so dot product and cosine similarity return the same results.

In [None]:
query = "what are the macronutrients functions"
k=5

In [None]:
#text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))
pages_and_chunks = text_chunks_and_embeddings_df.to_dict(orient="records")

In [None]:
import textwrap
from sentence_transformers import util


def get_dotproduct_results(query, k, embeddings):
  """Return the k highest dot-product scores between a query and the chunk embeddings.

  Args:
    query: the search text, embedded with the notebook-level `embedding_model`.
    k: how many top results to keep.
    embeddings: tensor of chunk embeddings to score the query against.

  Returns:
    A `(scores, indices)` pair of tensors as produced by `torch.topk`.
  """
  encoded_query = embedding_model.encode(query, convert_to_tensor=True)
  # dot_score returns one row of scores per query; there is a single query here.
  similarity = util.dot_score(a=encoded_query, b=embeddings)
  top_scores, top_indices = torch.topk(similarity[0], k=k)
  return top_scores, top_indices
def print_responses():
  """Print the top-k retrieved chunks (score, wrapped text, page number).

  Reads the notebook-level `query`, `k`, `embeddings` and `pages_and_chunks`.
  """
  scores, indices = get_dotproduct_results(query, k, embeddings)
  print("Results:")
  for score, idx in zip(scores, indices):
      print(f"Score: {score:.4f}")
      idx = idx.item()
      # pages_and_chunks is a list of record dicts: select the record first,
      # then read its fields (indexing the list by a string key raises TypeError).
      record = pages_and_chunks[idx]
      print("Text: ", textwrap.fill(record["sentence_chunk"], 80))
      print(f"Page number: {record['page_number']}")
      print("\n")

get_dotproduct_results(query,k,embeddings)

(tensor([0.6817, 0.6559, 0.6462, 0.6453, 0.6327], device='cuda:0'),
 tensor([43, 48, 42, 47, 52], device='cuda:0'))

# Loading the LLM locally

In [None]:
import torch
# Read the total memory of CUDA device 0 (in bytes) from its device properties.
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
# Convert bytes to GiB (2**30 bytes) and round to the nearest integer.
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

Available GPU memory: 15 GB


In [None]:
# Pick a Gemma variant and decide on quantization from the detected GPU memory (in GB).
# Every branch assigns both `use_quantization_config` and `model_id`, so the
# summary prints below and the later cells never hit an undefined name.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Best-effort fallback: smallest model, quantized.
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    # 19 GB and above. A plain `else` also covers exactly 19, which `> 19.0` did not.
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 15 | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma-2b-it


For 15 GB, we'll use the quantized (4-bit precision) version of the 'google/gemma-7b-it' model.

In [None]:
use_quantization_config = True
model_id = "google/gemma-7b-it"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

In [None]:
!pip install bitsandbytes accelerate



In [None]:
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_compute_dtype=torch.float16)

In [None]:
!pip install flash-attn

Collecting flash-attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/2.6 MB[0m [31m17.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.6/2.6 MB[0m [31m47.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9a

In [None]:
# Use FlashAttention 2 only when the package is usable AND the GPU's major
# compute capability is at least 8; otherwise use "sdpa".
# `and` short-circuits, so the capability is queried only when flash-attn is available.
attn_implementation = (
    "flash_attention_2"
    if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8
    else "sdpa"
)
print(f"[INFO] Using attention implementation: {attn_implementation}")

[INFO] Using attention implementation: sdpa


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in a notebook: read it from the HF_TOKEN
# environment variable, or prompt for it (the input is not echoed or saved).
# NOTE(review): the token that was previously committed in this cell is exposed
# and should be revoked in the Hugging Face account settings.
YOUR_ACCESS_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
# Login using your token
login(YOUR_ACCESS_TOKEN)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

In [None]:
# Load the causal LM weights for `model_id`. The quantization config is passed
# only when use_quantization_config is set; otherwise None is passed and the
# model is loaded with the float16 dtype requested below.
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # NOTE(review): presumably disables the reduced-CPU-memory loading path - confirm against the transformers docs
                                                 attn_implementation=attn_implementation) # attention backend name chosen earlier in the notebook

In [None]:
if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

In [None]:
llm_model

In [None]:
def get_model_mem_size(model: torch.nn.Module):
    """
    Report the memory footprint of a PyTorch model's parameters and buffers.

    Returns a dict with the total size in bytes ("model_mem_bytes") and the same
    figure in megabytes and gigabytes, each rounded to 2 decimals
    ("model_mem_mb", "model_mem_gb").

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Each tensor occupies (number of elements) x (bytes per element).
    total_bytes = 0
    for tensor in list(model.parameters()) + list(model.buffers()):
        total_bytes += tensor.nelement() * tensor.element_size()

    bytes_per_mb = 1024 ** 2
    bytes_per_gb = 1024 ** 3

    return {"model_mem_bytes": total_bytes,
            "model_mem_mb": round(total_bytes / bytes_per_mb, 2),
            "model_mem_gb": round(total_bytes / bytes_per_gb, 2)}

get_model_mem_size(llm_model)

# Prompt

In [None]:
def prompt_formatter(query, context_items):
    """Build the chat-formatted RAG prompt for `query` from the retrieved chunks.

    Args:
        query: the user question.
        context_items: iterable of chunk records; each must provide "sentence_chunk".

    Returns:
        The prompt as a string (not tokenized), produced by the notebook-level
        `tokenizer`'s chat template with the generation prompt appended.
    """
    # Render the retrieved chunks as a "- " bulleted list, one chunk per line.
    chunk_texts = [record["sentence_chunk"] for record in context_items]
    context = "- " + "\n- ".join(chunk_texts)

    # Few-shot instruction template; {context} and {query} are filled in below.
    template = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    \nExample 1:
    Query: What are the fat-soluble vitamins?
    Answer: The fat-soluble vitamins include Vitamin A, Vitamin D, Vitamin E, and Vitamin K. These vitamins are absorbed along with fats in the diet and can be stored in the body's fatty tissue and liver for later use. Vitamin A is important for vision, immune function, and skin health. Vitamin D plays a critical role in calcium absorption and bone health. Vitamin E acts as an antioxidant, protecting cells from damage. Vitamin K is essential for blood clotting and bone metabolism.
    \nExample 2:
    Query: What are the causes of type 2 diabetes?
    Answer: Type 2 diabetes is often associated with overnutrition, particularly the overconsumption of calories leading to obesity. Factors include a diet high in refined sugars and saturated fats, which can lead to insulin resistance, a condition where the body's cells do not respond effectively to insulin. Over time, the pancreas cannot produce enough insulin to manage blood sugar levels, resulting in type 2 diabetes. Additionally, excessive caloric intake without sufficient physical activity exacerbates the risk by promoting weight gain and fat accumulation, particularly around the abdomen, further contributing to insulin resistance.
    \nExample 3:
    Query: What is the importance of hydration for physical performance?
    Answer: Hydration is crucial for physical performance because water plays key roles in maintaining blood volume, regulating body temperature, and ensuring the transport of nutrients and oxygen to cells. Adequate hydration is essential for optimal muscle function, endurance, and recovery. Dehydration can lead to decreased performance, fatigue, and increased risk of heat-related illnesses, such as heat stroke. Drinking sufficient water before, during, and after exercise helps ensure peak physical performance and recovery.
    \nNow use the following context items to answer the user query:
    {context}
    \nRelevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""
    user_message = template.format(context=context, query=query)

    # Single-turn conversation: the whole prompt is sent as one user message.
    conversation = [{"role": "user", "content": user_message}]
    return tokenizer.apply_chat_template(conversation=conversation,
                                         tokenize=False,
                                         add_generation_prompt=True)

In [None]:
scores, indices =get_dotproduct_results(query=query,k=k,embeddings=embeddings)
context_items = [pages_and_chunks[i] for i in indices]
prompt = prompt_formatter(query=query,context_items=context_items)

In [None]:
# `prompt` was rendered by the chat template with tokenize=False. Per the
# Transformers chat-templating guide, such text already contains the special
# tokens, so the tokenizer must not add them a second time.
input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text (prompt + answer, special tokens included)
output_text = tokenizer.decode(outputs[0])

# Keep only the newly generated tokens. Slicing by prompt length does not depend
# on the decoded text reproducing `prompt` exactly, unlike str.replace(prompt, '').
prompt_token_count = input_ids["input_ids"].shape[1]
answer_text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)

print(f"Query: {query}")
print(f"RAG answer:\n{answer_text}")